1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45#include "stringlib/eq.h"
46
47#ifdef MS_WINDOWS
48#include <windows.h>
49#endif
50
51/*[clinic input]
52class str "PyUnicodeObject *" "&PyUnicode_Type"
53[clinic start generated code]*/
54/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
55
56/* --- Globals ------------------------------------------------------------
57
58NOTE: In the interpreter's initialization phase, some globals are currently
59      initialized dynamically as needed. In the process Unicode objects may
60      be created before the Unicode type is ready.
61
62*/
63
64
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros of this file: the full
   consistency check in debug builds, a plain type check otherwise. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
77
/* UTF-8 cache pointer stored in the compact header, without any checking. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 representation of a ready string.  Compact ASCII strings use the
   bytes located directly after the PyASCIIObject header; every other
   string uses the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Length field of the UTF-8 cache, without any checking. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the character count for compact ASCII
   strings, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked accessors for the wchar_t representation and its length. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked accessors for the length, state and hash header fields. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Kind and length accessors that assert _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Character buffer pointer stored in a PyUnicodeObject (data.any). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
112
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(op).
   The argument is checked with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
119
/* True if the UTF-8 pointer of op is the character buffer itself (i.e. no
   separate UTF-8 block exists).  Must not be used on compact ASCII
   strings. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the character buffer itself.
   The macro argument is used for both sides of the comparison, so the
   macro works regardless of how the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* Non-zero when the object has a UTF-8 block of its own: it is not a
   compact ASCII string, its UTF-8 pointer is set, and that pointer differs
   from the character buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) \
     && _PyUnicode_UTF8(op) != NULL \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object has a wstr block of its own: its wstr pointer
   is set and either the string is not ready yet or the pointer differs
   from the character buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) != NULL \
     && (!PyUnicode_IS_READY(op) \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
141
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters, which should be of type
   "from_type *" (or "const from_type *"); the range [begin, end) is read.
   to is a pointer of type "to_type *" and points to the buffer where the
   result characters are written; it must have room for (end - begin)
   items.  Each source character is converted with a plain cast.

   The source pointers are only ever read, so they are cast to
   "const from_type *" and constness of the caller's buffer is preserved.
   All temporaries carry a leading underscore to keep them out of the way
   of identifiers used in the macro arguments.  The main loop is unrolled
   four times; the trailing loop copies the remaining 0-3 items. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_to = (to_type *)(to);                         \
        const from_type *_iter = (const from_type *)(begin);    \
        const from_type *_end = (const from_type *)(end);       \
        Py_ssize_t _n = (_end) - (_iter);                       \
        const from_type *_unrolled_end =                        \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                 \
        while (_iter < (_unrolled_end)) {                       \
            _to[0] = (to_type) _iter[0];                        \
            _to[1] = (to_type) _iter[1];                        \
            _to[2] = (to_type) _iter[2];                        \
            _to[3] = (to_type) _iter[3];                        \
            _iter += 4; _to += 4;                               \
        }                                                       \
        while (_iter < (_end))                                  \
            *_to++ = (to_type) *_iter++;                        \
    } while (0)
165
/* Platform-specific overallocation factor: 2 on Windows (overallocating by
   50% is the best factor there), 4 elsewhere (on Linux, overallocating by
   25% is the best factor). */
#if defined(MS_WINDOWS)
#  define OVERALLOCATE_FACTOR 2
#else
#  define OVERALLOCATE_FACTOR 4
#endif
173
174/* This dictionary holds all interned unicode strings.  Note that references
175   to strings in this dictionary are *not* counted in the string's ob_refcnt.
176   When the interned string reaches a refcnt of 0 the string deallocation
177   function will delete the reference from this dictionary.
178
179   Another way to look at this is that to say that the actual reference
180   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
181*/
182static PyObject *interned = NULL;
183
184/* The empty Unicode object is shared to improve performance. */
185static PyObject *unicode_empty = NULL;
186
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned value is NULL if the singleton could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
205
206/* Forward declaration */
207static inline int
208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
210/* List of static strings. */
211static _Py_Identifier *static_strings = NULL;
212
213/* Single character Unicode strings in the Latin-1 range are being
214   shared as well. */
215static PyObject *unicode_latin1[256] = {NULL};
216
/* Fast detection of the most frequent whitespace characters: entry c of
   this 128-entry table is 1 when the 7-bit character c is whitespace and
   0 otherwise.  One row below covers 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
247
248/* forward */
249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
250static PyObject* get_latin1_char(unsigned char ch);
251static int unicode_modifiable(PyObject *unicode);
252
253
254static PyObject *
255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
262unicode_encode_call_errorhandler(const char *errors,
263       PyObject **errorHandler,const char *encoding, const char *reason,
264       PyObject *unicode, PyObject **exceptionObject,
265       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
267static void
268raise_encode_exception(PyObject **exceptionObject,
269                       const char *encoding,
270                       PyObject *unicode,
271                       Py_ssize_t startpos, Py_ssize_t endpos,
272                       const char *reason);
273
/* Fast detection of line break characters: entry c of this 128-entry
   table is 1 when the 7-bit character c is a line break and 0 otherwise.
   One row below covers 16 code points. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
301
302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers in decoded form, as produced by
   get_error_handler(). */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,          /* never returned by get_error_handler() */
    _Py_ERROR_STRICT,               /* "strict", or no handler name at all */
    _Py_ERROR_SURROGATEESCAPE,      /* "surrogateescape" */
    _Py_ERROR_REPLACE,              /* "replace" */
    _Py_ERROR_IGNORE,               /* "ignore" */
    _Py_ERROR_BACKSLASHREPLACE,     /* "backslashreplace" */
    _Py_ERROR_SURROGATEPASS,        /* "surrogatepass" */
    _Py_ERROR_XMLCHARREFREPLACE,    /* "xmlcharrefreplace" */
    _Py_ERROR_OTHER                 /* any other handler name */
} _Py_error_handler;

/* Translate an error handler name into its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL, the matching constant for the seven
   names listed in the table below (compared case-sensitively), and
   _Py_ERROR_OTHER for every other name. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict", _Py_ERROR_STRICT},
        {"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
        {"replace", _Py_ERROR_REPLACE},
        {"ignore", _Py_ERROR_IGNORE},
        {"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass", _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344   This function is kept for backward compatibility with the old API. */
345Py_UNICODE
346PyUnicode_GetMax(void)
347{
348#ifdef Py_UNICODE_WIDE
349    return 0x10FFFF;
350#else
351    /* This is actually an illegal character, so it should
352       not be passed to unichr. */
353    return 0xFFFF;
354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-build checker for the internal invariants of a str object.

   op:            the object to inspect; must pass PyUnicode_Check().
   check_content: when non-zero (and the string is not a wchar_t-only
                  string), the characters are scanned as well to verify
                  that the narrowest possible kind is used and that the
                  buffer is null-terminated.

   Every violated invariant trips an assert().  The function always
   returns 1 so that a call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the characters directly follow
               the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* non-compact string: the characters live in data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* not ready yet: only the wchar_t buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                /* wstr must alias the character buffer */
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch) {
                maxchar = ch;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the buffer must end with a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477    Py_ssize_t len;
478
479    len = _PyUnicode_WSTR_LENGTH(unicode);
480    if (len == 0) {
481        Py_DECREF(unicode);
482        _Py_RETURN_UNICODE_EMPTY();
483    }
484
485    if (len == 1) {
486        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
487        if ((Py_UCS4)ch < 256) {
488            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489            Py_DECREF(unicode);
490            return latin1_char;
491        }
492    }
493
494    if (_PyUnicode_Ready(unicode) < 0) {
495        Py_DECREF(unicode);
496        return NULL;
497    }
498#else
499    assert(Py_REFCNT(unicode) == 1);
500
501    /* don't make the result ready in debug mode to ensure that the caller
502       makes the string ready before using it */
503    assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505    return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511    Py_ssize_t length;
512
513    length = PyUnicode_GET_LENGTH(unicode);
514    if (length == 0) {
515        if (unicode != unicode_empty) {
516            Py_DECREF(unicode);
517            _Py_RETURN_UNICODE_EMPTY();
518        }
519        return unicode_empty;
520    }
521
522    if (length == 1) {
523        void *data = PyUnicode_DATA(unicode);
524        int kind = PyUnicode_KIND(unicode);
525        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
526        if (ch < 256) {
527            PyObject *latin1_char = unicode_latin1[ch];
528            if (latin1_char != NULL) {
529                if (unicode != latin1_char) {
530                    Py_INCREF(latin1_char);
531                    Py_DECREF(unicode);
532                }
533                return latin1_char;
534            }
535            else {
536                assert(_PyUnicode_CheckConsistency(unicode, 1));
537                Py_INCREF(unicode);
538                unicode_latin1[ch] = unicode;
539                return unicode;
540            }
541        }
542    }
543
544    assert(_PyUnicode_CheckConsistency(unicode, 1));
545    return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551    assert(_PyUnicode_CHECK(unicode));
552    if (PyUnicode_IS_READY(unicode))
553        return unicode_result_ready(unicode);
554    else
555        return unicode_result_wchar(unicode);
556}
557
558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561    if (PyUnicode_CheckExact(unicode)) {
562        if (PyUnicode_READY(unicode) == -1)
563            return NULL;
564        Py_INCREF(unicode);
565        return unicode;
566    }
567    else
568        /* Subtype -- return genuine unicode string with the same value. */
569        return _PyUnicode_Copy(unicode);
570}
571
572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573   ASCII, Latin1, UTF-8, etc. */
574static char*
575backslashreplace(_PyBytesWriter *writer, char *str,
576                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
578    Py_ssize_t size, i;
579    Py_UCS4 ch;
580    enum PyUnicode_Kind kind;
581    void *data;
582
583    assert(PyUnicode_IS_READY(unicode));
584    kind = PyUnicode_KIND(unicode);
585    data = PyUnicode_DATA(unicode);
586
587    size = 0;
588    /* determine replacement size */
589    for (i = collstart; i < collend; ++i) {
590        Py_ssize_t incr;
591
592        ch = PyUnicode_READ(kind, data, i);
593        if (ch < 0x100)
594            incr = 2+2;
595        else if (ch < 0x10000)
596            incr = 2+4;
597        else {
598            assert(ch <= MAX_UNICODE);
599            incr = 2+8;
600        }
601        if (size > PY_SSIZE_T_MAX - incr) {
602            PyErr_SetString(PyExc_OverflowError,
603                            "encoded result is too long for a Python string");
604            return NULL;
605        }
606        size += incr;
607    }
608
609    str = _PyBytesWriter_Prepare(writer, str, size);
610    if (str == NULL)
611        return NULL;
612
613    /* generate replacement */
614    for (i = collstart; i < collend; ++i) {
615        ch = PyUnicode_READ(kind, data, i);
616        *str++ = '\\';
617        if (ch >= 0x00010000) {
618            *str++ = 'U';
619            *str++ = Py_hexdigits[(ch>>28)&0xf];
620            *str++ = Py_hexdigits[(ch>>24)&0xf];
621            *str++ = Py_hexdigits[(ch>>20)&0xf];
622            *str++ = Py_hexdigits[(ch>>16)&0xf];
623            *str++ = Py_hexdigits[(ch>>12)&0xf];
624            *str++ = Py_hexdigits[(ch>>8)&0xf];
625        }
626        else if (ch >= 0x100) {
627            *str++ = 'u';
628            *str++ = Py_hexdigits[(ch>>12)&0xf];
629            *str++ = Py_hexdigits[(ch>>8)&0xf];
630        }
631        else
632            *str++ = 'x';
633        *str++ = Py_hexdigits[(ch>>4)&0xf];
634        *str++ = Py_hexdigits[ch&0xf];
635    }
636    return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640   ASCII, Latin1, UTF-8, etc. */
641static char*
642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
643                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
645    Py_ssize_t size, i;
646    Py_UCS4 ch;
647    enum PyUnicode_Kind kind;
648    void *data;
649
650    assert(PyUnicode_IS_READY(unicode));
651    kind = PyUnicode_KIND(unicode);
652    data = PyUnicode_DATA(unicode);
653
654    size = 0;
655    /* determine replacement size */
656    for (i = collstart; i < collend; ++i) {
657        Py_ssize_t incr;
658
659        ch = PyUnicode_READ(kind, data, i);
660        if (ch < 10)
661            incr = 2+1+1;
662        else if (ch < 100)
663            incr = 2+2+1;
664        else if (ch < 1000)
665            incr = 2+3+1;
666        else if (ch < 10000)
667            incr = 2+4+1;
668        else if (ch < 100000)
669            incr = 2+5+1;
670        else if (ch < 1000000)
671            incr = 2+6+1;
672        else {
673            assert(ch <= MAX_UNICODE);
674            incr = 2+7+1;
675        }
676        if (size > PY_SSIZE_T_MAX - incr) {
677            PyErr_SetString(PyExc_OverflowError,
678                            "encoded result is too long for a Python string");
679            return NULL;
680        }
681        size += incr;
682    }
683
684    str = _PyBytesWriter_Prepare(writer, str, size);
685    if (str == NULL)
686        return NULL;
687
688    /* generate replacement */
689    for (i = collstart; i < collend; ++i) {
690        str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691    }
692    return str;
693}
694
695/* --- Bloom Filters ----------------------------------------------------- */
696
697/* stuff to implement simple "bloom filters" for Unicode characters.
698   to keep things simple, we use a single bitmask, using the least 5
699   bits from each unicode characters as the bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
713#define BLOOM_MASK unsigned long
714
715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
716
717#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
718
719#define BLOOM_LINEBREAK(ch)                                             \
720    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
721     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
722
723static inline BLOOM_MASK
724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
725{
726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
727    do {                                               \
728        TYPE *data = (TYPE *)PTR;                      \
729        TYPE *end = data + LEN;                        \
730        Py_UCS4 ch;                                    \
731        for (; data != end; data++) {                  \
732            ch = *data;                                \
733            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734        }                                              \
735        break;                                         \
736    } while (0)
737
738    /* calculate simple bloom-style bitmask for a given unicode string */
739
740    BLOOM_MASK mask;
741
742    mask = 0;
743    switch (kind) {
744    case PyUnicode_1BYTE_KIND:
745        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746        break;
747    case PyUnicode_2BYTE_KIND:
748        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749        break;
750    case PyUnicode_4BYTE_KIND:
751        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752        break;
753    default:
754        assert(0);
755    }
756    return mask;
757
758#undef BLOOM_UPDATE
759}
760
761static int
762ensure_unicode(PyObject *obj)
763{
764    if (!PyUnicode_Check(obj)) {
765        PyErr_Format(PyExc_TypeError,
766                     "must be str, not %.100s",
767                     Py_TYPE(obj)->tp_name);
768        return -1;
769    }
770    return PyUnicode_READY(obj);
771}
772
773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
791#include "stringlib/replace.h"
792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
802#include "stringlib/replace.h"
803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
813#include "stringlib/replace.h"
814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
822#include "stringlib/undef.h"
823
824/* --- Unicode Object ----------------------------------------------------- */
825
826static PyObject *
827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
828
829static inline Py_ssize_t
830findchar(const void *s, int kind,
831         Py_ssize_t size, Py_UCS4 ch,
832         int direction)
833{
834    switch (kind) {
835    case PyUnicode_1BYTE_KIND:
836        if ((Py_UCS1) ch != ch)
837            return -1;
838        if (direction > 0)
839            return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840        else
841            return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
842    case PyUnicode_2BYTE_KIND:
843        if ((Py_UCS2) ch != ch)
844            return -1;
845        if (direction > 0)
846            return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847        else
848            return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
849    case PyUnicode_4BYTE_KIND:
850        if (direction > 0)
851            return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852        else
853            return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
854    default:
855        assert(0);
856        return -1;
857    }
858}
859
#ifdef Py_DEBUG
/* Fill the newly added part of a Unicode string with 0xff bytes, i.e.
   invalid characters, so that bugs are detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF
   is an invalid character in Unicode 6.0.

   unicode:    the string, whose length field already holds the new length.
   old_length: length of the string before it was resized; nothing is
               written unless the new length is larger. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
878
879static PyObject*
880resize_compact(PyObject *unicode, Py_ssize_t length)
881{
882    Py_ssize_t char_size;
883    Py_ssize_t struct_size;
884    Py_ssize_t new_size;
885    int share_wstr;
886    PyObject *new_unicode;
887#ifdef Py_DEBUG
888    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889#endif
890
891    assert(unicode_modifiable(unicode));
892    assert(PyUnicode_IS_READY(unicode));
893    assert(PyUnicode_IS_COMPACT(unicode));
894
895    char_size = PyUnicode_KIND(unicode);
896    if (PyUnicode_IS_ASCII(unicode))
897        struct_size = sizeof(PyASCIIObject);
898    else
899        struct_size = sizeof(PyCompactUnicodeObject);
900    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
901
902    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903        PyErr_NoMemory();
904        return NULL;
905    }
906    new_size = (struct_size + (length + 1) * char_size);
907
908    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909        PyObject_DEL(_PyUnicode_UTF8(unicode));
910        _PyUnicode_UTF8(unicode) = NULL;
911        _PyUnicode_UTF8_LENGTH(unicode) = 0;
912    }
913    _Py_DEC_REFTOTAL;
914    _Py_ForgetReference(unicode);
915
916    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
917    if (new_unicode == NULL) {
918        _Py_NewReference(unicode);
919        PyErr_NoMemory();
920        return NULL;
921    }
922    unicode = new_unicode;
923    _Py_NewReference(unicode);
924
925    _PyUnicode_LENGTH(unicode) = length;
926    if (share_wstr) {
927        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
928        if (!PyUnicode_IS_ASCII(unicode))
929            _PyUnicode_WSTR_LENGTH(unicode) = length;
930    }
931    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932        PyObject_DEL(_PyUnicode_WSTR(unicode));
933        _PyUnicode_WSTR(unicode) = NULL;
934        if (!PyUnicode_IS_ASCII(unicode))
935            _PyUnicode_WSTR_LENGTH(unicode) = 0;
936    }
937#ifdef Py_DEBUG
938    unicode_fill_invalid(unicode, old_length);
939#endif
940    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941                    length, 0);
942    assert(_PyUnicode_CheckConsistency(unicode, 0));
943    return unicode;
944}
945
946static int
947resize_inplace(PyObject *unicode, Py_ssize_t length)
948{
949    wchar_t *wstr;
950    Py_ssize_t new_size;
951    assert(!PyUnicode_IS_COMPACT(unicode));
952    assert(Py_REFCNT(unicode) == 1);
953
954    if (PyUnicode_IS_READY(unicode)) {
955        Py_ssize_t char_size;
956        int share_wstr, share_utf8;
957        void *data;
958#ifdef Py_DEBUG
959        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960#endif
961
962        data = _PyUnicode_DATA_ANY(unicode);
963        char_size = PyUnicode_KIND(unicode);
964        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
966
967        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968            PyErr_NoMemory();
969            return -1;
970        }
971        new_size = (length + 1) * char_size;
972
973        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974        {
975            PyObject_DEL(_PyUnicode_UTF8(unicode));
976            _PyUnicode_UTF8(unicode) = NULL;
977            _PyUnicode_UTF8_LENGTH(unicode) = 0;
978        }
979
980        data = (PyObject *)PyObject_REALLOC(data, new_size);
981        if (data == NULL) {
982            PyErr_NoMemory();
983            return -1;
984        }
985        _PyUnicode_DATA_ANY(unicode) = data;
986        if (share_wstr) {
987            _PyUnicode_WSTR(unicode) = data;
988            _PyUnicode_WSTR_LENGTH(unicode) = length;
989        }
990        if (share_utf8) {
991            _PyUnicode_UTF8(unicode) = data;
992            _PyUnicode_UTF8_LENGTH(unicode) = length;
993        }
994        _PyUnicode_LENGTH(unicode) = length;
995        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
996#ifdef Py_DEBUG
997        unicode_fill_invalid(unicode, old_length);
998#endif
999        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1000            assert(_PyUnicode_CheckConsistency(unicode, 0));
1001            return 0;
1002        }
1003    }
1004    assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006    /* check for integer overflow */
1007    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1008        PyErr_NoMemory();
1009        return -1;
1010    }
1011    new_size = sizeof(wchar_t) * (length + 1);
1012    wstr =  _PyUnicode_WSTR(unicode);
1013    wstr = PyObject_REALLOC(wstr, new_size);
1014    if (!wstr) {
1015        PyErr_NoMemory();
1016        return -1;
1017    }
1018    _PyUnicode_WSTR(unicode) = wstr;
1019    _PyUnicode_WSTR(unicode)[length] = 0;
1020    _PyUnicode_WSTR_LENGTH(unicode) = length;
1021    assert(_PyUnicode_CheckConsistency(unicode, 0));
1022    return 0;
1023}
1024
1025static PyObject*
1026resize_copy(PyObject *unicode, Py_ssize_t length)
1027{
1028    Py_ssize_t copy_length;
1029    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1030        PyObject *copy;
1031
1032        if (PyUnicode_READY(unicode) == -1)
1033            return NULL;
1034
1035        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036        if (copy == NULL)
1037            return NULL;
1038
1039        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1040        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1041        return copy;
1042    }
1043    else {
1044        PyObject *w;
1045
1046        w = (PyObject*)_PyUnicode_New(length);
1047        if (w == NULL)
1048            return NULL;
1049        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050        copy_length = Py_MIN(copy_length, length);
1051        memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1052                  copy_length * sizeof(wchar_t));
1053        return w;
1054    }
1055}
1056
1057/* We allocate one more byte to make sure the string is
1058   Ux0000 terminated; some code (e.g. new_identifier)
1059   relies on that.
1060
1061   XXX This allocator could further be enhanced by assuring that the
1062   free list never reduces its size below 1.
1063
1064*/
1065
1066static PyUnicodeObject *
1067_PyUnicode_New(Py_ssize_t length)
1068{
1069    PyUnicodeObject *unicode;
1070    size_t new_size;
1071
1072    /* Optimization for empty strings */
1073    if (length == 0 && unicode_empty != NULL) {
1074        Py_INCREF(unicode_empty);
1075        return (PyUnicodeObject*)unicode_empty;
1076    }
1077
1078    /* Ensure we won't overflow the size. */
1079    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1080        return (PyUnicodeObject *)PyErr_NoMemory();
1081    }
1082    if (length < 0) {
1083        PyErr_SetString(PyExc_SystemError,
1084                        "Negative size passed to _PyUnicode_New");
1085        return NULL;
1086    }
1087
1088    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089    if (unicode == NULL)
1090        return NULL;
1091    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1092
1093    _PyUnicode_WSTR_LENGTH(unicode) = length;
1094    _PyUnicode_HASH(unicode) = -1;
1095    _PyUnicode_STATE(unicode).interned = 0;
1096    _PyUnicode_STATE(unicode).kind = 0;
1097    _PyUnicode_STATE(unicode).compact = 0;
1098    _PyUnicode_STATE(unicode).ready = 0;
1099    _PyUnicode_STATE(unicode).ascii = 0;
1100    _PyUnicode_DATA_ANY(unicode) = NULL;
1101    _PyUnicode_LENGTH(unicode) = 0;
1102    _PyUnicode_UTF8(unicode) = NULL;
1103    _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
1105    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106    if (!_PyUnicode_WSTR(unicode)) {
1107        Py_DECREF(unicode);
1108        PyErr_NoMemory();
1109        return NULL;
1110    }
1111
1112    /* Initialize the first element to guard against cases where
1113     * the caller fails before initializing str -- unicode_resize()
1114     * reads str[0], and the Keep-Alive optimization can keep memory
1115     * allocated for str alive across a call to unicode_dealloc(unicode).
1116     * We don't want unicode_resize to read uninitialized memory in
1117     * that case.
1118     */
1119    _PyUnicode_WSTR(unicode)[0] = 0;
1120    _PyUnicode_WSTR(unicode)[length] = 0;
1121
1122    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1123    return unicode;
1124}
1125
1126static const char*
1127unicode_kind_name(PyObject *unicode)
1128{
1129    /* don't check consistency: unicode_kind_name() is called from
1130       _PyUnicode_Dump() */
1131    if (!PyUnicode_IS_COMPACT(unicode))
1132    {
1133        if (!PyUnicode_IS_READY(unicode))
1134            return "wstr";
1135        switch (PyUnicode_KIND(unicode))
1136        {
1137        case PyUnicode_1BYTE_KIND:
1138            if (PyUnicode_IS_ASCII(unicode))
1139                return "legacy ascii";
1140            else
1141                return "legacy latin1";
1142        case PyUnicode_2BYTE_KIND:
1143            return "legacy UCS2";
1144        case PyUnicode_4BYTE_KIND:
1145            return "legacy UCS4";
1146        default:
1147            return "<legacy invalid kind>";
1148        }
1149    }
1150    assert(PyUnicode_IS_READY(unicode));
1151    switch (PyUnicode_KIND(unicode)) {
1152    case PyUnicode_1BYTE_KIND:
1153        if (PyUnicode_IS_ASCII(unicode))
1154            return "ascii";
1155        else
1156            return "latin1";
1157    case PyUnicode_2BYTE_KIND:
1158        return "UCS2";
1159    case PyUnicode_4BYTE_KIND:
1160        return "UCS4";
1161    default:
1162        return "<invalid compact kind>";
1163    }
1164}
1165
1166#ifdef Py_DEBUG
1167/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);

    return utf8;
}
1171
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);

    return data;
}
1175void *_PyUnicode_data(void *unicode){
1176    printf("obj %p\n", unicode);
1177    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182    return PyUnicode_DATA(unicode);
1183}
1184
1185void
1186_PyUnicode_Dump(PyObject *op)
1187{
1188    PyASCIIObject *ascii = (PyASCIIObject *)op;
1189    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191    void *data;
1192
1193    if (ascii->state.compact)
1194    {
1195        if (ascii->state.ascii)
1196            data = (ascii + 1);
1197        else
1198            data = (compact + 1);
1199    }
1200    else
1201        data = unicode->data.any;
1202    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203           unicode_kind_name(op), ascii->length);
1204
1205    if (ascii->wstr == data)
1206        printf("shared ");
1207    printf("wstr=%p", ascii->wstr);
1208
1209    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1210        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1211        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212            printf("shared ");
1213        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214               compact->utf8, compact->utf8_length);
1215    }
1216    printf(", data=%p\n", data);
1217}
1218#endif
1219
1220PyObject *
1221PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222{
1223    PyObject *obj;
1224    PyCompactUnicodeObject *unicode;
1225    void *data;
1226    enum PyUnicode_Kind kind;
1227    int is_sharing, is_ascii;
1228    Py_ssize_t char_size;
1229    Py_ssize_t struct_size;
1230
1231    /* Optimization for empty strings */
1232    if (size == 0 && unicode_empty != NULL) {
1233        Py_INCREF(unicode_empty);
1234        return unicode_empty;
1235    }
1236
1237    is_ascii = 0;
1238    is_sharing = 0;
1239    struct_size = sizeof(PyCompactUnicodeObject);
1240    if (maxchar < 128) {
1241        kind = PyUnicode_1BYTE_KIND;
1242        char_size = 1;
1243        is_ascii = 1;
1244        struct_size = sizeof(PyASCIIObject);
1245    }
1246    else if (maxchar < 256) {
1247        kind = PyUnicode_1BYTE_KIND;
1248        char_size = 1;
1249    }
1250    else if (maxchar < 65536) {
1251        kind = PyUnicode_2BYTE_KIND;
1252        char_size = 2;
1253        if (sizeof(wchar_t) == 2)
1254            is_sharing = 1;
1255    }
1256    else {
1257        if (maxchar > MAX_UNICODE) {
1258            PyErr_SetString(PyExc_SystemError,
1259                            "invalid maximum character passed to PyUnicode_New");
1260            return NULL;
1261        }
1262        kind = PyUnicode_4BYTE_KIND;
1263        char_size = 4;
1264        if (sizeof(wchar_t) == 4)
1265            is_sharing = 1;
1266    }
1267
1268    /* Ensure we won't overflow the size. */
1269    if (size < 0) {
1270        PyErr_SetString(PyExc_SystemError,
1271                        "Negative size passed to PyUnicode_New");
1272        return NULL;
1273    }
1274    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275        return PyErr_NoMemory();
1276
1277    /* Duplicated allocation code from _PyObject_New() instead of a call to
1278     * PyObject_New() so we are able to allocate space for the object and
1279     * it's data buffer.
1280     */
1281    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282    if (obj == NULL)
1283        return PyErr_NoMemory();
1284    obj = PyObject_INIT(obj, &PyUnicode_Type);
1285    if (obj == NULL)
1286        return NULL;
1287
1288    unicode = (PyCompactUnicodeObject *)obj;
1289    if (is_ascii)
1290        data = ((PyASCIIObject*)obj) + 1;
1291    else
1292        data = unicode + 1;
1293    _PyUnicode_LENGTH(unicode) = size;
1294    _PyUnicode_HASH(unicode) = -1;
1295    _PyUnicode_STATE(unicode).interned = 0;
1296    _PyUnicode_STATE(unicode).kind = kind;
1297    _PyUnicode_STATE(unicode).compact = 1;
1298    _PyUnicode_STATE(unicode).ready = 1;
1299    _PyUnicode_STATE(unicode).ascii = is_ascii;
1300    if (is_ascii) {
1301        ((char*)data)[size] = 0;
1302        _PyUnicode_WSTR(unicode) = NULL;
1303    }
1304    else if (kind == PyUnicode_1BYTE_KIND) {
1305        ((char*)data)[size] = 0;
1306        _PyUnicode_WSTR(unicode) = NULL;
1307        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1308        unicode->utf8 = NULL;
1309        unicode->utf8_length = 0;
1310    }
1311    else {
1312        unicode->utf8 = NULL;
1313        unicode->utf8_length = 0;
1314        if (kind == PyUnicode_2BYTE_KIND)
1315            ((Py_UCS2*)data)[size] = 0;
1316        else /* kind == PyUnicode_4BYTE_KIND */
1317            ((Py_UCS4*)data)[size] = 0;
1318        if (is_sharing) {
1319            _PyUnicode_WSTR_LENGTH(unicode) = size;
1320            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321        }
1322        else {
1323            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324            _PyUnicode_WSTR(unicode) = NULL;
1325        }
1326    }
1327#ifdef Py_DEBUG
1328    unicode_fill_invalid((PyObject*)unicode, 0);
1329#endif
1330    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1331    return obj;
1332}
1333
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data of `unicode`.  A high surrogate immediately followed
   by a low surrogate is joined into a single code point; every other unit
   is copied unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* surrogate pair: two input units, one output code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1373
1374static int
1375unicode_check_modifiable(PyObject *unicode)
1376{
1377    if (!unicode_modifiable(unicode)) {
1378        PyErr_SetString(PyExc_SystemError,
1379                        "Cannot modify a string currently used");
1380        return -1;
1381    }
1382    return 0;
1383}
1384
1385static int
1386_copy_characters(PyObject *to, Py_ssize_t to_start,
1387                 PyObject *from, Py_ssize_t from_start,
1388                 Py_ssize_t how_many, int check_maxchar)
1389{
1390    unsigned int from_kind, to_kind;
1391    void *from_data, *to_data;
1392
1393    assert(0 <= how_many);
1394    assert(0 <= from_start);
1395    assert(0 <= to_start);
1396    assert(PyUnicode_Check(from));
1397    assert(PyUnicode_IS_READY(from));
1398    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1399
1400    assert(PyUnicode_Check(to));
1401    assert(PyUnicode_IS_READY(to));
1402    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
1404    if (how_many == 0)
1405        return 0;
1406
1407    from_kind = PyUnicode_KIND(from);
1408    from_data = PyUnicode_DATA(from);
1409    to_kind = PyUnicode_KIND(to);
1410    to_data = PyUnicode_DATA(to);
1411
1412#ifdef Py_DEBUG
1413    if (!check_maxchar
1414        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415    {
1416        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417        Py_UCS4 ch;
1418        Py_ssize_t i;
1419        for (i=0; i < how_many; i++) {
1420            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421            assert(ch <= to_maxchar);
1422        }
1423    }
1424#endif
1425
1426    if (from_kind == to_kind) {
1427        if (check_maxchar
1428            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429        {
1430            /* Writing Latin-1 characters into an ASCII string requires to
1431               check that all written characters are pure ASCII */
1432            Py_UCS4 max_char;
1433            max_char = ucs1lib_find_max_char(from_data,
1434                                             (Py_UCS1*)from_data + how_many);
1435            if (max_char >= 128)
1436                return -1;
1437        }
1438        memcpy((char*)to_data + to_kind * to_start,
1439                  (char*)from_data + from_kind * from_start,
1440                  to_kind * how_many);
1441    }
1442    else if (from_kind == PyUnicode_1BYTE_KIND
1443             && to_kind == PyUnicode_2BYTE_KIND)
1444    {
1445        _PyUnicode_CONVERT_BYTES(
1446            Py_UCS1, Py_UCS2,
1447            PyUnicode_1BYTE_DATA(from) + from_start,
1448            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449            PyUnicode_2BYTE_DATA(to) + to_start
1450            );
1451    }
1452    else if (from_kind == PyUnicode_1BYTE_KIND
1453             && to_kind == PyUnicode_4BYTE_KIND)
1454    {
1455        _PyUnicode_CONVERT_BYTES(
1456            Py_UCS1, Py_UCS4,
1457            PyUnicode_1BYTE_DATA(from) + from_start,
1458            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459            PyUnicode_4BYTE_DATA(to) + to_start
1460            );
1461    }
1462    else if (from_kind == PyUnicode_2BYTE_KIND
1463             && to_kind == PyUnicode_4BYTE_KIND)
1464    {
1465        _PyUnicode_CONVERT_BYTES(
1466            Py_UCS2, Py_UCS4,
1467            PyUnicode_2BYTE_DATA(from) + from_start,
1468            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469            PyUnicode_4BYTE_DATA(to) + to_start
1470            );
1471    }
1472    else {
1473        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
1475        if (!check_maxchar) {
1476            if (from_kind == PyUnicode_2BYTE_KIND
1477                && to_kind == PyUnicode_1BYTE_KIND)
1478            {
1479                _PyUnicode_CONVERT_BYTES(
1480                    Py_UCS2, Py_UCS1,
1481                    PyUnicode_2BYTE_DATA(from) + from_start,
1482                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483                    PyUnicode_1BYTE_DATA(to) + to_start
1484                    );
1485            }
1486            else if (from_kind == PyUnicode_4BYTE_KIND
1487                     && to_kind == PyUnicode_1BYTE_KIND)
1488            {
1489                _PyUnicode_CONVERT_BYTES(
1490                    Py_UCS4, Py_UCS1,
1491                    PyUnicode_4BYTE_DATA(from) + from_start,
1492                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493                    PyUnicode_1BYTE_DATA(to) + to_start
1494                    );
1495            }
1496            else if (from_kind == PyUnicode_4BYTE_KIND
1497                     && to_kind == PyUnicode_2BYTE_KIND)
1498            {
1499                _PyUnicode_CONVERT_BYTES(
1500                    Py_UCS4, Py_UCS2,
1501                    PyUnicode_4BYTE_DATA(from) + from_start,
1502                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503                    PyUnicode_2BYTE_DATA(to) + to_start
1504                    );
1505            }
1506            else {
1507                assert(0);
1508                return -1;
1509            }
1510        }
1511        else {
1512            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1513            Py_UCS4 ch;
1514            Py_ssize_t i;
1515
1516            for (i=0; i < how_many; i++) {
1517                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1518                if (ch > to_maxchar)
1519                    return -1;
1520                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521            }
1522        }
1523    }
1524    return 0;
1525}
1526
1527void
1528_PyUnicode_FastCopyCharacters(
1529    PyObject *to, Py_ssize_t to_start,
1530    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1531{
1532    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533}
1534
1535Py_ssize_t
1536PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537                         PyObject *from, Py_ssize_t from_start,
1538                         Py_ssize_t how_many)
1539{
1540    int err;
1541
1542    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543        PyErr_BadInternalCall();
1544        return -1;
1545    }
1546
1547    if (PyUnicode_READY(from) == -1)
1548        return -1;
1549    if (PyUnicode_READY(to) == -1)
1550        return -1;
1551
1552    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1553        PyErr_SetString(PyExc_IndexError, "string index out of range");
1554        return -1;
1555    }
1556    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1557        PyErr_SetString(PyExc_IndexError, "string index out of range");
1558        return -1;
1559    }
1560    if (how_many < 0) {
1561        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1562        return -1;
1563    }
1564    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1565    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1566        PyErr_Format(PyExc_SystemError,
1567                     "Cannot write %zi characters at %zi "
1568                     "in a string of %zi characters",
1569                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1570        return -1;
1571    }
1572
1573    if (how_many == 0)
1574        return 0;
1575
1576    if (unicode_check_modifiable(to))
1577        return -1;
1578
1579    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1580    if (err) {
1581        PyErr_Format(PyExc_SystemError,
1582                     "Cannot copy %s characters "
1583                     "into a string of %s characters",
1584                     unicode_kind_name(from),
1585                     unicode_kind_name(to));
1586        return -1;
1587    }
1588    return how_many;
1589}
1590
1591/* Find the maximum code point and count the number of surrogate pairs so a
1592   correct string length can be computed before converting a string to UCS4.
1593   This function counts single surrogates as a character and not as a pair.
1594
1595   Return 0 on success, or -1 on error. */
1596static int
1597find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1598                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1599{
1600    const wchar_t *iter;
1601    Py_UCS4 ch;
1602
1603    assert(num_surrogates != NULL && maxchar != NULL);
1604    *num_surrogates = 0;
1605    *maxchar = 0;
1606
1607    for (iter = begin; iter < end; ) {
1608#if SIZEOF_WCHAR_T == 2
1609        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1610            && (iter+1) < end
1611            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1612        {
1613            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1614            ++(*num_surrogates);
1615            iter += 2;
1616        }
1617        else
1618#endif
1619        {
1620            ch = *iter;
1621            iter++;
1622        }
1623        if (ch > *maxchar) {
1624            *maxchar = ch;
1625            if (*maxchar > MAX_UNICODE) {
1626                PyErr_Format(PyExc_ValueError,
1627                             "character U+%x is not in range [U+0000; U+10ffff]",
1628                             ch);
1629                return -1;
1630            }
1631        }
1632    }
1633    return 0;
1634}
1635
1636int
1637_PyUnicode_Ready(PyObject *unicode)
1638{
1639    wchar_t *end;
1640    Py_UCS4 maxchar = 0;
1641    Py_ssize_t num_surrogates;
1642#if SIZEOF_WCHAR_T == 2
1643    Py_ssize_t length_wo_surrogates;
1644#endif
1645
1646    /* _PyUnicode_Ready() is only intended for old-style API usage where
1647       strings were created using _PyObject_New() and where no canonical
1648       representation (the str field) has been set yet aka strings
1649       which are not yet ready. */
1650    assert(_PyUnicode_CHECK(unicode));
1651    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1652    assert(_PyUnicode_WSTR(unicode) != NULL);
1653    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1654    assert(_PyUnicode_UTF8(unicode) == NULL);
1655    /* Actually, it should neither be interned nor be anything else: */
1656    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1657
1658    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1659    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1660                                &maxchar, &num_surrogates) == -1)
1661        return -1;
1662
1663    if (maxchar < 256) {
1664        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1665        if (!_PyUnicode_DATA_ANY(unicode)) {
1666            PyErr_NoMemory();
1667            return -1;
1668        }
1669        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1670                                _PyUnicode_WSTR(unicode), end,
1671                                PyUnicode_1BYTE_DATA(unicode));
1672        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1673        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1674        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1675        if (maxchar < 128) {
1676            _PyUnicode_STATE(unicode).ascii = 1;
1677            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1678            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1679        }
1680        else {
1681            _PyUnicode_STATE(unicode).ascii = 0;
1682            _PyUnicode_UTF8(unicode) = NULL;
1683            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1684        }
1685        PyObject_FREE(_PyUnicode_WSTR(unicode));
1686        _PyUnicode_WSTR(unicode) = NULL;
1687        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1688    }
1689    /* In this case we might have to convert down from 4-byte native
1690       wchar_t to 2-byte unicode. */
1691    else if (maxchar < 65536) {
1692        assert(num_surrogates == 0 &&
1693               "FindMaxCharAndNumSurrogatePairs() messed up");
1694
1695#if SIZEOF_WCHAR_T == 2
1696        /* We can share representations and are done. */
1697        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1698        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1699        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1700        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1701        _PyUnicode_UTF8(unicode) = NULL;
1702        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1703#else
1704        /* sizeof(wchar_t) == 4 */
1705        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1706            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1707        if (!_PyUnicode_DATA_ANY(unicode)) {
1708            PyErr_NoMemory();
1709            return -1;
1710        }
1711        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1712                                _PyUnicode_WSTR(unicode), end,
1713                                PyUnicode_2BYTE_DATA(unicode));
1714        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1717        _PyUnicode_UTF8(unicode) = NULL;
1718        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1719        PyObject_FREE(_PyUnicode_WSTR(unicode));
1720        _PyUnicode_WSTR(unicode) = NULL;
1721        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1722#endif
1723    }
1724    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1725    else {
1726#if SIZEOF_WCHAR_T == 2
1727        /* in case the native representation is 2-bytes, we need to allocate a
1728           new normalized 4-byte version. */
1729        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1730        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1731            PyErr_NoMemory();
1732            return -1;
1733        }
1734        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1735        if (!_PyUnicode_DATA_ANY(unicode)) {
1736            PyErr_NoMemory();
1737            return -1;
1738        }
1739        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1740        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1741        _PyUnicode_UTF8(unicode) = NULL;
1742        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1743        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1744        _PyUnicode_STATE(unicode).ready = 1;
1745        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1746        PyObject_FREE(_PyUnicode_WSTR(unicode));
1747        _PyUnicode_WSTR(unicode) = NULL;
1748        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1749#else
1750        assert(num_surrogates == 0);
1751
1752        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1753        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1754        _PyUnicode_UTF8(unicode) = NULL;
1755        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1756        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1757#endif
1758        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1759    }
1760    _PyUnicode_STATE(unicode).ready = 1;
1761    assert(_PyUnicode_CheckConsistency(unicode, 1));
1762    return 0;
1763}
1764
1765static void
1766unicode_dealloc(PyObject *unicode)
1767{
1768    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1769    case SSTATE_NOT_INTERNED:
1770        break;
1771
1772    case SSTATE_INTERNED_MORTAL:
1773        /* revive dead object temporarily for DelItem */
1774        Py_REFCNT(unicode) = 3;
1775        if (PyDict_DelItem(interned, unicode) != 0)
1776            Py_FatalError(
1777                "deletion of interned string failed");
1778        break;
1779
1780    case SSTATE_INTERNED_IMMORTAL:
1781        Py_FatalError("Immortal interned string died.");
1782
1783    default:
1784        Py_FatalError("Inconsistent interned string state.");
1785    }
1786
1787    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1788        PyObject_DEL(_PyUnicode_WSTR(unicode));
1789    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1790        PyObject_DEL(_PyUnicode_UTF8(unicode));
1791    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1792        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1793
1794    Py_TYPE(unicode)->tp_free(unicode);
1795}
1796
1797#ifdef Py_DEBUG
1798static int
1799unicode_is_singleton(PyObject *unicode)
1800{
1801    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1802    if (unicode == unicode_empty)
1803        return 1;
1804    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1805    {
1806        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1807        if (ch < 256 && unicode_latin1[ch] == unicode)
1808            return 1;
1809    }
1810    return 0;
1811}
1812#endif
1813
1814static int
1815unicode_modifiable(PyObject *unicode)
1816{
1817    assert(_PyUnicode_CHECK(unicode));
1818    if (Py_REFCNT(unicode) != 1)
1819        return 0;
1820    if (_PyUnicode_HASH(unicode) != -1)
1821        return 0;
1822    if (PyUnicode_CHECK_INTERNED(unicode))
1823        return 0;
1824    if (!PyUnicode_CheckExact(unicode))
1825        return 0;
1826#ifdef Py_DEBUG
1827    /* singleton refcount is greater than 1 */
1828    assert(!unicode_is_singleton(unicode));
1829#endif
1830    return 1;
1831}
1832
1833static int
1834unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1835{
1836    PyObject *unicode;
1837    Py_ssize_t old_length;
1838
1839    assert(p_unicode != NULL);
1840    unicode = *p_unicode;
1841
1842    assert(unicode != NULL);
1843    assert(PyUnicode_Check(unicode));
1844    assert(0 <= length);
1845
1846    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1847        old_length = PyUnicode_WSTR_LENGTH(unicode);
1848    else
1849        old_length = PyUnicode_GET_LENGTH(unicode);
1850    if (old_length == length)
1851        return 0;
1852
1853    if (length == 0) {
1854        _Py_INCREF_UNICODE_EMPTY();
1855        if (!unicode_empty)
1856            return -1;
1857        Py_SETREF(*p_unicode, unicode_empty);
1858        return 0;
1859    }
1860
1861    if (!unicode_modifiable(unicode)) {
1862        PyObject *copy = resize_copy(unicode, length);
1863        if (copy == NULL)
1864            return -1;
1865        Py_SETREF(*p_unicode, copy);
1866        return 0;
1867    }
1868
1869    if (PyUnicode_IS_COMPACT(unicode)) {
1870        PyObject *new_unicode = resize_compact(unicode, length);
1871        if (new_unicode == NULL)
1872            return -1;
1873        *p_unicode = new_unicode;
1874        return 0;
1875    }
1876    return resize_inplace(unicode, length);
1877}
1878
1879int
1880PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1881{
1882    PyObject *unicode;
1883    if (p_unicode == NULL) {
1884        PyErr_BadInternalCall();
1885        return -1;
1886    }
1887    unicode = *p_unicode;
1888    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1889    {
1890        PyErr_BadInternalCall();
1891        return -1;
1892    }
1893    return unicode_resize(p_unicode, length);
1894}
1895
1896/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1897
1898   WARNING: The function doesn't copy the terminating null character and
1899   doesn't check the maximum character (may write a latin1 character in an
1900   ASCII string). */
1901static void
1902unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903                   const char *str, Py_ssize_t len)
1904{
1905    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1906    void *data = PyUnicode_DATA(unicode);
1907    const char *end = str + len;
1908
1909    switch (kind) {
1910    case PyUnicode_1BYTE_KIND: {
1911        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1912#ifdef Py_DEBUG
1913        if (PyUnicode_IS_ASCII(unicode)) {
1914            Py_UCS4 maxchar = ucs1lib_find_max_char(
1915                (const Py_UCS1*)str,
1916                (const Py_UCS1*)str + len);
1917            assert(maxchar < 128);
1918        }
1919#endif
1920        memcpy((char *) data + index, str, len);
1921        break;
1922    }
1923    case PyUnicode_2BYTE_KIND: {
1924        Py_UCS2 *start = (Py_UCS2 *)data + index;
1925        Py_UCS2 *ucs2 = start;
1926        assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
1928        for (; str < end; ++ucs2, ++str)
1929            *ucs2 = (Py_UCS2)*str;
1930
1931        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1932        break;
1933    }
1934    default: {
1935        Py_UCS4 *start = (Py_UCS4 *)data + index;
1936        Py_UCS4 *ucs4 = start;
1937        assert(kind == PyUnicode_4BYTE_KIND);
1938        assert(index <= PyUnicode_GET_LENGTH(unicode));
1939
1940        for (; str < end; ++ucs4, ++str)
1941            *ucs4 = (Py_UCS4)*str;
1942
1943        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1944    }
1945    }
1946}
1947
1948static PyObject*
1949get_latin1_char(unsigned char ch)
1950{
1951    PyObject *unicode = unicode_latin1[ch];
1952    if (!unicode) {
1953        unicode = PyUnicode_New(1, ch);
1954        if (!unicode)
1955            return NULL;
1956        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1957        assert(_PyUnicode_CheckConsistency(unicode, 1));
1958        unicode_latin1[ch] = unicode;
1959    }
1960    Py_INCREF(unicode);
1961    return unicode;
1962}
1963
1964static PyObject*
1965unicode_char(Py_UCS4 ch)
1966{
1967    PyObject *unicode;
1968
1969    assert(ch <= MAX_UNICODE);
1970
1971    if (ch < 256)
1972        return get_latin1_char(ch);
1973
1974    unicode = PyUnicode_New(1, ch);
1975    if (unicode == NULL)
1976        return NULL;
1977    switch (PyUnicode_KIND(unicode)) {
1978    case PyUnicode_1BYTE_KIND:
1979        PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1980        break;
1981    case PyUnicode_2BYTE_KIND:
1982        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1983        break;
1984    default:
1985        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1986        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1987    }
1988    assert(_PyUnicode_CheckConsistency(unicode, 1));
1989    return unicode;
1990}
1991
1992PyObject *
1993PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1994{
1995    PyObject *unicode;
1996    Py_UCS4 maxchar = 0;
1997    Py_ssize_t num_surrogates;
1998
1999    if (u == NULL)
2000        return (PyObject*)_PyUnicode_New(size);
2001
2002    /* If the Unicode data is known at construction time, we can apply
2003       some optimizations which share commonly used objects. */
2004
2005    /* Optimization for empty strings */
2006    if (size == 0)
2007        _Py_RETURN_UNICODE_EMPTY();
2008
2009    /* Single character Unicode objects in the Latin-1 range are
2010       shared when using this constructor */
2011    if (size == 1 && (Py_UCS4)*u < 256)
2012        return get_latin1_char((unsigned char)*u);
2013
2014    /* If not empty and not single character, copy the Unicode data
2015       into the new object */
2016    if (find_maxchar_surrogates(u, u + size,
2017                                &maxchar, &num_surrogates) == -1)
2018        return NULL;
2019
2020    unicode = PyUnicode_New(size - num_surrogates, maxchar);
2021    if (!unicode)
2022        return NULL;
2023
2024    switch (PyUnicode_KIND(unicode)) {
2025    case PyUnicode_1BYTE_KIND:
2026        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2027                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
2028        break;
2029    case PyUnicode_2BYTE_KIND:
2030#if Py_UNICODE_SIZE == 2
2031        memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2032#else
2033        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2034                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
2035#endif
2036        break;
2037    case PyUnicode_4BYTE_KIND:
2038#if SIZEOF_WCHAR_T == 2
2039        /* This is the only case which has to process surrogates, thus
2040           a simple copy loop is not enough and we need a function. */
2041        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2042#else
2043        assert(num_surrogates == 0);
2044        memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2045#endif
2046        break;
2047    default:
2048        assert(0 && "Impossible state");
2049    }
2050
2051    return unicode_result(unicode);
2052}
2053
2054PyObject *
2055PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2056{
2057    if (size < 0) {
2058        PyErr_SetString(PyExc_SystemError,
2059                        "Negative size passed to PyUnicode_FromStringAndSize");
2060        return NULL;
2061    }
2062    if (u != NULL)
2063        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2064    else
2065        return (PyObject *)_PyUnicode_New(size);
2066}
2067
2068PyObject *
2069PyUnicode_FromString(const char *u)
2070{
2071    size_t size = strlen(u);
2072    if (size > PY_SSIZE_T_MAX) {
2073        PyErr_SetString(PyExc_OverflowError, "input too long");
2074        return NULL;
2075    }
2076    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2077}
2078
2079PyObject *
2080_PyUnicode_FromId(_Py_Identifier *id)
2081{
2082    if (!id->object) {
2083        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2084                                                  strlen(id->string),
2085                                                  NULL, NULL);
2086        if (!id->object)
2087            return NULL;
2088        PyUnicode_InternInPlace(&id->object);
2089        assert(!id->next);
2090        id->next = static_strings;
2091        static_strings = id;
2092    }
2093    return id->object;
2094}
2095
2096void
2097_PyUnicode_ClearStaticStrings()
2098{
2099    _Py_Identifier *tmp, *s = static_strings;
2100    while (s) {
2101        Py_CLEAR(s->object);
2102        tmp = s->next;
2103        s->next = NULL;
2104        s = tmp;
2105    }
2106    static_strings = NULL;
2107}
2108
2109/* Internal function, doesn't check maximum character */
2110
2111PyObject*
2112_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2113{
2114    const unsigned char *s = (const unsigned char *)buffer;
2115    PyObject *unicode;
2116    if (size == 1) {
2117#ifdef Py_DEBUG
2118        assert((unsigned char)s[0] < 128);
2119#endif
2120        return get_latin1_char(s[0]);
2121    }
2122    unicode = PyUnicode_New(size, 127);
2123    if (!unicode)
2124        return NULL;
2125    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2126    assert(_PyUnicode_CheckConsistency(unicode, 1));
2127    return unicode;
2128}
2129
2130static Py_UCS4
2131kind_maxchar_limit(unsigned int kind)
2132{
2133    switch (kind) {
2134    case PyUnicode_1BYTE_KIND:
2135        return 0x80;
2136    case PyUnicode_2BYTE_KIND:
2137        return 0x100;
2138    case PyUnicode_4BYTE_KIND:
2139        return 0x10000;
2140    default:
2141        assert(0 && "invalid kind");
2142        return MAX_UNICODE;
2143    }
2144}
2145
2146static inline Py_UCS4
2147align_maxchar(Py_UCS4 maxchar)
2148{
2149    if (maxchar <= 127)
2150        return 127;
2151    else if (maxchar <= 255)
2152        return 255;
2153    else if (maxchar <= 65535)
2154        return 65535;
2155    else
2156        return MAX_UNICODE;
2157}
2158
2159static PyObject*
2160_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2161{
2162    PyObject *res;
2163    unsigned char max_char;
2164
2165    if (size == 0)
2166        _Py_RETURN_UNICODE_EMPTY();
2167    assert(size > 0);
2168    if (size == 1)
2169        return get_latin1_char(u[0]);
2170
2171    max_char = ucs1lib_find_max_char(u, u + size);
2172    res = PyUnicode_New(size, max_char);
2173    if (!res)
2174        return NULL;
2175    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2176    assert(_PyUnicode_CheckConsistency(res, 1));
2177    return res;
2178}
2179
2180static PyObject*
2181_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2182{
2183    PyObject *res;
2184    Py_UCS2 max_char;
2185
2186    if (size == 0)
2187        _Py_RETURN_UNICODE_EMPTY();
2188    assert(size > 0);
2189    if (size == 1)
2190        return unicode_char(u[0]);
2191
2192    max_char = ucs2lib_find_max_char(u, u + size);
2193    res = PyUnicode_New(size, max_char);
2194    if (!res)
2195        return NULL;
2196    if (max_char >= 256)
2197        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2198    else {
2199        _PyUnicode_CONVERT_BYTES(
2200            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2201    }
2202    assert(_PyUnicode_CheckConsistency(res, 1));
2203    return res;
2204}
2205
2206static PyObject*
2207_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2208{
2209    PyObject *res;
2210    Py_UCS4 max_char;
2211
2212    if (size == 0)
2213        _Py_RETURN_UNICODE_EMPTY();
2214    assert(size > 0);
2215    if (size == 1)
2216        return unicode_char(u[0]);
2217
2218    max_char = ucs4lib_find_max_char(u, u + size);
2219    res = PyUnicode_New(size, max_char);
2220    if (!res)
2221        return NULL;
2222    if (max_char < 256)
2223        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2224                                 PyUnicode_1BYTE_DATA(res));
2225    else if (max_char < 0x10000)
2226        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2227                                 PyUnicode_2BYTE_DATA(res));
2228    else
2229        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2230    assert(_PyUnicode_CheckConsistency(res, 1));
2231    return res;
2232}
2233
2234PyObject*
2235PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2236{
2237    if (size < 0) {
2238        PyErr_SetString(PyExc_ValueError, "size must be positive");
2239        return NULL;
2240    }
2241    switch (kind) {
2242    case PyUnicode_1BYTE_KIND:
2243        return _PyUnicode_FromUCS1(buffer, size);
2244    case PyUnicode_2BYTE_KIND:
2245        return _PyUnicode_FromUCS2(buffer, size);
2246    case PyUnicode_4BYTE_KIND:
2247        return _PyUnicode_FromUCS4(buffer, size);
2248    default:
2249        PyErr_SetString(PyExc_SystemError, "invalid kind");
2250        return NULL;
2251    }
2252}
2253
2254Py_UCS4
2255_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2256{
2257    enum PyUnicode_Kind kind;
2258    void *startptr, *endptr;
2259
2260    assert(PyUnicode_IS_READY(unicode));
2261    assert(0 <= start);
2262    assert(end <= PyUnicode_GET_LENGTH(unicode));
2263    assert(start <= end);
2264
2265    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2266        return PyUnicode_MAX_CHAR_VALUE(unicode);
2267
2268    if (start == end)
2269        return 127;
2270
2271    if (PyUnicode_IS_ASCII(unicode))
2272        return 127;
2273
2274    kind = PyUnicode_KIND(unicode);
2275    startptr = PyUnicode_DATA(unicode);
2276    endptr = (char *)startptr + end * kind;
2277    startptr = (char *)startptr + start * kind;
2278    switch(kind) {
2279    case PyUnicode_1BYTE_KIND:
2280        return ucs1lib_find_max_char(startptr, endptr);
2281    case PyUnicode_2BYTE_KIND:
2282        return ucs2lib_find_max_char(startptr, endptr);
2283    case PyUnicode_4BYTE_KIND:
2284        return ucs4lib_find_max_char(startptr, endptr);
2285    default:
2286        assert(0);
2287        return 0;
2288    }
2289}
2290
2291/* Ensure that a string uses the most efficient storage, if it is not the
2292   case: create a new string with of the right kind. Write NULL into *p_unicode
2293   on error. */
2294static void
2295unicode_adjust_maxchar(PyObject **p_unicode)
2296{
2297    PyObject *unicode, *copy;
2298    Py_UCS4 max_char;
2299    Py_ssize_t len;
2300    unsigned int kind;
2301
2302    assert(p_unicode != NULL);
2303    unicode = *p_unicode;
2304    assert(PyUnicode_IS_READY(unicode));
2305    if (PyUnicode_IS_ASCII(unicode))
2306        return;
2307
2308    len = PyUnicode_GET_LENGTH(unicode);
2309    kind = PyUnicode_KIND(unicode);
2310    if (kind == PyUnicode_1BYTE_KIND) {
2311        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2312        max_char = ucs1lib_find_max_char(u, u + len);
2313        if (max_char >= 128)
2314            return;
2315    }
2316    else if (kind == PyUnicode_2BYTE_KIND) {
2317        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2318        max_char = ucs2lib_find_max_char(u, u + len);
2319        if (max_char >= 256)
2320            return;
2321    }
2322    else {
2323        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2324        assert(kind == PyUnicode_4BYTE_KIND);
2325        max_char = ucs4lib_find_max_char(u, u + len);
2326        if (max_char >= 0x10000)
2327            return;
2328    }
2329    copy = PyUnicode_New(len, max_char);
2330    if (copy != NULL)
2331        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2332    Py_DECREF(unicode);
2333    *p_unicode = copy;
2334}
2335
2336PyObject*
2337_PyUnicode_Copy(PyObject *unicode)
2338{
2339    Py_ssize_t length;
2340    PyObject *copy;
2341
2342    if (!PyUnicode_Check(unicode)) {
2343        PyErr_BadInternalCall();
2344        return NULL;
2345    }
2346    if (PyUnicode_READY(unicode) == -1)
2347        return NULL;
2348
2349    length = PyUnicode_GET_LENGTH(unicode);
2350    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2351    if (!copy)
2352        return NULL;
2353    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2354
2355    memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2356              length * PyUnicode_KIND(unicode));
2357    assert(_PyUnicode_CheckConsistency(copy, 1));
2358    return copy;
2359}
2360
2361
2362/* Widen Unicode objects to larger buffers. Don't write terminating null
2363   character. Return NULL on error. */
2364
2365void*
2366_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2367{
2368    Py_ssize_t len;
2369    void *result;
2370    unsigned int skind;
2371
2372    if (PyUnicode_READY(s) == -1)
2373        return NULL;
2374
2375    len = PyUnicode_GET_LENGTH(s);
2376    skind = PyUnicode_KIND(s);
2377    if (skind >= kind) {
2378        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2379        return NULL;
2380    }
2381    switch (kind) {
2382    case PyUnicode_2BYTE_KIND:
2383        result = PyMem_New(Py_UCS2, len);
2384        if (!result)
2385            return PyErr_NoMemory();
2386        assert(skind == PyUnicode_1BYTE_KIND);
2387        _PyUnicode_CONVERT_BYTES(
2388            Py_UCS1, Py_UCS2,
2389            PyUnicode_1BYTE_DATA(s),
2390            PyUnicode_1BYTE_DATA(s) + len,
2391            result);
2392        return result;
2393    case PyUnicode_4BYTE_KIND:
2394        result = PyMem_New(Py_UCS4, len);
2395        if (!result)
2396            return PyErr_NoMemory();
2397        if (skind == PyUnicode_2BYTE_KIND) {
2398            _PyUnicode_CONVERT_BYTES(
2399                Py_UCS2, Py_UCS4,
2400                PyUnicode_2BYTE_DATA(s),
2401                PyUnicode_2BYTE_DATA(s) + len,
2402                result);
2403        }
2404        else {
2405            assert(skind == PyUnicode_1BYTE_KIND);
2406            _PyUnicode_CONVERT_BYTES(
2407                Py_UCS1, Py_UCS4,
2408                PyUnicode_1BYTE_DATA(s),
2409                PyUnicode_1BYTE_DATA(s) + len,
2410                result);
2411        }
2412        return result;
2413    default:
2414        break;
2415    }
2416    PyErr_SetString(PyExc_SystemError, "invalid kind");
2417    return NULL;
2418}
2419
2420static Py_UCS4*
2421as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2422        int copy_null)
2423{
2424    int kind;
2425    void *data;
2426    Py_ssize_t len, targetlen;
2427    if (PyUnicode_READY(string) == -1)
2428        return NULL;
2429    kind = PyUnicode_KIND(string);
2430    data = PyUnicode_DATA(string);
2431    len = PyUnicode_GET_LENGTH(string);
2432    targetlen = len;
2433    if (copy_null)
2434        targetlen++;
2435    if (!target) {
2436        target = PyMem_New(Py_UCS4, targetlen);
2437        if (!target) {
2438            PyErr_NoMemory();
2439            return NULL;
2440        }
2441    }
2442    else {
2443        if (targetsize < targetlen) {
2444            PyErr_Format(PyExc_SystemError,
2445                         "string is longer than the buffer");
2446            if (copy_null && 0 < targetsize)
2447                target[0] = 0;
2448            return NULL;
2449        }
2450    }
2451    if (kind == PyUnicode_1BYTE_KIND) {
2452        Py_UCS1 *start = (Py_UCS1 *) data;
2453        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2454    }
2455    else if (kind == PyUnicode_2BYTE_KIND) {
2456        Py_UCS2 *start = (Py_UCS2 *) data;
2457        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2458    }
2459    else {
2460        assert(kind == PyUnicode_4BYTE_KIND);
2461        memcpy(target, data, len * sizeof(Py_UCS4));
2462    }
2463    if (copy_null)
2464        target[len] = 0;
2465    return target;
2466}
2467
2468Py_UCS4*
2469PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2470                 int copy_null)
2471{
2472    if (target == NULL || targetsize < 0) {
2473        PyErr_BadInternalCall();
2474        return NULL;
2475    }
2476    return as_ucs4(string, target, targetsize, copy_null);
2477}
2478
2479Py_UCS4*
2480PyUnicode_AsUCS4Copy(PyObject *string)
2481{
2482    return as_ucs4(string, NULL, 0, 1);
2483}
2484
2485#ifdef HAVE_WCHAR_H
2486
2487PyObject *
2488PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2489{
2490    if (w == NULL) {
2491        if (size == 0)
2492            _Py_RETURN_UNICODE_EMPTY();
2493        PyErr_BadInternalCall();
2494        return NULL;
2495    }
2496
2497    if (size == -1) {
2498        size = wcslen(w);
2499    }
2500
2501    return PyUnicode_FromUnicode(w, size);
2502}
2503
2504#endif /* HAVE_WCHAR_H */
2505
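/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In integer arithmetic, 1 + (SIZEOF_LONG_LONG*53-1)/22 equals
   ceil(SIZEOF_LONG_LONG*53/22); the remaining 1 is for the sign. */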
2509#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2510
2511static int
2512unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2513                             Py_ssize_t width, Py_ssize_t precision)
2514{
2515    Py_ssize_t length, fill, arglen;
2516    Py_UCS4 maxchar;
2517
2518    if (PyUnicode_READY(str) == -1)
2519        return -1;
2520
2521    length = PyUnicode_GET_LENGTH(str);
2522    if ((precision == -1 || precision >= length)
2523        && width <= length)
2524        return _PyUnicodeWriter_WriteStr(writer, str);
2525
2526    if (precision != -1)
2527        length = Py_MIN(precision, length);
2528
2529    arglen = Py_MAX(length, width);
2530    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532    else
2533        maxchar = writer->maxchar;
2534
2535    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536        return -1;
2537
2538    if (width > length) {
2539        fill = width - length;
2540        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541            return -1;
2542        writer->pos += fill;
2543    }
2544
2545    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546                                  str, 0, length);
2547    writer->pos += length;
2548    return 0;
2549}
2550
2551static int
2552unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2553                              Py_ssize_t width, Py_ssize_t precision)
2554{
2555    /* UTF-8 */
2556    Py_ssize_t length;
2557    PyObject *unicode;
2558    int res;
2559
2560    length = strlen(str);
2561    if (precision != -1)
2562        length = Py_MIN(length, precision);
2563    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2564    if (unicode == NULL)
2565        return -1;
2566
2567    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2568    Py_DECREF(unicode);
2569    return res;
2570}
2571
/* Process a single '%' directive of a PyUnicode_FromFormat-style format.

   `f` points at the '%' that starts the directive.  The function parses an
   optional '0' flag, width, '.precision' and length modifier ('l', 'll' or
   'z'), fetches the matching argument(s) from `*vargs`, and appends the
   formatted text to `writer`.

   Return a pointer to the first character after the directive, or NULL
   with an exception set on error.  For an unknown conversion character the
   rest of the format string, starting at the '%', is copied to the output
   verbatim and a pointer to its terminating null is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember where the directive starts: the fallback for unknown
       format codes copies the format string from here */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    /* -1 means "not specified" for both width and precision */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* refuse the next digit if it would overflow Py_ssize_t */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* refuse the next digit if it would overflow Py_ssize_t */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %li, %lld, %llu and %lli. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* the conversion character is the last character of the format
       string: turn off overallocation in the writer */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single character, passed as an int code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* format the number into the local buffer; the argument type read
           from the va_list depends on the length modifier seen above */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
            else if (longlongflag)
                len = sprintf(buffer, "%llu",
                        va_arg(*vargs, unsigned long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* %x always reads an int; the length-modifier parsing above
               only recognizes modifiers followed by d, u or i */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
            else if (longlongflag)
                len = sprintf(buffer, "%lli",
                        va_arg(*vargs, long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* precision acts as a minimum number of characters here; this also
           replaces an unspecified precision (-1) by the formatted length */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* left padding up to the width: '0' if the zero flag was given,
           otherwise spaces */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* zero padding up to the precision */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        /* a pointer; width and precision are not applied in this case */
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* shift the text (including its null byte) right by two and
               prepend "0x" */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* a str object, written as is */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* consumes two arguments: a str object that may be NULL and a C
           string that is used when the object is NULL */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* the result of PyObject_Str() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* the result of PyObject_Repr() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* the result of PyObject_ASCII() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* literal percent sign; consumes no argument */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* step over the conversion character */
    f++;
    return f;
}
2869
2870PyObject *
2871PyUnicode_FromFormatV(const char *format, va_list vargs)
2872{
2873    va_list vargs2;
2874    const char *f;
2875    _PyUnicodeWriter writer;
2876
2877    _PyUnicodeWriter_Init(&writer);
2878    writer.min_length = strlen(format) + 100;
2879    writer.overallocate = 1;
2880
2881    // Copy varags to be able to pass a reference to a subfunction.
2882    va_copy(vargs2, vargs);
2883
2884    for (f = format; *f; ) {
2885        if (*f == '%') {
2886            f = unicode_fromformat_arg(&writer, f, &vargs2);
2887            if (f == NULL)
2888                goto fail;
2889        }
2890        else {
2891            const char *p;
2892            Py_ssize_t len;
2893
2894            p = f;
2895            do
2896            {
2897                if ((unsigned char)*p > 127) {
2898                    PyErr_Format(PyExc_ValueError,
2899                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2900                        "string, got a non-ASCII byte: 0x%02x",
2901                        (unsigned char)*p);
2902                    goto fail;
2903                }
2904                p++;
2905            }
2906            while (*p != '\0' && *p != '%');
2907            len = p - f;
2908
2909            if (*p == '\0')
2910                writer.overallocate = 0;
2911
2912            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2913                goto fail;
2914
2915            f = p;
2916        }
2917    }
2918    va_end(vargs2);
2919    return _PyUnicodeWriter_Finish(&writer);
2920
2921  fail:
2922    va_end(vargs2);
2923    _PyUnicodeWriter_Dealloc(&writer);
2924    return NULL;
2925}
2926
2927PyObject *
2928PyUnicode_FromFormat(const char *format, ...)
2929{
2930    PyObject* ret;
2931    va_list vargs;
2932
2933#ifdef HAVE_STDARG_PROTOTYPES
2934    va_start(vargs, format);
2935#else
2936    va_start(vargs);
2937#endif
2938    ret = PyUnicode_FromFormatV(format, vargs);
2939    va_end(vargs);
2940    return ret;
2941}
2942
2943#ifdef HAVE_WCHAR_H
2944
2945/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2946   convert a Unicode object to a wide character string.
2947
2948   - If w is NULL: return the number of wide characters (including the null
2949     character) required to convert the unicode object. Ignore size argument.
2950
2951   - Otherwise: return the number of wide characters (excluding the null
2952     character) written into w. Write at most size wide characters (including
2953     the null character). */
2954static Py_ssize_t
2955unicode_aswidechar(PyObject *unicode,
2956                   wchar_t *w,
2957                   Py_ssize_t size)
2958{
2959    Py_ssize_t res;
2960    const wchar_t *wstr;
2961
2962    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2963    if (wstr == NULL)
2964        return -1;
2965
2966    if (w != NULL) {
2967        if (size > res)
2968            size = res + 1;
2969        else
2970            res = size;
2971        memcpy(w, wstr, size * sizeof(wchar_t));
2972        return res;
2973    }
2974    else
2975        return res + 1;
2976}
2977
2978Py_ssize_t
2979PyUnicode_AsWideChar(PyObject *unicode,
2980                     wchar_t *w,
2981                     Py_ssize_t size)
2982{
2983    if (unicode == NULL) {
2984        PyErr_BadInternalCall();
2985        return -1;
2986    }
2987    return unicode_aswidechar(unicode, w, size);
2988}
2989
2990wchar_t*
2991PyUnicode_AsWideCharString(PyObject *unicode,
2992                           Py_ssize_t *size)
2993{
2994    wchar_t* buffer;
2995    Py_ssize_t buflen;
2996
2997    if (unicode == NULL) {
2998        PyErr_BadInternalCall();
2999        return NULL;
3000    }
3001
3002    buflen = unicode_aswidechar(unicode, NULL, 0);
3003    if (buflen == -1)
3004        return NULL;
3005    buffer = PyMem_NEW(wchar_t, buflen);
3006    if (buffer == NULL) {
3007        PyErr_NoMemory();
3008        return NULL;
3009    }
3010    buflen = unicode_aswidechar(unicode, buffer, buflen);
3011    if (buflen == -1) {
3012        PyMem_FREE(buffer);
3013        return NULL;
3014    }
3015    if (size != NULL)
3016        *size = buflen;
3017    return buffer;
3018}
3019
3020#endif /* HAVE_WCHAR_H */
3021
3022PyObject *
3023PyUnicode_FromOrdinal(int ordinal)
3024{
3025    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3026        PyErr_SetString(PyExc_ValueError,
3027                        "chr() arg not in range(0x110000)");
3028        return NULL;
3029    }
3030
3031    return unicode_char((Py_UCS4)ordinal);
3032}
3033
3034PyObject *
3035PyUnicode_FromObject(PyObject *obj)
3036{
3037    /* XXX Perhaps we should make this API an alias of
3038       PyObject_Str() instead ?! */
3039    if (PyUnicode_CheckExact(obj)) {
3040        if (PyUnicode_READY(obj) == -1)
3041            return NULL;
3042        Py_INCREF(obj);
3043        return obj;
3044    }
3045    if (PyUnicode_Check(obj)) {
3046        /* For a Unicode subtype that's not a Unicode object,
3047           return a true Unicode object with the same data. */
3048        return _PyUnicode_Copy(obj);
3049    }
3050    PyErr_Format(PyExc_TypeError,
3051                 "Can't convert '%.100s' object to str implicitly",
3052                 Py_TYPE(obj)->tp_name);
3053    return NULL;
3054}
3055
3056PyObject *
3057PyUnicode_FromEncodedObject(PyObject *obj,
3058                            const char *encoding,
3059                            const char *errors)
3060{
3061    Py_buffer buffer;
3062    PyObject *v;
3063
3064    if (obj == NULL) {
3065        PyErr_BadInternalCall();
3066        return NULL;
3067    }
3068
3069    /* Decoding bytes objects is the most common case and should be fast */
3070    if (PyBytes_Check(obj)) {
3071        if (PyBytes_GET_SIZE(obj) == 0)
3072            _Py_RETURN_UNICODE_EMPTY();
3073        v = PyUnicode_Decode(
3074                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3075                encoding, errors);
3076        return v;
3077    }
3078
3079    if (PyUnicode_Check(obj)) {
3080        PyErr_SetString(PyExc_TypeError,
3081                        "decoding str is not supported");
3082        return NULL;
3083    }
3084
3085    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3086    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3087        PyErr_Format(PyExc_TypeError,
3088                     "decoding to str: need a bytes-like object, %.80s found",
3089                     Py_TYPE(obj)->tp_name);
3090        return NULL;
3091    }
3092
3093    if (buffer.len == 0) {
3094        PyBuffer_Release(&buffer);
3095        _Py_RETURN_UNICODE_EMPTY();
3096    }
3097
3098    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3099    PyBuffer_Release(&buffer);
3100    return v;
3101}
3102
/* Normalize an encoding name into the caller-supplied buffer `lower`.

   Similar to encodings.normalize_encoding(), but the result is also
   converted to lowercase:

   - characters accepted by Py_ISALNUM() and '.' are kept (after
     Py_TOLOWER());
   - every run of other characters between two kept characters becomes a
     single '_'; leading and trailing runs are dropped.

   encoding:  null-terminated input name, must not be NULL.
   lower:     output buffer.
   lower_len: size of `lower` in chars, including room for the terminator.

   Return 1 on success (`lower` is null-terminated), or 0 on error: the
   normalized name is longer than lower_len-1, or lower_len is 0 (there is
   not even room for the terminator; `lower` is left untouched). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* Without this guard lower_len - 1 would wrap around and the bound
       check below would never trigger. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the terminator */
    punct = 0;                       /* pending separator run? */

    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Separator: remember it, emit nothing yet. */
            punct = 1;
            continue;
        }

        /* Flush a pending separator, except at the very beginning. */
        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }

    *l = '\0';
    return 1;
}
3151
3152PyObject *
3153PyUnicode_Decode(const char *s,
3154                 Py_ssize_t size,
3155                 const char *encoding,
3156                 const char *errors)
3157{
3158    PyObject *buffer = NULL, *unicode;
3159    Py_buffer info;
3160    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3161
3162    if (encoding == NULL) {
3163        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3164    }
3165
3166    /* Shortcuts for common default encodings */
3167    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3168        char *lower = buflower;
3169
3170        /* Fast paths */
3171        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3172            lower += 3;
3173            if (*lower == '_') {
3174                /* Match "utf8" and "utf_8" */
3175                lower++;
3176            }
3177
3178            if (lower[0] == '8' && lower[1] == 0) {
3179                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3180            }
3181            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3182                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3183            }
3184            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3185                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3186            }
3187        }
3188        else {
3189            if (strcmp(lower, "ascii") == 0
3190                || strcmp(lower, "us_ascii") == 0) {
3191                return PyUnicode_DecodeASCII(s, size, errors);
3192            }
3193    #ifdef MS_WINDOWS
3194            else if (strcmp(lower, "mbcs") == 0) {
3195                return PyUnicode_DecodeMBCS(s, size, errors);
3196            }
3197    #endif
3198            else if (strcmp(lower, "latin1") == 0
3199                     || strcmp(lower, "latin_1") == 0
3200                     || strcmp(lower, "iso_8859_1") == 0
3201                     || strcmp(lower, "iso8859_1") == 0) {
3202                return PyUnicode_DecodeLatin1(s, size, errors);
3203            }
3204        }
3205    }
3206
3207    /* Decode via the codec registry */
3208    buffer = NULL;
3209    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3210        goto onError;
3211    buffer = PyMemoryView_FromBuffer(&info);
3212    if (buffer == NULL)
3213        goto onError;
3214    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3215    if (unicode == NULL)
3216        goto onError;
3217    if (!PyUnicode_Check(unicode)) {
3218        PyErr_Format(PyExc_TypeError,
3219                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3220                     "use codecs.decode() to decode to arbitrary types",
3221                     encoding,
3222                     Py_TYPE(unicode)->tp_name);
3223        Py_DECREF(unicode);
3224        goto onError;
3225    }
3226    Py_DECREF(buffer);
3227    return unicode_result(unicode);
3228
3229  onError:
3230    Py_XDECREF(buffer);
3231    return NULL;
3232}
3233
3234PyObject *
3235PyUnicode_AsDecodedObject(PyObject *unicode,
3236                          const char *encoding,
3237                          const char *errors)
3238{
3239    if (!PyUnicode_Check(unicode)) {
3240        PyErr_BadArgument();
3241        return NULL;
3242    }
3243
3244    if (PyErr_WarnEx(PyExc_DeprecationWarning,
3245                     "PyUnicode_AsDecodedObject() is deprecated; "
3246                     "use PyCodec_Decode() to decode from str", 1) < 0)
3247        return NULL;
3248
3249    if (encoding == NULL)
3250        encoding = PyUnicode_GetDefaultEncoding();
3251
3252    /* Decode via the codec registry */
3253    return PyCodec_Decode(unicode, encoding, errors);
3254}
3255
3256PyObject *
3257PyUnicode_AsDecodedUnicode(PyObject *unicode,
3258                           const char *encoding,
3259                           const char *errors)
3260{
3261    PyObject *v;
3262
3263    if (!PyUnicode_Check(unicode)) {
3264        PyErr_BadArgument();
3265        goto onError;
3266    }
3267
3268    if (PyErr_WarnEx(PyExc_DeprecationWarning,
3269                     "PyUnicode_AsDecodedUnicode() is deprecated; "
3270                     "use PyCodec_Decode() to decode from str to str", 1) < 0)
3271        return NULL;
3272
3273    if (encoding == NULL)
3274        encoding = PyUnicode_GetDefaultEncoding();
3275
3276    /* Decode via the codec registry */
3277    v = PyCodec_Decode(unicode, encoding, errors);
3278    if (v == NULL)
3279        goto onError;
3280    if (!PyUnicode_Check(v)) {
3281        PyErr_Format(PyExc_TypeError,
3282                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3283                     "use codecs.decode() to decode to arbitrary types",
3284                     encoding,
3285                     Py_TYPE(unicode)->tp_name);
3286        Py_DECREF(v);
3287        goto onError;
3288    }
3289    return unicode_result(v);
3290
3291  onError:
3292    return NULL;
3293}
3294
3295PyObject *
3296PyUnicode_Encode(const Py_UNICODE *s,
3297                 Py_ssize_t size,
3298                 const char *encoding,
3299                 const char *errors)
3300{
3301    PyObject *v, *unicode;
3302
3303    unicode = PyUnicode_FromUnicode(s, size);
3304    if (unicode == NULL)
3305        return NULL;
3306    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3307    Py_DECREF(unicode);
3308    return v;
3309}
3310
3311PyObject *
3312PyUnicode_AsEncodedObject(PyObject *unicode,
3313                          const char *encoding,
3314                          const char *errors)
3315{
3316    PyObject *v;
3317
3318    if (!PyUnicode_Check(unicode)) {
3319        PyErr_BadArgument();
3320        goto onError;
3321    }
3322
3323    if (PyErr_WarnEx(PyExc_DeprecationWarning,
3324                     "PyUnicode_AsEncodedObject() is deprecated; "
3325                     "use PyUnicode_AsEncodedString() to encode from str to bytes "
3326                     "or PyCodec_Encode() for generic encoding", 1) < 0)
3327        return NULL;
3328
3329    if (encoding == NULL)
3330        encoding = PyUnicode_GetDefaultEncoding();
3331
3332    /* Encode via the codec registry */
3333    v = PyCodec_Encode(unicode, encoding, errors);
3334    if (v == NULL)
3335        goto onError;
3336    return v;
3337
3338  onError:
3339    return NULL;
3340}
3341
/* Locate the first wide character of `wstr` that wcstombs() cannot
   convert.

   The string is converted one character at a time (a surrogate pair
   counts as one character when wchar_t is 16 bits wide).  Return the
   index, in wchar_t units from the start of `wstr`, of the first
   character for which wcstombs() fails, or 0 if every character
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (two units for a surrogate pair) plus terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0};
#else
    wchar_t unit[2] = {0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *char_start = cursor;

        unit[0] = *cursor++;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*cursor))
        {
            unit[1] = *cursor++;
            unit[2] = 0;
        }
        else {
            unit[1] = 0;
        }
#else
        unit[1] = 0;
#endif

        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1) {
            return char_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3388
3389static int
3390locale_error_handler(const char *errors, int *surrogateescape)
3391{
3392    _Py_error_handler error_handler = get_error_handler(errors);
3393    switch (error_handler)
3394    {
3395    case _Py_ERROR_STRICT:
3396        *surrogateescape = 0;
3397        return 0;
3398    case _Py_ERROR_SURROGATEESCAPE:
3399        *surrogateescape = 1;
3400        return 0;
3401    default:
3402        PyErr_Format(PyExc_ValueError,
3403                     "only 'strict' and 'surrogateescape' error handlers "
3404                     "are supported, not '%s'",
3405                     errors);
3406        return -1;
3407    }
3408}
3409
3410PyObject *
3411PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3412{
3413    Py_ssize_t wlen, wlen2;
3414    wchar_t *wstr;
3415    PyObject *bytes = NULL;
3416    char *errmsg;
3417    PyObject *reason = NULL;
3418    PyObject *exc;
3419    size_t error_pos;
3420    int surrogateescape;
3421
3422    if (locale_error_handler(errors, &surrogateescape) < 0)
3423        return NULL;
3424
3425    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3426    if (wstr == NULL)
3427        return NULL;
3428
3429    wlen2 = wcslen(wstr);
3430    if (wlen2 != wlen) {
3431        PyMem_Free(wstr);
3432        PyErr_SetString(PyExc_ValueError, "embedded null character");
3433        return NULL;
3434    }
3435
3436    if (surrogateescape) {
3437        /* "surrogateescape" error handler */
3438        char *str;
3439
3440        str = Py_EncodeLocale(wstr, &error_pos);
3441        if (str == NULL) {
3442            if (error_pos == (size_t)-1) {
3443                PyErr_NoMemory();
3444                PyMem_Free(wstr);
3445                return NULL;
3446            }
3447            else {
3448                goto encode_error;
3449            }
3450        }
3451        PyMem_Free(wstr);
3452
3453        bytes = PyBytes_FromString(str);
3454        PyMem_Free(str);
3455    }
3456    else {
3457        /* strict mode */
3458        size_t len, len2;
3459
3460        len = wcstombs(NULL, wstr, 0);
3461        if (len == (size_t)-1) {
3462            error_pos = (size_t)-1;
3463            goto encode_error;
3464        }
3465
3466        bytes = PyBytes_FromStringAndSize(NULL, len);
3467        if (bytes == NULL) {
3468            PyMem_Free(wstr);
3469            return NULL;
3470        }
3471
3472        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3473        if (len2 == (size_t)-1 || len2 > len) {
3474            error_pos = (size_t)-1;
3475            goto encode_error;
3476        }
3477        PyMem_Free(wstr);
3478    }
3479    return bytes;
3480
3481encode_error:
3482    errmsg = strerror(errno);
3483    assert(errmsg != NULL);
3484
3485    if (error_pos == (size_t)-1)
3486        error_pos = wcstombs_errorpos(wstr);
3487
3488    PyMem_Free(wstr);
3489    Py_XDECREF(bytes);
3490
3491    if (errmsg != NULL) {
3492        size_t errlen;
3493        wstr = Py_DecodeLocale(errmsg, &errlen);
3494        if (wstr != NULL) {
3495            reason = PyUnicode_FromWideChar(wstr, errlen);
3496            PyMem_RawFree(wstr);
3497        } else
3498            errmsg = NULL;
3499    }
3500    if (errmsg == NULL)
3501        reason = PyUnicode_FromString(
3502            "wcstombs() encountered an unencodable "
3503            "wide character");
3504    if (reason == NULL)
3505        return NULL;
3506
3507    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3508                                "locale", unicode,
3509                                (Py_ssize_t)error_pos,
3510                                (Py_ssize_t)(error_pos+1),
3511                                reason);
3512    Py_DECREF(reason);
3513    if (exc != NULL) {
3514        PyCodec_StrictErrors(exc);
3515        Py_XDECREF(exc);
3516    }
3517    return NULL;
3518}
3519
3520PyObject *
3521PyUnicode_EncodeFSDefault(PyObject *unicode)
3522{
3523#if defined(__APPLE__)
3524    return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3525#else
3526    PyInterpreterState *interp = PyThreadState_GET()->interp;
3527    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3528       cannot use it to encode and decode filenames before it is loaded. Load
3529       the Python codec requires to encode at least its own filename. Use the C
3530       version of the locale codec until the codec registry is initialized and
3531       the Python codec is loaded.
3532
3533       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3534       cannot only rely on it: check also interp->fscodec_initialized for
3535       subinterpreters. */
3536    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3537        return PyUnicode_AsEncodedString(unicode,
3538                                         Py_FileSystemDefaultEncoding,
3539                                         Py_FileSystemDefaultEncodeErrors);
3540    }
3541    else {
3542        return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
3543    }
3544#endif
3545}
3546
3547PyObject *
3548PyUnicode_AsEncodedString(PyObject *unicode,
3549                          const char *encoding,
3550                          const char *errors)
3551{
3552    PyObject *v;
3553    char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3554
3555    if (!PyUnicode_Check(unicode)) {
3556        PyErr_BadArgument();
3557        return NULL;
3558    }
3559
3560    if (encoding == NULL) {
3561        return _PyUnicode_AsUTF8String(unicode, errors);
3562    }
3563
3564    /* Shortcuts for common default encodings */
3565    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3566        char *lower = buflower;
3567
3568        /* Fast paths */
3569        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3570            lower += 3;
3571            if (*lower == '_') {
3572                /* Match "utf8" and "utf_8" */
3573                lower++;
3574            }
3575
3576            if (lower[0] == '8' && lower[1] == 0) {
3577                return _PyUnicode_AsUTF8String(unicode, errors);
3578            }
3579            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3580                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3581            }
3582            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3583                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3584            }
3585        }
3586        else {
3587            if (strcmp(lower, "ascii") == 0
3588                || strcmp(lower, "us_ascii") == 0) {
3589                return _PyUnicode_AsASCIIString(unicode, errors);
3590            }
3591#ifdef MS_WINDOWS
3592            else if (strcmp(lower, "mbcs") == 0) {
3593                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3594            }
3595#endif
3596            else if (strcmp(lower, "latin1") == 0 ||
3597                     strcmp(lower, "latin_1") == 0 ||
3598                     strcmp(lower, "iso_8859_1") == 0 ||
3599                     strcmp(lower, "iso8859_1") == 0) {
3600                return _PyUnicode_AsLatin1String(unicode, errors);
3601            }
3602        }
3603    }
3604
3605    /* Encode via the codec registry */
3606    v = _PyCodec_EncodeText(unicode, encoding, errors);
3607    if (v == NULL)
3608        return NULL;
3609
3610    /* The normal path */
3611    if (PyBytes_Check(v))
3612        return v;
3613
3614    /* If the codec returns a buffer, raise a warning and convert to bytes */
3615    if (PyByteArray_Check(v)) {
3616        int error;
3617        PyObject *b;
3618
3619        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3620            "encoder %s returned bytearray instead of bytes; "
3621            "use codecs.encode() to encode to arbitrary types",
3622            encoding);
3623        if (error) {
3624            Py_DECREF(v);
3625            return NULL;
3626        }
3627
3628        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3629        Py_DECREF(v);
3630        return b;
3631    }
3632
3633    PyErr_Format(PyExc_TypeError,
3634                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3635                 "use codecs.encode() to encode to arbitrary types",
3636                 encoding,
3637                 Py_TYPE(v)->tp_name);
3638    Py_DECREF(v);
3639    return NULL;
3640}
3641
3642PyObject *
3643PyUnicode_AsEncodedUnicode(PyObject *unicode,
3644                           const char *encoding,
3645                           const char *errors)
3646{
3647    PyObject *v;
3648
3649    if (!PyUnicode_Check(unicode)) {
3650        PyErr_BadArgument();
3651        goto onError;
3652    }
3653
3654    if (PyErr_WarnEx(PyExc_DeprecationWarning,
3655                     "PyUnicode_AsEncodedUnicode() is deprecated; "
3656                     "use PyCodec_Encode() to encode from str to str", 1) < 0)
3657        return NULL;
3658
3659    if (encoding == NULL)
3660        encoding = PyUnicode_GetDefaultEncoding();
3661
3662    /* Encode via the codec registry */
3663    v = PyCodec_Encode(unicode, encoding, errors);
3664    if (v == NULL)
3665        goto onError;
3666    if (!PyUnicode_Check(v)) {
3667        PyErr_Format(PyExc_TypeError,
3668                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
3669                     "use codecs.encode() to encode to arbitrary types",
3670                     encoding,
3671                     Py_TYPE(v)->tp_name);
3672        Py_DECREF(v);
3673        goto onError;
3674    }
3675    return v;
3676
3677  onError:
3678    return NULL;
3679}
3680
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() cannot convert.

   Return the byte offset of the invalid or incomplete sequence, or 0 if
   none is found.  When HAVE_MBRTOWC is not defined no scan is possible
   and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    mbstate_t state;
    const char *cursor = str;
    size_t remaining = len;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }

    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3711
3712PyObject*
3713PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3714                              const char *errors)
3715{
3716    wchar_t smallbuf[256];
3717    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3718    wchar_t *wstr;
3719    size_t wlen, wlen2;
3720    PyObject *unicode;
3721    int surrogateescape;
3722    size_t error_pos;
3723    char *errmsg;
3724    PyObject *reason = NULL;   /* initialize to prevent gcc warning */
3725    PyObject *exc;
3726
3727    if (locale_error_handler(errors, &surrogateescape) < 0)
3728        return NULL;
3729
3730    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3731        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3732        return NULL;
3733    }
3734
3735    if (surrogateescape) {
3736        /* "surrogateescape" error handler */
3737        wstr = Py_DecodeLocale(str, &wlen);
3738        if (wstr == NULL) {
3739            if (wlen == (size_t)-1)
3740                PyErr_NoMemory();
3741            else
3742                PyErr_SetFromErrno(PyExc_OSError);
3743            return NULL;
3744        }
3745
3746        unicode = PyUnicode_FromWideChar(wstr, wlen);
3747        PyMem_RawFree(wstr);
3748    }
3749    else {
3750        /* strict mode */
3751#ifndef HAVE_BROKEN_MBSTOWCS
3752        wlen = mbstowcs(NULL, str, 0);
3753#else
3754        wlen = len;
3755#endif
3756        if (wlen == (size_t)-1)
3757            goto decode_error;
3758        if (wlen+1 <= smallbuf_len) {
3759            wstr = smallbuf;
3760        }
3761        else {
3762            wstr = PyMem_New(wchar_t, wlen+1);
3763            if (!wstr)
3764                return PyErr_NoMemory();
3765        }
3766
3767        wlen2 = mbstowcs(wstr, str, wlen+1);
3768        if (wlen2 == (size_t)-1) {
3769            if (wstr != smallbuf)
3770                PyMem_Free(wstr);
3771            goto decode_error;
3772        }
3773#ifdef HAVE_BROKEN_MBSTOWCS
3774        assert(wlen2 == wlen);
3775#endif
3776        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3777        if (wstr != smallbuf)
3778            PyMem_Free(wstr);
3779    }
3780    return unicode;
3781
3782decode_error:
3783    reason = NULL;
3784    errmsg = strerror(errno);
3785    assert(errmsg != NULL);
3786
3787    error_pos = mbstowcs_errorpos(str, len);
3788    if (errmsg != NULL) {
3789        size_t errlen;
3790        wstr = Py_DecodeLocale(errmsg, &errlen);
3791        if (wstr != NULL) {
3792            reason = PyUnicode_FromWideChar(wstr, errlen);
3793            PyMem_RawFree(wstr);
3794        }
3795    }
3796    if (reason == NULL)
3797        reason = PyUnicode_FromString(
3798            "mbstowcs() encountered an invalid multibyte sequence");
3799    if (reason == NULL)
3800        return NULL;
3801
3802    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3803                                "locale", str, len,
3804                                (Py_ssize_t)error_pos,
3805                                (Py_ssize_t)(error_pos+1),
3806                                reason);
3807    Py_DECREF(reason);
3808    if (exc != NULL) {
3809        PyCodec_StrictErrors(exc);
3810        Py_XDECREF(exc);
3811    }
3812    return NULL;
3813}
3814
3815PyObject*
3816PyUnicode_DecodeLocale(const char *str, const char *errors)
3817{
3818    Py_ssize_t size = (Py_ssize_t)strlen(str);
3819    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3820}
3821
3822
3823PyObject*
3824PyUnicode_DecodeFSDefault(const char *s) {
3825    Py_ssize_t size = (Py_ssize_t)strlen(s);
3826    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3827}
3828
3829PyObject*
3830PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3831{
3832#if defined(__APPLE__)
3833    return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3834#else
3835    PyInterpreterState *interp = PyThreadState_GET()->interp;
3836    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3837       cannot use it to encode and decode filenames before it is loaded. Load
3838       the Python codec requires to encode at least its own filename. Use the C
3839       version of the locale codec until the codec registry is initialized and
3840       the Python codec is loaded.
3841
3842       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3843       cannot only rely on it: check also interp->fscodec_initialized for
3844       subinterpreters. */
3845    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3846        return PyUnicode_Decode(s, size,
3847                                Py_FileSystemDefaultEncoding,
3848                                Py_FileSystemDefaultEncodeErrors);
3849    }
3850    else {
3851        return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
3852    }
3853#endif
3854}
3855
3856
3857int
3858PyUnicode_FSConverter(PyObject* arg, void* addr)
3859{
3860    PyObject *path = NULL;
3861    PyObject *output = NULL;
3862    Py_ssize_t size;
3863    void *data;
3864    if (arg == NULL) {
3865        Py_DECREF(*(PyObject**)addr);
3866        *(PyObject**)addr = NULL;
3867        return 1;
3868    }
3869    path = PyOS_FSPath(arg);
3870    if (path == NULL) {
3871        return 0;
3872    }
3873    if (PyBytes_Check(path)) {
3874        output = path;
3875    }
3876    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
3877        output = PyUnicode_EncodeFSDefault(path);
3878        Py_DECREF(path);
3879        if (!output) {
3880            return 0;
3881        }
3882        assert(PyBytes_Check(output));
3883    }
3884
3885    size = PyBytes_GET_SIZE(output);
3886    data = PyBytes_AS_STRING(output);
3887    if ((size_t)size != strlen(data)) {
3888        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3889        Py_DECREF(output);
3890        return 0;
3891    }
3892    *(PyObject**)addr = output;
3893    return Py_CLEANUP_SUPPORTED;
3894}
3895
3896
3897int
3898PyUnicode_FSDecoder(PyObject* arg, void* addr)
3899{
3900    int is_buffer = 0;
3901    PyObject *path = NULL;
3902    PyObject *output = NULL;
3903    if (arg == NULL) {
3904        Py_DECREF(*(PyObject**)addr);
3905        return 1;
3906    }
3907
3908    is_buffer = PyObject_CheckBuffer(arg);
3909    if (!is_buffer) {
3910        path = PyOS_FSPath(arg);
3911        if (path == NULL) {
3912            return 0;
3913        }
3914    }
3915    else {
3916        path = arg;
3917        Py_INCREF(arg);
3918    }
3919
3920    if (PyUnicode_Check(path)) {
3921        if (PyUnicode_READY(path) == -1) {
3922            Py_DECREF(path);
3923            return 0;
3924        }
3925        output = path;
3926    }
3927    else if (PyBytes_Check(path) || is_buffer) {
3928        PyObject *path_bytes = NULL;
3929
3930        if (!PyBytes_Check(path) &&
3931            PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3932            "path should be string, bytes, or os.PathLike, not %.200s",
3933            Py_TYPE(arg)->tp_name)) {
3934                Py_DECREF(path);
3935            return 0;
3936        }
3937        path_bytes = PyBytes_FromObject(path);
3938        Py_DECREF(path);
3939        if (!path_bytes) {
3940            return 0;
3941        }
3942        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3943                                                  PyBytes_GET_SIZE(path_bytes));
3944        Py_DECREF(path_bytes);
3945        if (!output) {
3946            return 0;
3947        }
3948    }
3949    else {
3950        PyErr_Format(PyExc_TypeError,
3951                     "path should be string, bytes, or os.PathLike, not %.200s",
3952                     Py_TYPE(arg)->tp_name);
3953        Py_DECREF(path);
3954        return 0;
3955    }
3956    if (PyUnicode_READY(output) == -1) {
3957        Py_DECREF(output);
3958        return 0;
3959    }
3960    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3961                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3962        PyErr_SetString(PyExc_ValueError, "embedded null character");
3963        Py_DECREF(output);
3964        return 0;
3965    }
3966    *(PyObject**)addr = output;
3967    return Py_CLEANUP_SUPPORTED;
3968}
3969
3970
3971char*
3972PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3973{
3974    PyObject *bytes;
3975
3976    if (!PyUnicode_Check(unicode)) {
3977        PyErr_BadArgument();
3978        return NULL;
3979    }
3980    if (PyUnicode_READY(unicode) == -1)
3981        return NULL;
3982
3983    if (PyUnicode_UTF8(unicode) == NULL) {
3984        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3985        bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3986        if (bytes == NULL)
3987            return NULL;
3988        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3989        if (_PyUnicode_UTF8(unicode) == NULL) {
3990            PyErr_NoMemory();
3991            Py_DECREF(bytes);
3992            return NULL;
3993        }
3994        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3995        memcpy(_PyUnicode_UTF8(unicode),
3996                  PyBytes_AS_STRING(bytes),
3997                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3998        Py_DECREF(bytes);
3999    }
4000
4001    if (psize)
4002        *psize = PyUnicode_UTF8_LENGTH(unicode);
4003    return PyUnicode_UTF8(unicode);
4004}
4005
4006char*
4007PyUnicode_AsUTF8(PyObject *unicode)
4008{
4009    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4010}
4011
/* Return the wchar_t (Py_UNICODE) representation of unicode, building it
   and storing it in the object's wstr slot when that slot is still NULL.

   unicode: must be a str object, otherwise PyErr_BadArgument() is raised.
   size:    if not NULL, receives PyUnicode_WSTR_LENGTH(unicode).

   Returns the buffer stored in the object (not a fresh copy for the
   caller), or NULL with an exception set when unicode is not a str or the
   buffer cannot be allocated.  A buffer built here is null-terminated. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
    /* Only the source pointers needed for this wchar_t width are declared. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points above 0xFFFF, each of
               which needs one extra wchar_t for its surrogate pair. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* Room for every code point, the extra surrogates and the
               terminating null. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting large code points in two. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Debug-build sanity check only; nothing happens when
                   assertions are compiled out. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per code point.  Refuse
               lengths for which the byte count of length + 1 elements
               would exceed PY_SSIZE_T_MAX. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only stored for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 2-byte unit to a 4-byte wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Undo the allocation made above before reporting the
                   fatal error. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
4128
4129Py_UNICODE *
4130PyUnicode_AsUnicode(PyObject *unicode)
4131{
4132    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4133}
4134
4135
4136Py_ssize_t
4137PyUnicode_GetSize(PyObject *unicode)
4138{
4139    if (!PyUnicode_Check(unicode)) {
4140        PyErr_BadArgument();
4141        goto onError;
4142    }
4143    return PyUnicode_GET_SIZE(unicode);
4144
4145  onError:
4146    return -1;
4147}
4148
4149Py_ssize_t
4150PyUnicode_GetLength(PyObject *unicode)
4151{
4152    if (!PyUnicode_Check(unicode)) {
4153        PyErr_BadArgument();
4154        return -1;
4155    }
4156    if (PyUnicode_READY(unicode) == -1)
4157        return -1;
4158    return PyUnicode_GET_LENGTH(unicode);
4159}
4160
4161Py_UCS4
4162PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4163{
4164    void *data;
4165    int kind;
4166
4167    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4168        PyErr_BadArgument();
4169        return (Py_UCS4)-1;
4170    }
4171    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4172        PyErr_SetString(PyExc_IndexError, "string index out of range");
4173        return (Py_UCS4)-1;
4174    }
4175    data = PyUnicode_DATA(unicode);
4176    kind = PyUnicode_KIND(unicode);
4177    return PyUnicode_READ(kind, data, index);
4178}
4179
4180int
4181PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4182{
4183    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4184        PyErr_BadArgument();
4185        return -1;
4186    }
4187    assert(PyUnicode_IS_READY(unicode));
4188    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4189        PyErr_SetString(PyExc_IndexError, "string index out of range");
4190        return -1;
4191    }
4192    if (unicode_check_modifiable(unicode))
4193        return -1;
4194    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4195        PyErr_SetString(PyExc_ValueError, "character out of range");
4196        return -1;
4197    }
4198    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4199                    index, ch);
4200    return 0;
4201}
4202
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified or
   freed by the caller. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4208
4209/* create or adjust a UnicodeDecodeError */
4210static void
4211make_decode_exception(PyObject **exceptionObject,
4212                      const char *encoding,
4213                      const char *input, Py_ssize_t length,
4214                      Py_ssize_t startpos, Py_ssize_t endpos,
4215                      const char *reason)
4216{
4217    if (*exceptionObject == NULL) {
4218        *exceptionObject = PyUnicodeDecodeError_Create(
4219            encoding, input, length, startpos, endpos, reason);
4220    }
4221    else {
4222        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4223            goto onError;
4224        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4225            goto onError;
4226        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4227            goto onError;
4228    }
4229    return;
4230
4231onError:
4232    Py_CLEAR(*exceptionObject);
4233}
4234
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors, errorHandler: name of the error handler and a cache slot for the
       looked-up callable; the lookup happens only while the slot is NULL.
   encoding, reason: stored in the UnicodeDecodeError passed to the handler.
   input, inend: start and end of the input bytes; both are re-pointed at
       the bytes object held by the exception after the handler returns.
   startinpos, endinpos: offsets of the offending bytes; *endinpos is set
       to the position returned by the handler.
   exceptionObject: cache slot for the exception (created or updated here).
   inptr: set to the place in the input where decoding should resume.
   output, outpos: wchar-kind output string (resized if needed) and the
       current write offset, advanced by the replacement length.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple format; the text after "O!n;" (offset 4) doubles as
       the error message used when the handler result is not a tuple. */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Bail out instead of treating a non-bytes object as bytes below;
           the reference obtained above has to be released first. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow to at least twice the old size when that cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  memcpy() is used rather than
       wcsncpy() because the latter stops at the first null character and
       zero-fills the rest, which would lose everything that follows an
       embedded null in the replacement string. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
4350
4351static int
4352unicode_decode_call_errorhandler_writer(
4353    const char *errors, PyObject **errorHandler,
4354    const char *encoding, const char *reason,
4355    const char **input, const char **inend, Py_ssize_t *startinpos,
4356    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4357    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4358{
4359    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4360
4361    PyObject *restuple = NULL;
4362    PyObject *repunicode = NULL;
4363    Py_ssize_t insize;
4364    Py_ssize_t newpos;
4365    Py_ssize_t replen;
4366    PyObject *inputobj = NULL;
4367
4368    if (*errorHandler == NULL) {
4369        *errorHandler = PyCodec_LookupError(errors);
4370        if (*errorHandler == NULL)
4371            goto onError;
4372    }
4373
4374    make_decode_exception(exceptionObject,
4375        encoding,
4376        *input, *inend - *input,
4377        *startinpos, *endinpos,
4378        reason);
4379    if (*exceptionObject == NULL)
4380        goto onError;
4381
4382    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4383    if (restuple == NULL)
4384        goto onError;
4385    if (!PyTuple_Check(restuple)) {
4386        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4387        goto onError;
4388    }
4389    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4390        goto onError;
4391
4392    /* Copy back the bytes variables, which might have been modified by the
4393       callback */
4394    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4395    if (!inputobj)
4396        goto onError;
4397    if (!PyBytes_Check(inputobj)) {
4398        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4399    }
4400    *input = PyBytes_AS_STRING(inputobj);
4401    insize = PyBytes_GET_SIZE(inputobj);
4402    *inend = *input + insize;
4403    /* we can DECREF safely, as the exception has another reference,
4404       so the object won't go away. */
4405    Py_DECREF(inputobj);
4406
4407    if (newpos<0)
4408        newpos = insize+newpos;
4409    if (newpos<0 || newpos>insize) {
4410        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4411        goto onError;
4412    }
4413
4414    if (PyUnicode_READY(repunicode) < 0)
4415        goto onError;
4416    replen = PyUnicode_GET_LENGTH(repunicode);
4417    if (replen > 1) {
4418        writer->min_length += replen - 1;
4419        writer->overallocate = 1;
4420        if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4421                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4422            goto onError;
4423    }
4424    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4425        goto onError;
4426
4427    *endinpos = newpos;
4428    *inptr = *input + newpos;
4429
4430    /* we made it! */
4431    Py_XDECREF(restuple);
4432    return 0;
4433
4434  onError:
4435    Py_XDECREF(restuple);
4436    return -1;
4437}
4438
4439/* --- UTF-7 Codec -------------------------------------------------------- */
4440
4441/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4442
/* Three simple macros defining base-64, followed by DECODE_DIRECT.
   All of them may evaluate their argument more than once, so the argument
   must not have side effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62 and anything
   else to 63; the argument is not validated here, so callers are expected
   to check IS_BASE64(c) first. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off before the table lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 never decode directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4473
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0-127) and is only ever read,
 * hence it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        the character to classify; 0 and anything >= 128 are never
 *           direct (the range test also guards the table lookup).
 * directO:  non-zero if Set O characters may be written as themselves.
 * directWS: non-zero if whitespace may be written as itself.
 *
 * The macro evaluates c several times, so c must have no side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4519
4520PyObject *
4521PyUnicode_DecodeUTF7(const char *s,
4522                     Py_ssize_t size,
4523                     const char *errors)
4524{
4525    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4526}
4527
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  the UTF-7 encoded input bytes and their count.
 * errors:   name of the error handler, passed on to
 *           unicode_decode_call_errorhandler_writer().
 * consumed: if not NULL, receives the number of input bytes that were
 *           decoded; an unfinished shift sequence at the end of the input
 *           is then left unconsumed instead of being treated as an error.
 *
 * Returns the decoded string, or NULL with an exception set on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    /* Non-zero while inside a '+'...  base-64 section. */
    int inShift = 0;
    /* Writer position at which the current base-64 section started. */
    Py_ssize_t shiftOutStart;
    /* Number of not yet consumed bits held in base64buffer. */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* Pending high surrogate, or 0 if there is none. */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Also entered by a goto from the end-of-string handling below,
           after an error handler moved the read position back. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* No low surrogate followed: write the pending
                               high surrogate on its own and go on to handle
                               outCh below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating character decodes as itself; in all cases
                   it is forgotten afterwards. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where this sequence starts in the input; used for
               error reporting and for *consumed. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* Neither '+' nor directly decodable, i.e. a byte above 127. */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Report the range [startinpos, endinpos) to the error handler,
           which may change starts, e and s. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have set the read position before the end
               of the input; if so, resume decoding inside the loop. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Unfinished shift sequence: report only the input before its
               '+' as consumed. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Output was already produced for this sequence and the
                   writer holds non-ASCII data: build the result directly
                   from the characters written before the sequence.
                   NOTE(review): presumably this avoids finishing the writer
                   with a maxchar that only the discarded characters
                   required -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4726
4727
4728PyObject *
4729_PyUnicode_EncodeUTF7(PyObject *str,
4730                      int base64SetO,
4731                      int base64WhiteSpace,
4732                      const char *errors)
4733{
4734    int kind;
4735    void *data;
4736    Py_ssize_t len;
4737    PyObject *v;
4738    int inShift = 0;
4739    Py_ssize_t i;
4740    unsigned int base64bits = 0;
4741    unsigned long base64buffer = 0;
4742    char * out;
4743    char * start;
4744
4745    if (PyUnicode_READY(str) == -1)
4746        return NULL;
4747    kind = PyUnicode_KIND(str);
4748    data = PyUnicode_DATA(str);
4749    len = PyUnicode_GET_LENGTH(str);
4750
4751    if (len == 0)
4752        return PyBytes_FromStringAndSize(NULL, 0);
4753
4754    /* It might be possible to tighten this worst case */
4755    if (len > PY_SSIZE_T_MAX / 8)
4756        return PyErr_NoMemory();
4757    v = PyBytes_FromStringAndSize(NULL, len * 8);
4758    if (v == NULL)
4759        return NULL;
4760
4761    start = out = PyBytes_AS_STRING(v);
4762    for (i = 0; i < len; ++i) {
4763        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4764
4765        if (inShift) {
4766            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4767                /* shifting out */
4768                if (base64bits) { /* output remaining bits */
4769                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4770                    base64buffer = 0;
4771                    base64bits = 0;
4772                }
4773                inShift = 0;
4774                /* Characters not in the BASE64 set implicitly unshift the sequence
4775                   so no '-' is required, except if the character is itself a '-' */
4776                if (IS_BASE64(ch) || ch == '-') {
4777                    *out++ = '-';
4778                }
4779                *out++ = (char) ch;
4780            }
4781            else {
4782                goto encode_char;
4783            }
4784        }
4785        else { /* not in a shift sequence */
4786            if (ch == '+') {
4787                *out++ = '+';
4788                        *out++ = '-';
4789            }
4790            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4791                *out++ = (char) ch;
4792            }
4793            else {
4794                *out++ = '+';
4795                inShift = 1;
4796                goto encode_char;
4797            }
4798        }
4799        continue;
4800encode_char:
4801        if (ch >= 0x10000) {
4802            assert(ch <= MAX_UNICODE);
4803
4804            /* code first surrogate */
4805            base64bits += 16;
4806            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4807            while (base64bits >= 6) {
4808                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4809                base64bits -= 6;
4810            }
4811            /* prepare second surrogate */
4812            ch = Py_UNICODE_LOW_SURROGATE(ch);
4813        }
4814        base64bits += 16;
4815        base64buffer = (base64buffer << 16) | ch;
4816        while (base64bits >= 6) {
4817            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4818            base64bits -= 6;
4819        }
4820    }
4821    if (base64bits)
4822        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4823    if (inShift)
4824        *out++ = '-';
4825    if (_PyBytes_Resize(&v, out - start) < 0)
4826        return NULL;
4827    return v;
4828}
4829PyObject *
4830PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4831                     Py_ssize_t size,
4832                     int base64SetO,
4833                     int base64WhiteSpace,
4834                     const char *errors)
4835{
4836    PyObject *result;
4837    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4838    if (tmp == NULL)
4839        return NULL;
4840    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4841                                   base64WhiteSpace, errors);
4842    Py_DECREF(tmp);
4843    return result;
4844}
4845
4846#undef IS_BASE64
4847#undef FROM_BASE64
4848#undef TO_BASE64
4849#undef DECODE_DIRECT
4850#undef ENCODE_DIRECT
4851
4852/* --- UTF-8 Codec -------------------------------------------------------- */
4853
4854PyObject *
4855PyUnicode_DecodeUTF8(const char *s,
4856                     Py_ssize_t size,
4857                     const char *errors)
4858{
4859    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4860}
4861
4862#include "stringlib/asciilib.h"
4863#include "stringlib/codecs.h"
4864#include "stringlib/undef.h"
4865
4866#include "stringlib/ucs1lib.h"
4867#include "stringlib/codecs.h"
4868#include "stringlib/undef.h"
4869
4870#include "stringlib/ucs2lib.h"
4871#include "stringlib/codecs.h"
4872#include "stringlib/undef.h"
4873
4874#include "stringlib/ucs4lib.h"
4875#include "stringlib/codecs.h"
4876#include "stringlib/undef.h"
4877
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.

   The mask has the top bit (0x80) of every byte set, so
   (word & ASCII_CHAR_MASK) is non-zero exactly when at least one byte of
   the word is >= 0x80.  Only 4- and 8-byte longs are supported; any other
   size is rejected at compile time. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4887
4888static Py_ssize_t
4889ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4890{
4891    const char *p = start;
4892    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4893
4894    /*
4895     * Issue #17237: m68k is a bit different from most architectures in
4896     * that objects do not use "natural alignment" - for example, int and
4897     * long are only aligned at 2-byte boundaries.  Therefore the assert()
4898     * won't work; also, tests have shown that skipping the "optimised
4899     * version" will even speed up m68k.
4900     */
4901#if !defined(__m68k__)
4902#if SIZEOF_LONG <= SIZEOF_VOID_P
4903    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4904    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4905        /* Fast path, see in STRINGLIB(utf8_decode) for
4906           an explanation. */
4907        /* Help allocation */
4908        const char *_p = p;
4909        Py_UCS1 * q = dest;
4910        while (_p < aligned_end) {
4911            unsigned long value = *(const unsigned long *) _p;
4912            if (value & ASCII_CHAR_MASK)
4913                break;
4914            *((unsigned long *)q) = value;
4915            _p += SIZEOF_LONG;
4916            q += SIZEOF_LONG;
4917        }
4918        p = _p;
4919        while (p < end) {
4920            if ((unsigned char)*p & 0x80)
4921                break;
4922            *q++ = *p++;
4923        }
4924        return p - start;
4925    }
4926#endif
4927#endif
4928    while (p < end) {
4929        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4930           for an explanation. */
4931        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4932            /* Help allocation */
4933            const char *_p = p;
4934            while (_p < aligned_end) {
4935                unsigned long value = *(unsigned long *) _p;
4936                if (value & ASCII_CHAR_MASK)
4937                    break;
4938                _p += SIZEOF_LONG;
4939            }
4940            p = _p;
4941            if (_p == end)
4942                break;
4943        }
4944        if ((unsigned char)*p & 0x80)
4945            break;
4946        ++p;
4947    }
4948    memcpy(dest, start, p - start);
4949    return p - start;
4950}
4951
/* Decode the `size` bytes of UTF-8 starting at `s` into a new str object.

   s, size:  input buffer and its length in bytes.
   errors:   error handler name.  It is only resolved (through
             get_error_handler()) when the first decoding error is met.
   consumed: if non-NULL, receives the number of input bytes that were
             processed.  In that mode decoding stops quietly when the
             low-level decoder returns status 0 before the end of the
             input, instead of raising "unexpected end of data".

   Returns the object produced by _PyUnicodeWriter_Finish(), or NULL after
   releasing the writer if any step fails. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    /* Empty input: nothing is consumed and the empty string is returned. */
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    /* Ask the writer for room for `size` characters with a maximum
       character of 127 to start with. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* Copy the leading pure-ASCII run in one go; ascii_decode() returns the
       number of bytes it copied, which is also the number of characters
       written. */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Use the stringlib decoder that matches the writer's current
           storage kind.  It advances s and writer.pos itself. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* Return values 0..4 are status codes; any other value is a
           character that is appended through the writer. */
        switch (ch) {
        case 0:
            /* Done if all input was used, or if the caller tracks
               `consumed`; otherwise the rest of the input is reported as
               an error. */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            /* the error span is the single byte at s */
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            /* the error span covers ch - 1 bytes starting at s */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* Resolve the handler name once, on the first error. */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = get_error_handler(errors);

        /* ignore, replace and surrogateescape are handled inline; every
           other handler goes through the generic callback. */
        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* skip the offending bytes */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* one U+FFFD for the whole offending span */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* Each offending byte b is written as 0xdc00 + b, which needs
               at least 2-byte storage. */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* The callback receives pointers to starts, end and s, so it
               may update them. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
5085
5086#if defined(__APPLE__) || defined(__ANDROID__)
5087
5088/* Simplified UTF-8 decoder using surrogateescape error handler,
5089   used to decode the command line arguments on Mac OS X and Android.
5090
5091   Return a pointer to a newly allocated wide character string (use
5092   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
5093
5094wchar_t*
5095_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5096{
5097    const char *e;
5098    wchar_t *unicode;
5099    Py_ssize_t outpos;
5100
5101    /* Note: size will always be longer than the resulting Unicode
5102       character count */
5103    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
5104        return NULL;
5105    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5106    if (!unicode)
5107        return NULL;
5108
5109    /* Unpack UTF-8 encoded data */
5110    e = s + size;
5111    outpos = 0;
5112    while (s < e) {
5113        Py_UCS4 ch;
5114#if SIZEOF_WCHAR_T == 4
5115        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5116#else
5117        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5118#endif
5119        if (ch > 0xFF) {
5120#if SIZEOF_WCHAR_T == 4
5121            assert(0);
5122#else
5123            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5124            /*  compute and append the two surrogates: */
5125            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5126            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5127#endif
5128        }
5129        else {
5130            if (!ch && s == e)
5131                break;
5132            /* surrogateescape */
5133            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5134        }
5135    }
5136    unicode[outpos] = L'\0';
5137    return unicode;
5138}
5139
5140#endif /* __APPLE__ or __ANDROID__ */
5141
5142/* Primary internal function which creates utf8 encoded bytes objects.
5143
5144   Allocation strategy:  if the string is short, convert into a stack buffer
5145   and allocate exactly as much space needed at the end.  Else allocate the
5146   maximum possible needed (4 result bytes per Unicode character), and return
5147   the excess memory at the end.
5148*/
5149PyObject *
5150_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5151{
5152    enum PyUnicode_Kind kind;
5153    void *data;
5154    Py_ssize_t size;
5155
5156    if (!PyUnicode_Check(unicode)) {
5157        PyErr_BadArgument();
5158        return NULL;
5159    }
5160
5161    if (PyUnicode_READY(unicode) == -1)
5162        return NULL;
5163
5164    if (PyUnicode_UTF8(unicode))
5165        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5166                                         PyUnicode_UTF8_LENGTH(unicode));
5167
5168    kind = PyUnicode_KIND(unicode);
5169    data = PyUnicode_DATA(unicode);
5170    size = PyUnicode_GET_LENGTH(unicode);
5171
5172    switch (kind) {
5173    default:
5174        assert(0);
5175    case PyUnicode_1BYTE_KIND:
5176        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5177        assert(!PyUnicode_IS_ASCII(unicode));
5178        return ucs1lib_utf8_encoder(unicode, data, size, errors);
5179    case PyUnicode_2BYTE_KIND:
5180        return ucs2lib_utf8_encoder(unicode, data, size, errors);
5181    case PyUnicode_4BYTE_KIND:
5182        return ucs4lib_utf8_encoder(unicode, data, size, errors);
5183    }
5184}
5185
5186PyObject *
5187PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5188                     Py_ssize_t size,
5189                     const char *errors)
5190{
5191    PyObject *v, *unicode;
5192
5193    unicode = PyUnicode_FromUnicode(s, size);
5194    if (unicode == NULL)
5195        return NULL;
5196    v = _PyUnicode_AsUTF8String(unicode, errors);
5197    Py_DECREF(unicode);
5198    return v;
5199}
5200
5201PyObject *
5202PyUnicode_AsUTF8String(PyObject *unicode)
5203{
5204    return _PyUnicode_AsUTF8String(unicode, NULL);
5205}
5206
5207/* --- UTF-32 Codec ------------------------------------------------------- */
5208
5209PyObject *
5210PyUnicode_DecodeUTF32(const char *s,
5211                      Py_ssize_t size,
5212                      const char *errors,
5213                      int *byteorder)
5214{
5215    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5216}
5217
/* Decode the `size` bytes of UTF-32 starting at `s` into a new str object.

   errors:    error handler name, forwarded to the decode error callback.
   byteorder: optional in/out parameter.  On input *byteorder < 0 selects
              little endian, > 0 big endian and 0 (or a NULL pointer) native
              order with BOM detection.  When BOM detection runs, the order
              that was found (-1, 1, or 0 if no BOM was present) is stored
              back into *byteorder.
   consumed:  if non-NULL, receives the number of input bytes processed;
              trailing bytes that do not form a whole 4-byte unit are then
              left unconsumed instead of raising "truncated data".

   Returns the object produced by _PyUnicodeWriter_Finish(), or NULL after
   releasing the writer if any step fails. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* the first four bytes, assembled as a little-endian value */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (left) to decode: all input counts as consumed. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* le: decode as little endian?  With bo == 0 (no BOM found) the
       platform's own byte order is used. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* One output character per (possibly partial) 4-byte input unit. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast path: store code points directly into the buffer for as
               long as they fit its current maximum character and are not
               surrogates.  When the loop stops early, ch holds the value
               that could not be stored and q still points at it. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Work out why the fast path stopped (or never ran). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* Every whole unit has been stored; fewer than 4 bytes are
               left. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch is larger than the buffer's current maximum character:
               a value below 0x110000 is appended through the writer,
               anything else is out of range. */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5362
5363PyObject *
5364_PyUnicode_EncodeUTF32(PyObject *str,
5365                       const char *errors,
5366                       int byteorder)
5367{
5368    enum PyUnicode_Kind kind;
5369    const void *data;
5370    Py_ssize_t len;
5371    PyObject *v;
5372    uint32_t *out;
5373#if PY_LITTLE_ENDIAN
5374    int native_ordering = byteorder <= 0;
5375#else
5376    int native_ordering = byteorder >= 0;
5377#endif
5378    const char *encoding;
5379    Py_ssize_t nsize, pos;
5380    PyObject *errorHandler = NULL;
5381    PyObject *exc = NULL;
5382    PyObject *rep = NULL;
5383
5384    if (!PyUnicode_Check(str)) {
5385        PyErr_BadArgument();
5386        return NULL;
5387    }
5388    if (PyUnicode_READY(str) == -1)
5389        return NULL;
5390    kind = PyUnicode_KIND(str);
5391    data = PyUnicode_DATA(str);
5392    len = PyUnicode_GET_LENGTH(str);
5393
5394    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5395        return PyErr_NoMemory();
5396    nsize = len + (byteorder == 0);
5397    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5398    if (v == NULL)
5399        return NULL;
5400
5401    /* output buffer is 4-bytes aligned */
5402    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5403    out = (uint32_t *)PyBytes_AS_STRING(v);
5404    if (byteorder == 0)
5405        *out++ = 0xFEFF;
5406    if (len == 0)
5407        goto done;
5408
5409    if (byteorder == -1)
5410        encoding = "utf-32-le";
5411    else if (byteorder == 1)
5412        encoding = "utf-32-be";
5413    else
5414        encoding = "utf-32";
5415
5416    if (kind == PyUnicode_1BYTE_KIND) {
5417        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5418        goto done;
5419    }
5420
5421    pos = 0;
5422    while (pos < len) {
5423        Py_ssize_t repsize, moreunits;
5424
5425        if (kind == PyUnicode_2BYTE_KIND) {
5426            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5427                                        &out, native_ordering);
5428        }
5429        else {
5430            assert(kind == PyUnicode_4BYTE_KIND);
5431            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5432                                        &out, native_ordering);
5433        }
5434        if (pos == len)
5435            break;
5436
5437        rep = unicode_encode_call_errorhandler(
5438                errors, &errorHandler,
5439                encoding, "surrogates not allowed",
5440                str, &exc, pos, pos + 1, &pos);
5441        if (!rep)
5442            goto error;
5443
5444        if (PyBytes_Check(rep)) {
5445            repsize = PyBytes_GET_SIZE(rep);
5446            if (repsize & 3) {
5447                raise_encode_exception(&exc, encoding,
5448                                       str, pos - 1, pos,
5449                                       "surrogates not allowed");
5450                goto error;
5451            }
5452            moreunits = repsize / 4;
5453        }
5454        else {
5455            assert(PyUnicode_Check(rep));
5456            if (PyUnicode_READY(rep) < 0)
5457                goto error;
5458            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5459            if (!PyUnicode_IS_ASCII(rep)) {
5460                raise_encode_exception(&exc, encoding,
5461                                       str, pos - 1, pos,
5462                                       "surrogates not allowed");
5463                goto error;
5464            }
5465        }
5466
5467        /* four bytes are reserved for each surrogate */
5468        if (moreunits > 1) {
5469            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5470            Py_ssize_t morebytes = 4 * (moreunits - 1);
5471            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5472                /* integer overflow */
5473                PyErr_NoMemory();
5474                goto error;
5475            }
5476            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5477                goto error;
5478            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5479        }
5480
5481        if (PyBytes_Check(rep)) {
5482            memcpy(out, PyBytes_AS_STRING(rep), repsize);
5483            out += moreunits;
5484        } else /* rep is unicode */ {
5485            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5486            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5487                                 &out, native_ordering);
5488        }
5489
5490        Py_CLEAR(rep);
5491    }
5492
5493    /* Cut back to size actually needed. This is necessary for, for example,
5494       encoding of a string containing isolated surrogates and the 'ignore'
5495       handler is used. */
5496    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5497    if (nsize != PyBytes_GET_SIZE(v))
5498      _PyBytes_Resize(&v, nsize);
5499    Py_XDECREF(errorHandler);
5500    Py_XDECREF(exc);
5501  done:
5502    return v;
5503  error:
5504    Py_XDECREF(rep);
5505    Py_XDECREF(errorHandler);
5506    Py_XDECREF(exc);
5507    Py_XDECREF(v);
5508    return NULL;
5509}
5510
5511PyObject *
5512PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5513                      Py_ssize_t size,
5514                      const char *errors,
5515                      int byteorder)
5516{
5517    PyObject *result;
5518    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5519    if (tmp == NULL)
5520        return NULL;
5521    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5522    Py_DECREF(tmp);
5523    return result;
5524}
5525
5526PyObject *
5527PyUnicode_AsUTF32String(PyObject *unicode)
5528{
5529    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5530}
5531
5532/* --- UTF-16 Codec ------------------------------------------------------- */
5533
5534PyObject *
5535PyUnicode_DecodeUTF16(const char *s,
5536                      Py_ssize_t size,
5537                      const char *errors,
5538                      int *byteorder)
5539{
5540    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5541}
5542
/* Decode the `size` bytes of UTF-16 starting at `s` into a new str object.

   errors:    error handler name, forwarded to the decode error callback.
   byteorder: optional in/out parameter.  On input *byteorder < 0 selects
              little endian, > 0 big endian and 0 (or a NULL pointer) native
              order with BOM detection.  When BOM detection runs, the order
              that was found (-1, 1, or 0 if no BOM was present) is stored
              back into *byteorder.
   consumed:  if non-NULL, receives the number of input bytes processed;
              status codes 0 and 1 from the low-level decoder then end
              decoding quietly instead of raising an error.

   Returns the object produced by _PyUnicodeWriter_Finish(), or NULL after
   releasing the writer if any step fails. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* the first two bytes, assembled as a little-endian value */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (left) to decode: all input counts as consumed. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* native_ordering: does the selected byte order match the platform's?
       With bo == 0 (no BOM found) the platform's own order is used. */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        /* ch stays 0 when fewer than 2 bytes are left, which takes the
           "case 0" branch below. */
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Use the stringlib decoder that matches the writer's current
               storage kind.  It advances q and writer.pos itself. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        /* Return values 0..3 are status codes; any other value is a
           character that is appended through the writer. */
        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* Step q back by one 16-bit unit so that it is left
               unconsumed (or becomes the start of the error span). */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* the error span is the 2 bytes just before q */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* the error span is the first 2 of the 4 bytes before q */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* The callback receives pointers to starts, e and q, so it may
           update them. */
        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5696
/* Encode the str object `str` as UTF-16 and return a new bytes object.

   byteorder < 0 selects "utf-16-le", byteorder > 0 selects "utf-16-be",
   and byteorder == 0 selects "utf-16": native byte order with a leading
   0xFEFF byte order mark.

   `errors` names the error handler invoked when the stringlib encoder stops
   before the end of the input (reported as "surrogates not allowed").

   Returns NULL with an exception set on failure. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    unsigned short *out;
    Py_ssize_t pairs;
    /* native_ordering is true when the requested order matches the host's;
       byteorder == 0 is native on both kinds of host. */
#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Count the characters above U+FFFF: each one needs a second 16-bit
       unit in the output (see nsize below).  Only 4-byte strings can
       contain them. */
    pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *in = (const Py_UCS4 *)data;
        const Py_UCS4 *end = in + len;
        while (in < end) {
            if (*in++ >= 0x10000) {
                pairs++;
            }
        }
    }
    /* Reject sizes for which nsize * 2 below would overflow Py_ssize_t. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    /* nsize is counted in 16-bit units; one extra unit holds the BOM. */
    nsize = len + pairs + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
    out = (unsigned short *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    /* The early exits to `done` skip the Py_XDECREF calls further down;
       errorHandler and exc are still NULL at these points. */
    if (len == 0) {
        goto done;
    }

    if (kind == PyUnicode_1BYTE_KIND) {
        /* The return value is not inspected here: the whole input is taken
           to be encoded in a single call.
           NOTE(review): presumably because 1-byte data cannot contain
           surrogates -- confirm against the stringlib encoder. */
        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    /* Codec name used in error reports. */
    if (byteorder < 0) {
        encoding = "utf-16-le";
    }
    else if (byteorder > 0) {
        encoding = "utf-16-be";
    }
    else {
        encoding = "utf-16";
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t repsize, moreunits;

        /* The stringlib encoder advances `out` and returns how many input
           characters it consumed; a result short of `len` means it stopped
           at a character it could not encode. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* Ask the error handler for a replacement for the single character
           at `pos`; the handler also updates `pos` to the resume position. */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &pos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            /* A bytes replacement is copied verbatim, so it must consist of
               whole 16-bit units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            /* A str replacement must be pure ASCII; it is encoded with the
               1-byte encoder below, one unit per character. */
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       "surrogates not allowed");
                goto error;
            }
        }

        /* two bytes are reserved for each surrogate */
        /* ...so the buffer only has to grow when the replacement needs more
           than one unit.  `out` is re-derived afterwards because the resize
           may move the bytes object. */
        if (moreunits > 1) {
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            Py_ssize_t morebytes = 2 * (moreunits - 1);
            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
                goto error;
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += moreunits;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
    encoding of a string containing isolated surrogates and the 'ignore' handler
    is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* The result of the resize is not checked here; `v` is returned
       as-is whatever the outcome. */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
    /* NOTE(review): no STORECHAR definition is visible in this function;
       this #undef looks like a leftover. */
#undef STORECHAR
}
5863
5864PyObject *
5865PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5866                      Py_ssize_t size,
5867                      const char *errors,
5868                      int byteorder)
5869{
5870    PyObject *result;
5871    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5872    if (tmp == NULL)
5873        return NULL;
5874    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5875    Py_DECREF(tmp);
5876    return result;
5877}
5878
5879PyObject *
5880PyUnicode_AsUTF16String(PyObject *unicode)
5881{
5882    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5883}
5884
5885/* --- Unicode Escape Codec ----------------------------------------------- */
5886
/* Character-name lookup API used for \N{name} escapes.  It stays NULL until
   _PyUnicode_DecodeUnicodeEscape() meets its first \N escape and imports the
   capsule named by PyUnicodeData_CAPSULE_NAME. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5888
5889PyObject *
5890_PyUnicode_DecodeUnicodeEscape(const char *s,
5891                               Py_ssize_t size,
5892                               const char *errors,
5893                               const char **first_invalid_escape)
5894{
5895    const char *starts = s;
5896    _PyUnicodeWriter writer;
5897    const char *end;
5898    PyObject *errorHandler = NULL;
5899    PyObject *exc = NULL;
5900
5901    // so we can remember if we've seen an invalid escape char or not
5902    *first_invalid_escape = NULL;
5903
5904    if (size == 0) {
5905        _Py_RETURN_UNICODE_EMPTY();
5906    }
5907    /* Escaped strings will always be longer than the resulting
5908       Unicode string, so we start with size here and then reduce the
5909       length after conversion to the true value.
5910       (but if the error callback returns a long replacement string
5911       we'll have to allocate more space) */
5912    _PyUnicodeWriter_Init(&writer);
5913    writer.min_length = size;
5914    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5915        goto onError;
5916    }
5917
5918    end = s + size;
5919    while (s < end) {
5920        unsigned char c = (unsigned char) *s++;
5921        Py_UCS4 ch;
5922        int count;
5923        Py_ssize_t startinpos;
5924        Py_ssize_t endinpos;
5925        const char *message;
5926
5927#define WRITE_ASCII_CHAR(ch)                                                  \
5928            do {                                                              \
5929                assert(ch <= 127);                                            \
5930                assert(writer.pos < writer.size);                             \
5931                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
5932            } while(0)
5933
5934#define WRITE_CHAR(ch)                                                        \
5935            do {                                                              \
5936                if (ch <= writer.maxchar) {                                   \
5937                    assert(writer.pos < writer.size);                         \
5938                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5939                }                                                             \
5940                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5941                    goto onError;                                             \
5942                }                                                             \
5943            } while(0)
5944
5945        /* Non-escape characters are interpreted as Unicode ordinals */
5946        if (c != '\\') {
5947            WRITE_CHAR(c);
5948            continue;
5949        }
5950
5951        startinpos = s - starts - 1;
5952        /* \ - Escapes */
5953        if (s >= end) {
5954            message = "\\ at end of string";
5955            goto error;
5956        }
5957        c = (unsigned char) *s++;
5958
5959        assert(writer.pos < writer.size);
5960        switch (c) {
5961
5962            /* \x escapes */
5963        case '\n': continue;
5964        case '\\': WRITE_ASCII_CHAR('\\'); continue;
5965        case '\'': WRITE_ASCII_CHAR('\''); continue;
5966        case '\"': WRITE_ASCII_CHAR('\"'); continue;
5967        case 'b': WRITE_ASCII_CHAR('\b'); continue;
5968        /* FF */
5969        case 'f': WRITE_ASCII_CHAR('\014'); continue;
5970        case 't': WRITE_ASCII_CHAR('\t'); continue;
5971        case 'n': WRITE_ASCII_CHAR('\n'); continue;
5972        case 'r': WRITE_ASCII_CHAR('\r'); continue;
5973        /* VT */
5974        case 'v': WRITE_ASCII_CHAR('\013'); continue;
5975        /* BEL, not classic C */
5976        case 'a': WRITE_ASCII_CHAR('\007'); continue;
5977
5978            /* \OOO (octal) escapes */
5979        case '0': case '1': case '2': case '3':
5980        case '4': case '5': case '6': case '7':
5981            ch = c - '0';
5982            if (s < end && '0' <= *s && *s <= '7') {
5983                ch = (ch<<3) + *s++ - '0';
5984                if (s < end && '0' <= *s && *s <= '7') {
5985                    ch = (ch<<3) + *s++ - '0';
5986                }
5987            }
5988            WRITE_CHAR(ch);
5989            continue;
5990
5991            /* hex escapes */
5992            /* \xXX */
5993        case 'x':
5994            count = 2;
5995            message = "truncated \\xXX escape";
5996            goto hexescape;
5997
5998            /* \uXXXX */
5999        case 'u':
6000            count = 4;
6001            message = "truncated \\uXXXX escape";
6002            goto hexescape;
6003
6004            /* \UXXXXXXXX */
6005        case 'U':
6006            count = 8;
6007            message = "truncated \\UXXXXXXXX escape";
6008        hexescape:
6009            for (ch = 0; count && s < end; ++s, --count) {
6010                c = (unsigned char)*s;
6011                ch <<= 4;
6012                if (c >= '0' && c <= '9') {
6013                    ch += c - '0';
6014                }
6015                else if (c >= 'a' && c <= 'f') {
6016                    ch += c - ('a' - 10);
6017                }
6018                else if (c >= 'A' && c <= 'F') {
6019                    ch += c - ('A' - 10);
6020                }
6021                else {
6022                    break;
6023                }
6024            }
6025            if (count) {
6026                goto error;
6027            }
6028
6029            /* when we get here, ch is a 32-bit unicode character */
6030            if (ch > MAX_UNICODE) {
6031                message = "illegal Unicode character";
6032                goto error;
6033            }
6034
6035            WRITE_CHAR(ch);
6036            continue;
6037
6038            /* \N{name} */
6039        case 'N':
6040            if (ucnhash_CAPI == NULL) {
6041                /* load the unicode data module */
6042                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6043                                                PyUnicodeData_CAPSULE_NAME, 1);
6044                if (ucnhash_CAPI == NULL) {
6045                    PyErr_SetString(
6046                        PyExc_UnicodeError,
6047                        "\\N escapes not supported (can't load unicodedata module)"
6048                        );
6049                    goto onError;
6050                }
6051            }
6052
6053            message = "malformed \\N character escape";
6054            if (*s == '{') {
6055                const char *start = ++s;
6056                size_t namelen;
6057                /* look for the closing brace */
6058                while (s < end && *s != '}')
6059                    s++;
6060                namelen = s - start;
6061                if (namelen && s < end) {
6062                    /* found a name.  look it up in the unicode database */
6063                    s++;
6064                    ch = 0xffffffff; /* in case 'getcode' messes up */
6065                    if (namelen <= INT_MAX &&
6066                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6067                                              &ch, 0)) {
6068                        assert(ch <= MAX_UNICODE);
6069                        WRITE_CHAR(ch);
6070                        continue;
6071                    }
6072                    message = "unknown Unicode character name";
6073                }
6074            }
6075            goto error;
6076
6077        default:
6078            if (*first_invalid_escape == NULL) {
6079                *first_invalid_escape = s-1; /* Back up one char, since we've
6080                                                already incremented s. */
6081            }
6082            WRITE_ASCII_CHAR('\\');
6083            WRITE_CHAR(c);
6084            continue;
6085        }
6086
6087      error:
6088        endinpos = s-starts;
6089        writer.min_length = end - s + writer.pos;
6090        if (unicode_decode_call_errorhandler_writer(
6091                errors, &errorHandler,
6092                "unicodeescape", message,
6093                &starts, &end, &startinpos, &endinpos, &exc, &s,
6094                &writer)) {
6095            goto onError;
6096        }
6097        if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6098            goto onError;
6099        }
6100
6101#undef WRITE_ASCII_CHAR
6102#undef WRITE_CHAR
6103    }
6104
6105    Py_XDECREF(errorHandler);
6106    Py_XDECREF(exc);
6107    return _PyUnicodeWriter_Finish(&writer);
6108
6109  onError:
6110    _PyUnicodeWriter_Dealloc(&writer);
6111    Py_XDECREF(errorHandler);
6112    Py_XDECREF(exc);
6113    return NULL;
6114}
6115
6116PyObject *
6117PyUnicode_DecodeUnicodeEscape(const char *s,
6118                              Py_ssize_t size,
6119                              const char *errors)
6120{
6121    const char *first_invalid_escape;
6122    PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6123                                                      &first_invalid_escape);
6124    if (result == NULL)
6125        return NULL;
6126    if (first_invalid_escape != NULL) {
6127        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6128                             "invalid escape sequence '\\%c'",
6129                             *first_invalid_escape) < 0) {
6130            Py_DECREF(result);
6131            return NULL;
6132        }
6133    }
6134    return result;
6135}
6136
6137/* Return a Unicode-Escape string version of the Unicode object. */
6138
6139PyObject *
6140PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6141{
6142    Py_ssize_t i, len;
6143    PyObject *repr;
6144    char *p;
6145    enum PyUnicode_Kind kind;
6146    void *data;
6147    Py_ssize_t expandsize;
6148
6149    /* Initial allocation is based on the longest-possible character
6150       escape.
6151
6152       For UCS1 strings it's '\xxx', 4 bytes per source character.
6153       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6154       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6155    */
6156
6157    if (!PyUnicode_Check(unicode)) {
6158        PyErr_BadArgument();
6159        return NULL;
6160    }
6161    if (PyUnicode_READY(unicode) == -1) {
6162        return NULL;
6163    }
6164
6165    len = PyUnicode_GET_LENGTH(unicode);
6166    if (len == 0) {
6167        return PyBytes_FromStringAndSize(NULL, 0);
6168    }
6169
6170    kind = PyUnicode_KIND(unicode);
6171    data = PyUnicode_DATA(unicode);
6172    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6173       bytes, and 1 byte characters 4. */
6174    expandsize = kind * 2 + 2;
6175    if (len > PY_SSIZE_T_MAX / expandsize) {
6176        return PyErr_NoMemory();
6177    }
6178    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6179    if (repr == NULL) {
6180        return NULL;
6181    }
6182
6183    p = PyBytes_AS_STRING(repr);
6184    for (i = 0; i < len; i++) {
6185        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6186
6187        /* U+0000-U+00ff range */
6188        if (ch < 0x100) {
6189            if (ch >= ' ' && ch < 127) {
6190                if (ch != '\\') {
6191                    /* Copy printable US ASCII as-is */
6192                    *p++ = (char) ch;
6193                }
6194                /* Escape backslashes */
6195                else {
6196                    *p++ = '\\';
6197                    *p++ = '\\';
6198                }
6199            }
6200
6201            /* Map special whitespace to '\t', \n', '\r' */
6202            else if (ch == '\t') {
6203                *p++ = '\\';
6204                *p++ = 't';
6205            }
6206            else if (ch == '\n') {
6207                *p++ = '\\';
6208                *p++ = 'n';
6209            }
6210            else if (ch == '\r') {
6211                *p++ = '\\';
6212                *p++ = 'r';
6213            }
6214
6215            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6216            else {
6217                *p++ = '\\';
6218                *p++ = 'x';
6219                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6220                *p++ = Py_hexdigits[ch & 0x000F];
6221            }
6222        }
6223        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6224        else if (ch < 0x10000) {
6225            *p++ = '\\';
6226            *p++ = 'u';
6227            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6228            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6229            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230            *p++ = Py_hexdigits[ch & 0x000F];
6231        }
6232        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6233        else {
6234
6235            /* Make sure that the first two digits are zero */
6236            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6237            *p++ = '\\';
6238            *p++ = 'U';
6239            *p++ = '0';
6240            *p++ = '0';
6241            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6242            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6243            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6244            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6245            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6246            *p++ = Py_hexdigits[ch & 0x0000000F];
6247        }
6248    }
6249
6250    assert(p - PyBytes_AS_STRING(repr) > 0);
6251    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6252        return NULL;
6253    }
6254    return repr;
6255}
6256
6257PyObject *
6258PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6259                              Py_ssize_t size)
6260{
6261    PyObject *result;
6262    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6263    if (tmp == NULL) {
6264        return NULL;
6265    }
6266
6267    result = PyUnicode_AsUnicodeEscapeString(tmp);
6268    Py_DECREF(tmp);
6269    return result;
6270}
6271
6272/* --- Raw Unicode Escape Codec ------------------------------------------- */
6273
/* Decode `size` bytes at `s` using the "rawunicodeescape" codec.

   Only \uXXXX and \UXXXXXXXX are interpreted.  Every other byte is taken as
   the code point with the same value; that includes a backslash followed by
   any other character (both are copied) and a backslash that is the last
   byte of the input.

   A truncated escape or a value above MAX_UNICODE is passed to the error
   handler named by `errors`.

   Returns a new str object, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. (But decoding error
       handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
     writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Store a character: directly when it fits the writer's current maxchar,
   otherwise through the general writer call, which can fail. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        /* (a backslash that ends the input is treated the same way) */
        if (c != '\\' || s >= end) {
            WRITE_CHAR(c);
            continue;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* Not an escape this codec knows: emit the backslash and the
               following character unchanged. */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }
        /* Error positions start at the backslash (two bytes back). */
        startinpos = s - starts - 2;

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        /* Stopping early (end of input or a non-hex byte) leaves
           count != 0. */
        for (ch = 0; count && s < end; ++s, --count) {
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                break;
            }
        }
        if (!count) {
            if (ch <= MAX_UNICODE) {
                WRITE_CHAR(ch);
                continue;
            }
            message = "\\Uxxxxxxxx out of range";
        }

        /* Reached for a truncated escape or an out-of-range value. */
        endinpos = s-starts;
        /* Remaining input plus what has been written so far. */
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        /* Re-reserve room so the unchecked direct writes above stay within
           the buffer after the handler ran. */
        if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
            goto onError;
        }

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;

}
6393
6394
6395PyObject *
6396PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6397{
6398    PyObject *repr;
6399    char *p;
6400    Py_ssize_t expandsize, pos;
6401    int kind;
6402    void *data;
6403    Py_ssize_t len;
6404
6405    if (!PyUnicode_Check(unicode)) {
6406        PyErr_BadArgument();
6407        return NULL;
6408    }
6409    if (PyUnicode_READY(unicode) == -1) {
6410        return NULL;
6411    }
6412    kind = PyUnicode_KIND(unicode);
6413    data = PyUnicode_DATA(unicode);
6414    len = PyUnicode_GET_LENGTH(unicode);
6415    if (kind == PyUnicode_1BYTE_KIND) {
6416        return PyBytes_FromStringAndSize(data, len);
6417    }
6418
6419    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6420       bytes, and 1 byte characters 4. */
6421    expandsize = kind * 2 + 2;
6422
6423    if (len > PY_SSIZE_T_MAX / expandsize) {
6424        return PyErr_NoMemory();
6425    }
6426    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6427    if (repr == NULL) {
6428        return NULL;
6429    }
6430    if (len == 0) {
6431        return repr;
6432    }
6433
6434    p = PyBytes_AS_STRING(repr);
6435    for (pos = 0; pos < len; pos++) {
6436        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6437
6438        /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6439        if (ch < 0x100) {
6440            *p++ = (char) ch;
6441        }
6442        /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6443        else if (ch < 0x10000) {
6444            *p++ = '\\';
6445            *p++ = 'u';
6446            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6447            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6448            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6449            *p++ = Py_hexdigits[ch & 15];
6450        }
6451        /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6452        else {
6453            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6454            *p++ = '\\';
6455            *p++ = 'U';
6456            *p++ = '0';
6457            *p++ = '0';
6458            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6459            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6460            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463            *p++ = Py_hexdigits[ch & 15];
6464        }
6465    }
6466
6467    assert(p > PyBytes_AS_STRING(repr));
6468    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6469        return NULL;
6470    }
6471    return repr;
6472}
6473
6474PyObject *
6475PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6476                                 Py_ssize_t size)
6477{
6478    PyObject *result;
6479    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6480    if (tmp == NULL)
6481        return NULL;
6482    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6483    Py_DECREF(tmp);
6484    return result;
6485}
6486
6487/* --- Unicode Internal Codec ------------------------------------------- */
6488
/* Decode `size` bytes at `s` as a raw array of Py_UNICODE values (the
   deprecated "unicode_internal" codec).

   A DeprecationWarning is issued on every call; if the warning call reports
   failure the function returns NULL immediately.  Input whose length is not
   a multiple of Py_UNICODE_SIZE, and (when Py_UNICODE_WIDE is defined)
   values above 0x10FFFF, are passed to the error handler named by `errors`.

   Returns a new str object, or NULL with an exception set. */
PyObject *
_PyUnicode_DecodeUnicodeInternal(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const char *end;
    const char *reason;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "unicode_internal codec has been deprecated",
                     1))
        return NULL;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    _PyUnicodeWriter_Init(&writer);
    /* NOTE(review): looks like a guard for the rounded-up character count
       computed below -- confirm it can actually trigger. */
    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Number of Py_UNICODE units in the input, rounded up. */
    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;

    end = s + size;
    while (s < end) {
        Py_UNICODE uch;
        Py_UCS4 ch;
        /* Fewer bytes left than one whole Py_UNICODE unit. */
        if (end - s < Py_UNICODE_SIZE) {
            endinpos = end-starts;
            reason = "truncated input";
            goto error;
        }
        /* We copy the raw representation one byte at a time because the
           pointer may be unaligned (see test_codeccallbacks). */
        ((char *) &uch)[0] = s[0];
        ((char *) &uch)[1] = s[1];
#ifdef Py_UNICODE_WIDE
        ((char *) &uch)[2] = s[2];
        ((char *) &uch)[3] = s[3];
#endif
        ch = uch;
#ifdef Py_UNICODE_WIDE
        /* We have to sanity check the raw data, otherwise doom looms for
           some malformed UCS-4 data. */
        if (ch > 0x10ffff) {
            endinpos = s - starts + Py_UNICODE_SIZE;
            reason = "illegal code point (> 0x10FFFF)";
            goto error;
        }
#endif
        s += Py_UNICODE_SIZE;
#ifndef Py_UNICODE_WIDE
        /* With 16-bit units, a high surrogate followed by a low surrogate
           is combined into one code point; an unpaired high surrogate is
           written as it is. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
        {
            Py_UNICODE uch2;
            ((char *) &uch2)[0] = s[0];
            ((char *) &uch2)[1] = s[1];
            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
                s += Py_UNICODE_SIZE;
            }
        }
#endif

        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
            goto onError;
        continue;

        /* Both jumps to this label happen before `s` is advanced, so `s`
           still points at the start of the offending unit. */
  error:
        startinpos = s - starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicode_internal", reason,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6584
6585/* --- Latin-1 Codec ------------------------------------------------------ */
6586
6587PyObject *
6588PyUnicode_DecodeLatin1(const char *s,
6589                       Py_ssize_t size,
6590                       const char *errors)
6591{
6592    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6593    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6594}
6595
6596/* create or adjust a UnicodeEncodeError */
6597static void
6598make_encode_exception(PyObject **exceptionObject,
6599                      const char *encoding,
6600                      PyObject *unicode,
6601                      Py_ssize_t startpos, Py_ssize_t endpos,
6602                      const char *reason)
6603{
6604    if (*exceptionObject == NULL) {
6605        *exceptionObject = PyObject_CallFunction(
6606            PyExc_UnicodeEncodeError, "sOnns",
6607            encoding, unicode, startpos, endpos, reason);
6608    }
6609    else {
6610        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6611            goto onError;
6612        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6613            goto onError;
6614        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6615            goto onError;
6616        return;
6617      onError:
6618        Py_CLEAR(*exceptionObject);
6619    }
6620}
6621
6622/* raises a UnicodeEncodeError */
6623static void
6624raise_encode_exception(PyObject **exceptionObject,
6625                       const char *encoding,
6626                       PyObject *unicode,
6627                       Py_ssize_t startpos, Py_ssize_t endpos,
6628                       const char *reason)
6629{
6630    make_encode_exception(exceptionObject,
6631                          encoding, unicode, startpos, endpos, reason);
6632    if (*exceptionObject != NULL)
6633        PyCodec_StrictErrors(*exceptionObject);
6634}
6635
6636/* error handling callback helper:
6637   build arguments, call the callback and check the arguments,
6638   put the result into newpos and return the replacement string, which
6639   has to be freed by the caller */
6640static PyObject *
6641unicode_encode_call_errorhandler(const char *errors,
6642                                 PyObject **errorHandler,
6643                                 const char *encoding, const char *reason,
6644                                 PyObject *unicode, PyObject **exceptionObject,
6645                                 Py_ssize_t startpos, Py_ssize_t endpos,
6646                                 Py_ssize_t *newpos)
6647{
6648    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6649    Py_ssize_t len;
6650    PyObject *restuple;
6651    PyObject *resunicode;
6652
6653    if (*errorHandler == NULL) {
6654        *errorHandler = PyCodec_LookupError(errors);
6655        if (*errorHandler == NULL)
6656            return NULL;
6657    }
6658
6659    if (PyUnicode_READY(unicode) == -1)
6660        return NULL;
6661    len = PyUnicode_GET_LENGTH(unicode);
6662
6663    make_encode_exception(exceptionObject,
6664                          encoding, unicode, startpos, endpos, reason);
6665    if (*exceptionObject == NULL)
6666        return NULL;
6667
6668    restuple = PyObject_CallFunctionObjArgs(
6669        *errorHandler, *exceptionObject, NULL);
6670    if (restuple == NULL)
6671        return NULL;
6672    if (!PyTuple_Check(restuple)) {
6673        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6674        Py_DECREF(restuple);
6675        return NULL;
6676    }
6677    if (!PyArg_ParseTuple(restuple, argparse,
6678                          &resunicode, newpos)) {
6679        Py_DECREF(restuple);
6680        return NULL;
6681    }
6682    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6683        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6684        Py_DECREF(restuple);
6685        return NULL;
6686    }
6687    if (*newpos<0)
6688        *newpos = len + *newpos;
6689    if (*newpos<0 || *newpos>len) {
6690        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6691        Py_DECREF(restuple);
6692        return NULL;
6693    }
6694    Py_INCREF(resunicode);
6695    Py_DECREF(restuple);
6696    return resunicode;
6697}
6698
6699static PyObject *
6700unicode_encode_ucs1(PyObject *unicode,
6701                    const char *errors,
6702                    const Py_UCS4 limit)
6703{
6704    /* input state */
6705    Py_ssize_t pos=0, size;
6706    int kind;
6707    void *data;
6708    /* pointer into the output */
6709    char *str;
6710    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6711    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6712    PyObject *error_handler_obj = NULL;
6713    PyObject *exc = NULL;
6714    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6715    PyObject *rep = NULL;
6716    /* output object */
6717    _PyBytesWriter writer;
6718
6719    if (PyUnicode_READY(unicode) == -1)
6720        return NULL;
6721    size = PyUnicode_GET_LENGTH(unicode);
6722    kind = PyUnicode_KIND(unicode);
6723    data = PyUnicode_DATA(unicode);
6724    /* allocate enough for a simple encoding without
6725       replacements, if we need more, we'll resize */
6726    if (size == 0)
6727        return PyBytes_FromStringAndSize(NULL, 0);
6728
6729    _PyBytesWriter_Init(&writer);
6730    str = _PyBytesWriter_Alloc(&writer, size);
6731    if (str == NULL)
6732        return NULL;
6733
6734    while (pos < size) {
6735        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6736
6737        /* can we encode this? */
6738        if (ch < limit) {
6739            /* no overflow check, because we know that the space is enough */
6740            *str++ = (char)ch;
6741            ++pos;
6742        }
6743        else {
6744            Py_ssize_t newpos, i;
6745            /* startpos for collecting unencodable chars */
6746            Py_ssize_t collstart = pos;
6747            Py_ssize_t collend = collstart + 1;
6748            /* find all unecodable characters */
6749
6750            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6751                ++collend;
6752
6753            /* Only overallocate the buffer if it's not the last write */
6754            writer.overallocate = (collend < size);
6755
6756            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6757            if (error_handler == _Py_ERROR_UNKNOWN)
6758                error_handler = get_error_handler(errors);
6759
6760            switch (error_handler) {
6761            case _Py_ERROR_STRICT:
6762                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6763                goto onError;
6764
6765            case _Py_ERROR_REPLACE:
6766                memset(str, '?', collend - collstart);
6767                str += (collend - collstart);
6768                /* fall through ignore error handler */
6769            case _Py_ERROR_IGNORE:
6770                pos = collend;
6771                break;
6772
6773            case _Py_ERROR_BACKSLASHREPLACE:
6774                /* subtract preallocated bytes */
6775                writer.min_size -= (collend - collstart);
6776                str = backslashreplace(&writer, str,
6777                                       unicode, collstart, collend);
6778                if (str == NULL)
6779                    goto onError;
6780                pos = collend;
6781                break;
6782
6783            case _Py_ERROR_XMLCHARREFREPLACE:
6784                /* subtract preallocated bytes */
6785                writer.min_size -= (collend - collstart);
6786                str = xmlcharrefreplace(&writer, str,
6787                                        unicode, collstart, collend);
6788                if (str == NULL)
6789                    goto onError;
6790                pos = collend;
6791                break;
6792
6793            case _Py_ERROR_SURROGATEESCAPE:
6794                for (i = collstart; i < collend; ++i) {
6795                    ch = PyUnicode_READ(kind, data, i);
6796                    if (ch < 0xdc80 || 0xdcff < ch) {
6797                        /* Not a UTF-8b surrogate */
6798                        break;
6799                    }
6800                    *str++ = (char)(ch - 0xdc00);
6801                    ++pos;
6802                }
6803                if (i >= collend)
6804                    break;
6805                collstart = pos;
6806                assert(collstart != collend);
6807                /* fallback to general error handling */
6808
6809            default:
6810                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6811                                                       encoding, reason, unicode, &exc,
6812                                                       collstart, collend, &newpos);
6813                if (rep == NULL)
6814                    goto onError;
6815
6816                /* subtract preallocated bytes */
6817                writer.min_size -= 1;
6818
6819                if (PyBytes_Check(rep)) {
6820                    /* Directly copy bytes result to output. */
6821                    str = _PyBytesWriter_WriteBytes(&writer, str,
6822                                                    PyBytes_AS_STRING(rep),
6823                                                    PyBytes_GET_SIZE(rep));
6824                    if (str == NULL)
6825                        goto onError;
6826                }
6827                else {
6828                    assert(PyUnicode_Check(rep));
6829
6830                    if (PyUnicode_READY(rep) < 0)
6831                        goto onError;
6832
6833                    if (PyUnicode_IS_ASCII(rep)) {
6834                        /* Fast path: all characters are smaller than limit */
6835                        assert(limit >= 128);
6836                        assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6837                        str = _PyBytesWriter_WriteBytes(&writer, str,
6838                                                        PyUnicode_DATA(rep),
6839                                                        PyUnicode_GET_LENGTH(rep));
6840                    }
6841                    else {
6842                        Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6843
6844                        str = _PyBytesWriter_Prepare(&writer, str, repsize);
6845                        if (str == NULL)
6846                            goto onError;
6847
6848                        /* check if there is anything unencodable in the
6849                           replacement and copy it to the output */
6850                        for (i = 0; repsize-->0; ++i, ++str) {
6851                            ch = PyUnicode_READ_CHAR(rep, i);
6852                            if (ch >= limit) {
6853                                raise_encode_exception(&exc, encoding, unicode,
6854                                                       pos, pos+1, reason);
6855                                goto onError;
6856                            }
6857                            *str = (char)ch;
6858                        }
6859                    }
6860                }
6861                pos = newpos;
6862                Py_CLEAR(rep);
6863            }
6864
6865            /* If overallocation was disabled, ensure that it was the last
6866               write. Otherwise, we missed an optimization */
6867            assert(writer.overallocate || pos == size);
6868        }
6869    }
6870
6871    Py_XDECREF(error_handler_obj);
6872    Py_XDECREF(exc);
6873    return _PyBytesWriter_Finish(&writer, str);
6874
6875  onError:
6876    Py_XDECREF(rep);
6877    _PyBytesWriter_Dealloc(&writer);
6878    Py_XDECREF(error_handler_obj);
6879    Py_XDECREF(exc);
6880    return NULL;
6881}
6882
6883/* Deprecated */
6884PyObject *
6885PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6886                       Py_ssize_t size,
6887                       const char *errors)
6888{
6889    PyObject *result;
6890    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6891    if (unicode == NULL)
6892        return NULL;
6893    result = unicode_encode_ucs1(unicode, errors, 256);
6894    Py_DECREF(unicode);
6895    return result;
6896}
6897
6898PyObject *
6899_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6900{
6901    if (!PyUnicode_Check(unicode)) {
6902        PyErr_BadArgument();
6903        return NULL;
6904    }
6905    if (PyUnicode_READY(unicode) == -1)
6906        return NULL;
6907    /* Fast path: if it is a one-byte string, construct
6908       bytes object directly. */
6909    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6910        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6911                                         PyUnicode_GET_LENGTH(unicode));
6912    /* Non-Latin-1 characters present. Defer to above function to
6913       raise the exception. */
6914    return unicode_encode_ucs1(unicode, errors, 256);
6915}
6916
6917PyObject*
6918PyUnicode_AsLatin1String(PyObject *unicode)
6919{
6920    return _PyUnicode_AsLatin1String(unicode, NULL);
6921}
6922
6923/* --- 7-bit ASCII Codec -------------------------------------------------- */
6924
/* Decode `size` bytes starting at `s` as 7-bit ASCII into a str object.

   Bytes >= 128 are routed to the error handler named by `errors`; the
   handler name is resolved with get_error_handler() only when the first
   such byte is met.

   Returns the decoded string, or NULL on failure. */
PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    /* cached copies of writer.kind / writer.data; they must be refreshed
       after every call that may change the writer's buffer */
    int kind;
    void *data;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128)
        return get_latin1_char((unsigned char)s[0]);

    /* Prepare room for `size` characters with a maximum character of 127,
       i.e. assume the whole input is valid ASCII. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
        return NULL;

    e = s + size;
    data = writer.data;
    /* Bulk-decode straight into the writer's buffer; the return value is
       used as the number of characters written (presumably the length of
       the leading pure-ASCII run -- see ascii_decode()). */
    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
    writer.pos = outpos;
    if (writer.pos == size)
        return _PyUnicodeWriter_Finish(&writer);

    /* Continue byte by byte from where the bulk decoder stopped. */
    s += writer.pos;
    kind = writer.kind;
    while (s < e) {
        unsigned char c = (unsigned char)*s;
        if (c < 128) {
            PyUnicode_WRITE(kind, data, writer.pos, c);
            writer.pos++;
            ++s;
            continue;
        }

        /* byte outside range 0x00..0x7f: call the error handler */

        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = get_error_handler(errors);

        switch (error_handler)
        {
        case _Py_ERROR_REPLACE:
        case _Py_ERROR_SURROGATEESCAPE:
            /* Fast-path: the error handler only writes one character,
               but we may switch to UCS2 at the first write */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            kind = writer.kind;
            data = writer.data;

            /* "replace" writes U+FFFD; "surrogateescape" writes the byte
               offset by 0xdc00 */
            if (error_handler == _Py_ERROR_REPLACE)
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
            else
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
            writer.pos++;
            ++s;
            break;

        case _Py_ERROR_IGNORE:
            /* drop the offending byte */
            ++s;
            break;

        default:
            /* Generic path: report the single byte at `s` to the error
               handler callback, then re-read the writer state. */
            startinpos = s-starts;
            endinpos = startinpos + 1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            kind = writer.kind;
            data = writer.data;
        }
    }
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7023
7024/* Deprecated */
7025PyObject *
7026PyUnicode_EncodeASCII(const Py_UNICODE *p,
7027                      Py_ssize_t size,
7028                      const char *errors)
7029{
7030    PyObject *result;
7031    PyObject *unicode = PyUnicode_FromUnicode(p, size);
7032    if (unicode == NULL)
7033        return NULL;
7034    result = unicode_encode_ucs1(unicode, errors, 128);
7035    Py_DECREF(unicode);
7036    return result;
7037}
7038
7039PyObject *
7040_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7041{
7042    if (!PyUnicode_Check(unicode)) {
7043        PyErr_BadArgument();
7044        return NULL;
7045    }
7046    if (PyUnicode_READY(unicode) == -1)
7047        return NULL;
7048    /* Fast path: if it is an ASCII-only string, construct bytes object
7049       directly. Else defer to above function to raise the exception. */
7050    if (PyUnicode_IS_ASCII(unicode))
7051        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7052                                         PyUnicode_GET_LENGTH(unicode));
7053    return unicode_encode_ucs1(unicode, errors, 128);
7054}
7055
7056PyObject *
7057PyUnicode_AsASCIIString(PyObject *unicode)
7058{
7059    return _PyUnicode_AsASCIIString(unicode, NULL);
7060}
7061
7062#ifdef MS_WINDOWS
7063
7064/* --- MBCS codecs for Windows -------------------------------------------- */
7065
7066#if SIZEOF_INT < SIZEOF_SIZE_T
7067#define NEED_RETRY
7068#endif
7069
7070#ifndef WC_ERR_INVALID_CHARS
7071#  define WC_ERR_INVALID_CHARS 0x0080
7072#endif
7073
7074static const char*
7075code_page_name(UINT code_page, PyObject **obj)
7076{
7077    *obj = NULL;
7078    if (code_page == CP_ACP)
7079        return "mbcs";
7080    if (code_page == CP_UTF7)
7081        return "CP_UTF7";
7082    if (code_page == CP_UTF8)
7083        return "CP_UTF8";
7084
7085    *obj = PyBytes_FromFormat("cp%u", code_page);
7086    if (*obj == NULL)
7087        return NULL;
7088    return PyBytes_AS_STRING(*obj);
7089}
7090
7091static DWORD
7092decode_code_page_flags(UINT code_page)
7093{
7094    if (code_page == CP_UTF7) {
7095        /* The CP_UTF7 decoder only supports flags=0 */
7096        return 0;
7097    }
7098    else
7099        return MB_ERR_INVALID_CHARS;
7100}
7101
7102/*
7103 * Decode a byte string from a Windows code page into unicode object in strict
7104 * mode.
7105 *
7106 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7107 * OSError and returns -1 on other error.
7108 */
7109static int
7110decode_code_page_strict(UINT code_page,
7111                        PyObject **v,
7112                        const char *in,
7113                        int insize)
7114{
7115    const DWORD flags = decode_code_page_flags(code_page);
7116    wchar_t *out;
7117    DWORD outsize;
7118
7119    /* First get the size of the result */
7120    assert(insize > 0);
7121    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7122    if (outsize <= 0)
7123        goto error;
7124
7125    if (*v == NULL) {
7126        /* Create unicode object */
7127        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7128        *v = (PyObject*)_PyUnicode_New(outsize);
7129        if (*v == NULL)
7130            return -1;
7131        out = PyUnicode_AS_UNICODE(*v);
7132    }
7133    else {
7134        /* Extend unicode object */
7135        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7136        if (unicode_resize(v, n + outsize) < 0)
7137            return -1;
7138        out = PyUnicode_AS_UNICODE(*v) + n;
7139    }
7140
7141    /* Do the conversion */
7142    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7143    if (outsize <= 0)
7144        goto error;
7145    return insize;
7146
7147error:
7148    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7149        return -2;
7150    PyErr_SetFromWindowsErr(0);
7151    return -1;
7152}
7153
7154/*
7155 * Decode a byte string from a code page into unicode object with an error
7156 * handler.
7157 *
7158 * Returns consumed size if succeed, or raise an OSError or
7159 * UnicodeDecodeError exception and returns -1 on error.
7160 */
7161static int
7162decode_code_page_errors(UINT code_page,
7163                        PyObject **v,
7164                        const char *in, const int size,
7165                        const char *errors, int final)
7166{
7167    const char *startin = in;
7168    const char *endin = in + size;
7169    const DWORD flags = decode_code_page_flags(code_page);
7170    /* Ideally, we should get reason from FormatMessage. This is the Windows
7171       2000 English version of the message. */
7172    const char *reason = "No mapping for the Unicode character exists "
7173                         "in the target code page.";
7174    /* each step cannot decode more than 1 character, but a character can be
7175       represented as a surrogate pair */
7176    wchar_t buffer[2], *startout, *out;
7177    int insize;
7178    Py_ssize_t outsize;
7179    PyObject *errorHandler = NULL;
7180    PyObject *exc = NULL;
7181    PyObject *encoding_obj = NULL;
7182    const char *encoding;
7183    DWORD err;
7184    int ret = -1;
7185
7186    assert(size > 0);
7187
7188    encoding = code_page_name(code_page, &encoding_obj);
7189    if (encoding == NULL)
7190        return -1;
7191
7192    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7193        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7194           UnicodeDecodeError. */
7195        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7196        if (exc != NULL) {
7197            PyCodec_StrictErrors(exc);
7198            Py_CLEAR(exc);
7199        }
7200        goto error;
7201    }
7202
7203    if (*v == NULL) {
7204        /* Create unicode object */
7205        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7206            PyErr_NoMemory();
7207            goto error;
7208        }
7209        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7210        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7211        if (*v == NULL)
7212            goto error;
7213        startout = PyUnicode_AS_UNICODE(*v);
7214    }
7215    else {
7216        /* Extend unicode object */
7217        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7218        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7219            PyErr_NoMemory();
7220            goto error;
7221        }
7222        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7223            goto error;
7224        startout = PyUnicode_AS_UNICODE(*v) + n;
7225    }
7226
7227    /* Decode the byte string character per character */
7228    out = startout;
7229    while (in < endin)
7230    {
7231        /* Decode a character */
7232        insize = 1;
7233        do
7234        {
7235            outsize = MultiByteToWideChar(code_page, flags,
7236                                          in, insize,
7237                                          buffer, Py_ARRAY_LENGTH(buffer));
7238            if (outsize > 0)
7239                break;
7240            err = GetLastError();
7241            if (err != ERROR_NO_UNICODE_TRANSLATION
7242                && err != ERROR_INSUFFICIENT_BUFFER)
7243            {
7244                PyErr_SetFromWindowsErr(0);
7245                goto error;
7246            }
7247            insize++;
7248        }
7249        /* 4=maximum length of a UTF-8 sequence */
7250        while (insize <= 4 && (in + insize) <= endin);
7251
7252        if (outsize <= 0) {
7253            Py_ssize_t startinpos, endinpos, outpos;
7254
7255            /* last character in partial decode? */
7256            if (in + insize >= endin && !final)
7257                break;
7258
7259            startinpos = in - startin;
7260            endinpos = startinpos + 1;
7261            outpos = out - PyUnicode_AS_UNICODE(*v);
7262            if (unicode_decode_call_errorhandler_wchar(
7263                    errors, &errorHandler,
7264                    encoding, reason,
7265                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7266                    v, &outpos))
7267            {
7268                goto error;
7269            }
7270            out = PyUnicode_AS_UNICODE(*v) + outpos;
7271        }
7272        else {
7273            in += insize;
7274            memcpy(out, buffer, outsize * sizeof(wchar_t));
7275            out += outsize;
7276        }
7277    }
7278
7279    /* write a NUL character at the end */
7280    *out = 0;
7281
7282    /* Extend unicode object */
7283    outsize = out - startout;
7284    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7285    if (unicode_resize(v, outsize) < 0)
7286        goto error;
7287    /* (in - startin) <= size and size is an int */
7288    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7289
7290error:
7291    Py_XDECREF(encoding_obj);
7292    Py_XDECREF(errorHandler);
7293    Py_XDECREF(exc);
7294    return ret;
7295}
7296
/* Decode `size` bytes at `s` from the Windows code page `code_page`.

   The input is processed in chunks of at most INT_MAX bytes (only relevant
   when NEED_RETRY is defined).  Each chunk is first decoded in strict mode;
   when that reports a decode error (-2) the chunk is decoded again with the
   error handler named by `errors`.

   If `consumed` is not NULL, it receives the number of bytes decoded and
   the decoder is never run in "final" mode; if it is NULL the last chunk
   is treated as final.

   Returns the decoded string, or NULL on failure. */
static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    /* result object, created by the first chunk and extended afterwards */
    PyObject *v = NULL;
    int chunk_size, final, converted, done;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (consumed)
        *consumed = 0;

    do
    {
#ifdef NEED_RETRY
        if (size > INT_MAX) {
            /* more input than one call can take: decode an INT_MAX-sized
               chunk as non-final and come back for the rest */
            chunk_size = INT_MAX;
            final = 0;
            done = 0;
        }
        else
#endif
        {
            /* the remaining input fits in one call: this is the last chunk */
            chunk_size = (int)size;
            final = (consumed == NULL);
            done = 1;
        }

        if (chunk_size == 0 && done) {
            /* nothing left: keep what previous chunks produced, or return
               the empty string if there is no result object at all */
            if (v != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        converted = decode_code_page_strict(code_page, &v,
                                            s, chunk_size);
        /* -2: the strict decoder hit undecodable input */
        if (converted == -2)
            converted = decode_code_page_errors(code_page, &v,
                                                s, chunk_size,
                                                errors, final);
        assert(converted != 0 || done);

        if (converted < 0) {
            Py_XDECREF(v);
            return NULL;
        }

        if (consumed)
            *consumed += converted;

        /* advance past the bytes that were decoded */
        s += converted;
        size -= converted;
    } while (!done);

    return unicode_result(v);
}
7357
7358PyObject *
7359PyUnicode_DecodeCodePageStateful(int code_page,
7360                                 const char *s,
7361                                 Py_ssize_t size,
7362                                 const char *errors,
7363                                 Py_ssize_t *consumed)
7364{
7365    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7366}
7367
7368PyObject *
7369PyUnicode_DecodeMBCSStateful(const char *s,
7370                             Py_ssize_t size,
7371                             const char *errors,
7372                             Py_ssize_t *consumed)
7373{
7374    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7375}
7376
7377PyObject *
7378PyUnicode_DecodeMBCS(const char *s,
7379                     Py_ssize_t size,
7380                     const char *errors)
7381{
7382    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7383}
7384
7385static DWORD
7386encode_code_page_flags(UINT code_page, const char *errors)
7387{
7388    if (code_page == CP_UTF8) {
7389        return WC_ERR_INVALID_CHARS;
7390    }
7391    else if (code_page == CP_UTF7) {
7392        /* CP_UTF7 only supports flags=0 */
7393        return 0;
7394    }
7395    else {
7396        if (errors != NULL && strcmp(errors, "replace") == 0)
7397            return 0;
7398        else
7399            return WC_NO_BEST_FIT_CHARS;
7400    }
7401}
7402
7403/*
7404 * Encode a Unicode string to a Windows code page into a byte string in strict
7405 * mode.
7406 *
7407 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7408 * an OSError and returns -1 on other error.
7409 */
7410static int
7411encode_code_page_strict(UINT code_page, PyObject **outbytes,
7412                        PyObject *unicode, Py_ssize_t offset, int len,
7413                        const char* errors)
7414{
7415    BOOL usedDefaultChar = FALSE;
7416    BOOL *pusedDefaultChar = &usedDefaultChar;
7417    int outsize;
7418    wchar_t *p;
7419    Py_ssize_t size;
7420    const DWORD flags = encode_code_page_flags(code_page, NULL);
7421    char *out;
7422    /* Create a substring so that we can get the UTF-16 representation
7423       of just the slice under consideration. */
7424    PyObject *substring;
7425
7426    assert(len > 0);
7427
7428    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7429        pusedDefaultChar = &usedDefaultChar;
7430    else
7431        pusedDefaultChar = NULL;
7432
7433    substring = PyUnicode_Substring(unicode, offset, offset+len);
7434    if (substring == NULL)
7435        return -1;
7436    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7437    if (p == NULL) {
7438        Py_DECREF(substring);
7439        return -1;
7440    }
7441    assert(size <= INT_MAX);
7442
7443    /* First get the size of the result */
7444    outsize = WideCharToMultiByte(code_page, flags,
7445                                  p, (int)size,
7446                                  NULL, 0,
7447                                  NULL, pusedDefaultChar);
7448    if (outsize <= 0)
7449        goto error;
7450    /* If we used a default char, then we failed! */
7451    if (pusedDefaultChar && *pusedDefaultChar) {
7452        Py_DECREF(substring);
7453        return -2;
7454    }
7455
7456    if (*outbytes == NULL) {
7457        /* Create string object */
7458        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7459        if (*outbytes == NULL) {
7460            Py_DECREF(substring);
7461            return -1;
7462        }
7463        out = PyBytes_AS_STRING(*outbytes);
7464    }
7465    else {
7466        /* Extend string object */
7467        const Py_ssize_t n = PyBytes_Size(*outbytes);
7468        if (outsize > PY_SSIZE_T_MAX - n) {
7469            PyErr_NoMemory();
7470            Py_DECREF(substring);
7471            return -1;
7472        }
7473        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7474            Py_DECREF(substring);
7475            return -1;
7476        }
7477        out = PyBytes_AS_STRING(*outbytes) + n;
7478    }
7479
7480    /* Do the conversion */
7481    outsize = WideCharToMultiByte(code_page, flags,
7482                                  p, (int)size,
7483                                  out, outsize,
7484                                  NULL, pusedDefaultChar);
7485    Py_CLEAR(substring);
7486    if (outsize <= 0)
7487        goto error;
7488    if (pusedDefaultChar && *pusedDefaultChar)
7489        return -2;
7490    return 0;
7491
7492error:
7493    Py_XDECREF(substring);
7494    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7495        return -2;
7496    PyErr_SetFromWindowsErr(0);
7497    return -1;
7498}
7499
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The characters unicode[unicode_offset : unicode_offset+insize] are converted
 * one at a time with WideCharToMultiByte(). Characters that cannot be
 * converted are passed to the error handler named by `errors`, and the
 * handler's replacement (bytes, or an ASCII-only str) is written instead.
 * The result is stored in *outbytes: a new bytes object is created when
 * *outbytes is NULL, otherwise the existing object is extended.
 *
 * Returns 0 on success. On failure returns -1; the visible failure paths set
 * or propagate a Python exception (UnicodeEncodeError, MemoryError, or the
 * error built by PyErr_SetFromWindowsErr()).
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    /* Name of the code page, used in error messages; encoding_obj may own
       the storage and is released on every exit path. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError.
           NOTE(review): the exception is built with start=0 and end=0
           rather than the position of the offending character. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The "used default char" out-parameter is only passed for code pages
       other than UTF-8 and UTF-7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) output bytes per input character, checking the
       multiplication for overflow first. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        /* Append after the bytes that are already present. */
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Build the UTF-16 form of the code point: one unit for the BMP,
           a surrogate pair otherwise. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Converted without falling back to the default character:
               copy the bytes and move on. A conversion that did use the
               default character falls through to the error handler. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            /* Any failure other than "no translation" is reported as a
               Windows error. */
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* Ask the error handler for a replacement for the single character
           at pos; newpos receives the position at which to resume. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied verbatim */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the buffer by the difference between the
                   replacement length and one byte; `out` is recomputed from
                   its offset because the resize may move the object. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* Otherwise the replacement is read as a str, which must
               contain only ASCII characters. */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* Same buffer adjustment as in the bytes case. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* Note: pos was already advanced to newpos above. */
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Shrink the object to the number of bytes actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Shared cleanup: reached on success (ret == 0) as well as on failure. */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7687
7688static PyObject *
7689encode_code_page(int code_page,
7690                 PyObject *unicode,
7691                 const char *errors)
7692{
7693    Py_ssize_t len;
7694    PyObject *outbytes = NULL;
7695    Py_ssize_t offset;
7696    int chunk_len, ret, done;
7697
7698    if (!PyUnicode_Check(unicode)) {
7699        PyErr_BadArgument();
7700        return NULL;
7701    }
7702
7703    if (PyUnicode_READY(unicode) == -1)
7704        return NULL;
7705    len = PyUnicode_GET_LENGTH(unicode);
7706
7707    if (code_page < 0) {
7708        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7709        return NULL;
7710    }
7711
7712    if (len == 0)
7713        return PyBytes_FromStringAndSize(NULL, 0);
7714
7715    offset = 0;
7716    do
7717    {
7718#ifdef NEED_RETRY
7719        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7720           chunks. */
7721        if (len > INT_MAX/2) {
7722            chunk_len = INT_MAX/2;
7723            done = 0;
7724        }
7725        else
7726#endif
7727        {
7728            chunk_len = (int)len;
7729            done = 1;
7730        }
7731
7732        ret = encode_code_page_strict(code_page, &outbytes,
7733                                      unicode, offset, chunk_len,
7734                                      errors);
7735        if (ret == -2)
7736            ret = encode_code_page_errors(code_page, &outbytes,
7737                                          unicode, offset,
7738                                          chunk_len, errors);
7739        if (ret < 0) {
7740            Py_XDECREF(outbytes);
7741            return NULL;
7742        }
7743
7744        offset += chunk_len;
7745        len -= chunk_len;
7746    } while (!done);
7747
7748    return outbytes;
7749}
7750
7751PyObject *
7752PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7753                     Py_ssize_t size,
7754                     const char *errors)
7755{
7756    PyObject *unicode, *res;
7757    unicode = PyUnicode_FromUnicode(p, size);
7758    if (unicode == NULL)
7759        return NULL;
7760    res = encode_code_page(CP_ACP, unicode, errors);
7761    Py_DECREF(unicode);
7762    return res;
7763}
7764
7765PyObject *
7766PyUnicode_EncodeCodePage(int code_page,
7767                         PyObject *unicode,
7768                         const char *errors)
7769{
7770    return encode_code_page(code_page, unicode, errors);
7771}
7772
7773PyObject *
7774PyUnicode_AsMBCSString(PyObject *unicode)
7775{
7776    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7777}
7778
7779#undef NEED_RETRY
7780
7781#endif /* MS_WINDOWS */
7782
7783/* --- Character Mapping Codec -------------------------------------------- */
7784
/* Decode the bytes s[0:size] through `mapping`, a str used as a lookup
   table indexed by byte value, appending the result to `writer`.

   A byte whose index is outside the table, or whose table entry is 0xFFFE,
   is treated as undefined and passed to the error handler named by
   `errors`.

   The fast loops store into the writer's buffer without growing it; this
   relies on the caller having prepared the writer for `size` characters
   (PyUnicode_DecodeCharmap does so before calling).

   Returns 0 on success, -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        /* Every byte value has a table entry here, so no bounds check and
           no undefined-mapping handling is done in this loop. */
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maximum character to 0xff, then reload
                   the cached fields, which the call may have changed. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL on this path. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight inner loops for a full 2-byte table. They leave through
               `goto Error` with x already set whenever a character needs
               the general handling below. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Does not fit the current buffer (this also catches
                       0xFFFE): let the general code deal with it. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* 0xFFFE marks an undefined mapping */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* General case: look the byte up with a bounds check. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Also entered from the fast loops above, with x already set. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is not advanced here; the handler is given &s
               (presumably it repositions the input -- TODO confirm). */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7903
7904static int
7905charmap_decode_mapping(const char *s,
7906                       Py_ssize_t size,
7907                       PyObject *mapping,
7908                       const char *errors,
7909                       _PyUnicodeWriter *writer)
7910{
7911    const char *starts = s;
7912    const char *e;
7913    Py_ssize_t startinpos, endinpos;
7914    PyObject *errorHandler = NULL, *exc = NULL;
7915    unsigned char ch;
7916    PyObject *key, *item = NULL;
7917
7918    e = s + size;
7919
7920    while (s < e) {
7921        ch = *s;
7922
7923        /* Get mapping (char ordinal -> integer, Unicode char or None) */
7924        key = PyLong_FromLong((long)ch);
7925        if (key == NULL)
7926            goto onError;
7927
7928        item = PyObject_GetItem(mapping, key);
7929        Py_DECREF(key);
7930        if (item == NULL) {
7931            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7932                /* No mapping found means: mapping is undefined. */
7933                PyErr_Clear();
7934                goto Undefined;
7935            } else
7936                goto onError;
7937        }
7938
7939        /* Apply mapping */
7940        if (item == Py_None)
7941            goto Undefined;
7942        if (PyLong_Check(item)) {
7943            long value = PyLong_AS_LONG(item);
7944            if (value == 0xFFFE)
7945                goto Undefined;
7946            if (value < 0 || value > MAX_UNICODE) {
7947                PyErr_Format(PyExc_TypeError,
7948                             "character mapping must be in range(0x%lx)",
7949                             (unsigned long)MAX_UNICODE + 1);
7950                goto onError;
7951            }
7952
7953            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7954                goto onError;
7955        }
7956        else if (PyUnicode_Check(item)) {
7957            if (PyUnicode_READY(item) == -1)
7958                goto onError;
7959            if (PyUnicode_GET_LENGTH(item) == 1) {
7960                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7961                if (value == 0xFFFE)
7962                    goto Undefined;
7963                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7964                    goto onError;
7965            }
7966            else {
7967                writer->overallocate = 1;
7968                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7969                    goto onError;
7970            }
7971        }
7972        else {
7973            /* wrong return value */
7974            PyErr_SetString(PyExc_TypeError,
7975                            "character mapping must return integer, None or str");
7976            goto onError;
7977        }
7978        Py_CLEAR(item);
7979        ++s;
7980        continue;
7981
7982Undefined:
7983        /* undefined mapping */
7984        Py_CLEAR(item);
7985        startinpos = s-starts;
7986        endinpos = startinpos+1;
7987        if (unicode_decode_call_errorhandler_writer(
7988                errors, &errorHandler,
7989                "charmap", "character maps to <undefined>",
7990                &starts, &e, &startinpos, &endinpos, &exc, &s,
7991                writer)) {
7992            goto onError;
7993        }
7994    }
7995    Py_XDECREF(errorHandler);
7996    Py_XDECREF(exc);
7997    return 0;
7998
7999onError:
8000    Py_XDECREF(item);
8001    Py_XDECREF(errorHandler);
8002    Py_XDECREF(exc);
8003    return -1;
8004}
8005
8006PyObject *
8007PyUnicode_DecodeCharmap(const char *s,
8008                        Py_ssize_t size,
8009                        PyObject *mapping,
8010                        const char *errors)
8011{
8012    _PyUnicodeWriter writer;
8013
8014    /* Default to Latin-1 */
8015    if (mapping == NULL)
8016        return PyUnicode_DecodeLatin1(s, size, errors);
8017
8018    if (size == 0)
8019        _Py_RETURN_UNICODE_EMPTY();
8020    _PyUnicodeWriter_Init(&writer);
8021    writer.min_length = size;
8022    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8023        goto onError;
8024
8025    if (PyUnicode_CheckExact(mapping)) {
8026        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8027            goto onError;
8028    }
8029    else {
8030        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8031            goto onError;
8032    }
8033    return _PyUnicodeWriter_Finish(&writer);
8034
8035  onError:
8036    _PyUnicodeWriter_Dealloc(&writer);
8037    return NULL;
8038}
8039
8040/* Charmap encoding: the lookup table */
8041
/* Three-level lookup table mapping a BMP code point to a byte value.
   Built by PyUnicode_BuildEncodingMap() and read by encoding_map_lookup().
   A code point c is split into c>>11 (level 1), (c>>7)&0xF (level 2) and
   c&0x7F (level 3). */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by c>>11: number of the level-2 block, or 0xFF if unused. */
    unsigned char level1[32];
    /* Number of level-2 blocks (16 bytes each) and of level-3 blocks
       (128 bytes each) stored in level23. */
    int count2, count3;
    /* Variable-length tail: 16*count2 level-2 bytes (level-3 block number,
       or 0xFF if unused) followed by 128*count3 level-3 bytes (the encoded
       byte, 0 meaning "no mapping").  Declared with one element; the real
       size is provided by the allocation in PyUnicode_BuildEncodingMap(). */
    unsigned char level23[1];
};
8048
8049static PyObject*
8050encoding_map_size(PyObject *obj, PyObject* args)
8051{
8052    struct encoding_map *map = (struct encoding_map*)obj;
8053    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8054                           128*map->count3);
8055}
8056
8057static PyMethodDef encoding_map_methods[] = {
8058    {"size", encoding_map_size, METH_NOARGS,
8059     PyDoc_STR("Return the size (in bytes) of this object") },
8060    { 0 }
8061};
8062
/* tp_dealloc of EncodingMap: the object holds no references to other
   objects, so releasing its memory block is all that is needed.  The block
   is obtained with PyObject_MALLOC in PyUnicode_BuildEncodingMap(). */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
8068
/* Type object of the lookup table returned by PyUnicode_BuildEncodingMap().
   Only tp_dealloc, tp_flags and tp_methods are filled in; every other slot
   is left at 0.  tp_itemsize is 0 although instances carry a
   variable-length tail: the instance size is computed by hand where the
   object is allocated. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8112
8113PyObject*
8114PyUnicode_BuildEncodingMap(PyObject* string)
8115{
8116    PyObject *result;
8117    struct encoding_map *mresult;
8118    int i;
8119    int need_dict = 0;
8120    unsigned char level1[32];
8121    unsigned char level2[512];
8122    unsigned char *mlevel1, *mlevel2, *mlevel3;
8123    int count2 = 0, count3 = 0;
8124    int kind;
8125    void *data;
8126    Py_ssize_t length;
8127    Py_UCS4 ch;
8128
8129    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8130        PyErr_BadArgument();
8131        return NULL;
8132    }
8133    kind = PyUnicode_KIND(string);
8134    data = PyUnicode_DATA(string);
8135    length = PyUnicode_GET_LENGTH(string);
8136    length = Py_MIN(length, 256);
8137    memset(level1, 0xFF, sizeof level1);
8138    memset(level2, 0xFF, sizeof level2);
8139
8140    /* If there isn't a one-to-one mapping of NULL to \0,
8141       or if there are non-BMP characters, we need to use
8142       a mapping dictionary. */
8143    if (PyUnicode_READ(kind, data, 0) != 0)
8144        need_dict = 1;
8145    for (i = 1; i < length; i++) {
8146        int l1, l2;
8147        ch = PyUnicode_READ(kind, data, i);
8148        if (ch == 0 || ch > 0xFFFF) {
8149            need_dict = 1;
8150            break;
8151        }
8152        if (ch == 0xFFFE)
8153            /* unmapped character */
8154            continue;
8155        l1 = ch >> 11;
8156        l2 = ch >> 7;
8157        if (level1[l1] == 0xFF)
8158            level1[l1] = count2++;
8159        if (level2[l2] == 0xFF)
8160            level2[l2] = count3++;
8161    }
8162
8163    if (count2 >= 0xFF || count3 >= 0xFF)
8164        need_dict = 1;
8165
8166    if (need_dict) {
8167        PyObject *result = PyDict_New();
8168        PyObject *key, *value;
8169        if (!result)
8170            return NULL;
8171        for (i = 0; i < length; i++) {
8172            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8173            value = PyLong_FromLong(i);
8174            if (!key || !value)
8175                goto failed1;
8176            if (PyDict_SetItem(result, key, value) == -1)
8177                goto failed1;
8178            Py_DECREF(key);
8179            Py_DECREF(value);
8180        }
8181        return result;
8182      failed1:
8183        Py_XDECREF(key);
8184        Py_XDECREF(value);
8185        Py_DECREF(result);
8186        return NULL;
8187    }
8188
8189    /* Create a three-level trie */
8190    result = PyObject_MALLOC(sizeof(struct encoding_map) +
8191                             16*count2 + 128*count3 - 1);
8192    if (!result)
8193        return PyErr_NoMemory();
8194    PyObject_Init(result, &EncodingMapType);
8195    mresult = (struct encoding_map*)result;
8196    mresult->count2 = count2;
8197    mresult->count3 = count3;
8198    mlevel1 = mresult->level1;
8199    mlevel2 = mresult->level23;
8200    mlevel3 = mresult->level23 + 16*count2;
8201    memcpy(mlevel1, level1, 32);
8202    memset(mlevel2, 0xFF, 16*count2);
8203    memset(mlevel3, 0, 128*count3);
8204    count3 = 0;
8205    for (i = 1; i < length; i++) {
8206        int o1, o2, o3, i2, i3;
8207        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8208        if (ch == 0xFFFE)
8209            /* unmapped character */
8210            continue;
8211        o1 = ch>>11;
8212        o2 = (ch>>7) & 0xF;
8213        i2 = 16*mlevel1[o1] + o2;
8214        if (mlevel2[i2] == 0xFF)
8215            mlevel2[i2] = count3++;
8216        o3 = ch & 0x7F;
8217        i3 = 128*mlevel2[i2] + o3;
8218        mlevel3[i3] = i;
8219    }
8220    return result;
8221}
8222
8223static int
8224encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8225{
8226    struct encoding_map *map = (struct encoding_map*)mapping;
8227    int l1 = c>>11;
8228    int l2 = (c>>7) & 0xF;
8229    int l3 = c & 0x7F;
8230    int i;
8231
8232    if (c > 0xFFFF)
8233        return -1;
8234    if (c == 0)
8235        return 0;
8236    /* level 1*/
8237    i = map->level1[l1];
8238    if (i == 0xFF) {
8239        return -1;
8240    }
8241    /* level 2*/
8242    i = map->level23[16*i+l2];
8243    if (i == 0xFF) {
8244        return -1;
8245    }
8246    /* level 3 */
8247    i = map->level23[16*map->count2 + 128*i + l3];
8248    if (i == 0) {
8249        return -1;
8250    }
8251    return i;
8252}
8253
8254/* Lookup the character ch in the mapping. If the character
8255   can't be found, Py_None is returned (or NULL, if another
8256   error occurred). */
8257static PyObject *
8258charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8259{
8260    PyObject *w = PyLong_FromLong((long)c);
8261    PyObject *x;
8262
8263    if (w == NULL)
8264        return NULL;
8265    x = PyObject_GetItem(mapping, w);
8266    Py_DECREF(w);
8267    if (x == NULL) {
8268        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8269            /* No mapping found means: mapping is undefined. */
8270            PyErr_Clear();
8271            x = Py_None;
8272            Py_INCREF(x);
8273            return x;
8274        } else
8275            return NULL;
8276    }
8277    else if (x == Py_None)
8278        return x;
8279    else if (PyLong_Check(x)) {
8280        long value = PyLong_AS_LONG(x);
8281        if (value < 0 || value > 255) {
8282            PyErr_SetString(PyExc_TypeError,
8283                            "character mapping must be in range(256)");
8284            Py_DECREF(x);
8285            return NULL;
8286        }
8287        return x;
8288    }
8289    else if (PyBytes_Check(x))
8290        return x;
8291    else {
8292        /* wrong return value */
8293        PyErr_Format(PyExc_TypeError,
8294                     "character mapping must return integer, bytes or None, not %.400s",
8295                     x->ob_type->tp_name);
8296        Py_DECREF(x);
8297        return NULL;
8298    }
8299}
8300
8301static int
8302charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8303{
8304    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8305    /* exponentially overallocate to minimize reallocations */
8306    if (requiredsize < 2*outsize)
8307        requiredsize = 2*outsize;
8308    if (_PyBytes_Resize(outobj, requiredsize))
8309        return -1;
8310    return 0;
8311}
8312
/* Outcome of charmapencode_output():
   enc_SUCCESS   - the character was encoded and written to the output;
   enc_FAILED    - the mapping has no entry for the character, nothing was
                   written;
   enc_EXCEPTION - the lookup or the output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
8316/* lookup the character, put the result in the output string and adjust
8317   various state variables. Resize the output bytes object if not enough
8318   space is available. Return a new reference to the object that
8319   was put in the output buffer, or Py_None, if the mapping was undefined
8320   (in which case no character was written) or NULL, if a
8321   reallocation error occurred. The caller must decref the result */
8322static charmapencode_result
8323charmapencode_output(Py_UCS4 c, PyObject *mapping,
8324                     PyObject **outobj, Py_ssize_t *outpos)
8325{
8326    PyObject *rep;
8327    char *outstart;
8328    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8329
8330    if (Py_TYPE(mapping) == &EncodingMapType) {
8331        int res = encoding_map_lookup(c, mapping);
8332        Py_ssize_t requiredsize = *outpos+1;
8333        if (res == -1)
8334            return enc_FAILED;
8335        if (outsize<requiredsize)
8336            if (charmapencode_resize(outobj, outpos, requiredsize))
8337                return enc_EXCEPTION;
8338        outstart = PyBytes_AS_STRING(*outobj);
8339        outstart[(*outpos)++] = (char)res;
8340        return enc_SUCCESS;
8341    }
8342
8343    rep = charmapencode_lookup(c, mapping);
8344    if (rep==NULL)
8345        return enc_EXCEPTION;
8346    else if (rep==Py_None) {
8347        Py_DECREF(rep);
8348        return enc_FAILED;
8349    } else {
8350        if (PyLong_Check(rep)) {
8351            Py_ssize_t requiredsize = *outpos+1;
8352            if (outsize<requiredsize)
8353                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8354                    Py_DECREF(rep);
8355                    return enc_EXCEPTION;
8356                }
8357            outstart = PyBytes_AS_STRING(*outobj);
8358            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8359        }
8360        else {
8361            const char *repchars = PyBytes_AS_STRING(rep);
8362            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8363            Py_ssize_t requiredsize = *outpos+repsize;
8364            if (outsize<requiredsize)
8365                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8366                    Py_DECREF(rep);
8367                    return enc_EXCEPTION;
8368                }
8369            outstart = PyBytes_AS_STRING(*outobj);
8370            memcpy(outstart + *outpos, repchars, repsize);
8371            *outpos += repsize;
8372        }
8373    }
8374    Py_DECREF(rep);
8375    return enc_SUCCESS;
8376}
8377
8378/* handle an error in PyUnicode_EncodeCharmap
8379   Return 0 on success, -1 on error */
8380static int
8381charmap_encoding_error(
8382    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8383    PyObject **exceptionObject,
8384    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8385    PyObject **res, Py_ssize_t *respos)
8386{
8387    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8388    Py_ssize_t size, repsize;
8389    Py_ssize_t newpos;
8390    enum PyUnicode_Kind kind;
8391    void *data;
8392    Py_ssize_t index;
8393    /* startpos for collecting unencodable chars */
8394    Py_ssize_t collstartpos = *inpos;
8395    Py_ssize_t collendpos = *inpos+1;
8396    Py_ssize_t collpos;
8397    char *encoding = "charmap";
8398    char *reason = "character maps to <undefined>";
8399    charmapencode_result x;
8400    Py_UCS4 ch;
8401    int val;
8402
8403    if (PyUnicode_READY(unicode) == -1)
8404        return -1;
8405    size = PyUnicode_GET_LENGTH(unicode);
8406    /* find all unencodable characters */
8407    while (collendpos < size) {
8408        PyObject *rep;
8409        if (Py_TYPE(mapping) == &EncodingMapType) {
8410            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8411            val = encoding_map_lookup(ch, mapping);
8412            if (val != -1)
8413                break;
8414            ++collendpos;
8415            continue;
8416        }
8417
8418        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8419        rep = charmapencode_lookup(ch, mapping);
8420        if (rep==NULL)
8421            return -1;
8422        else if (rep!=Py_None) {
8423            Py_DECREF(rep);
8424            break;
8425        }
8426        Py_DECREF(rep);
8427        ++collendpos;
8428    }
8429    /* cache callback name lookup
8430     * (if not done yet, i.e. it's the first error) */
8431    if (*error_handler == _Py_ERROR_UNKNOWN)
8432        *error_handler = get_error_handler(errors);
8433
8434    switch (*error_handler) {
8435    case _Py_ERROR_STRICT:
8436        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8437        return -1;
8438
8439    case _Py_ERROR_REPLACE:
8440        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8441            x = charmapencode_output('?', mapping, res, respos);
8442            if (x==enc_EXCEPTION) {
8443                return -1;
8444            }
8445            else if (x==enc_FAILED) {
8446                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8447                return -1;
8448            }
8449        }
8450        /* fall through */
8451    case _Py_ERROR_IGNORE:
8452        *inpos = collendpos;
8453        break;
8454
8455    case _Py_ERROR_XMLCHARREFREPLACE:
8456        /* generate replacement (temporarily (mis)uses p) */
8457        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8458            char buffer[2+29+1+1];
8459            char *cp;
8460            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8461            for (cp = buffer; *cp; ++cp) {
8462                x = charmapencode_output(*cp, mapping, res, respos);
8463                if (x==enc_EXCEPTION)
8464                    return -1;
8465                else if (x==enc_FAILED) {
8466                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8467                    return -1;
8468                }
8469            }
8470        }
8471        *inpos = collendpos;
8472        break;
8473
8474    default:
8475        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8476                                                      encoding, reason, unicode, exceptionObject,
8477                                                      collstartpos, collendpos, &newpos);
8478        if (repunicode == NULL)
8479            return -1;
8480        if (PyBytes_Check(repunicode)) {
8481            /* Directly copy bytes result to output. */
8482            Py_ssize_t outsize = PyBytes_Size(*res);
8483            Py_ssize_t requiredsize;
8484            repsize = PyBytes_Size(repunicode);
8485            requiredsize = *respos + repsize;
8486            if (requiredsize > outsize)
8487                /* Make room for all additional bytes. */
8488                if (charmapencode_resize(res, respos, requiredsize)) {
8489                    Py_DECREF(repunicode);
8490                    return -1;
8491                }
8492            memcpy(PyBytes_AsString(*res) + *respos,
8493                   PyBytes_AsString(repunicode),  repsize);
8494            *respos += repsize;
8495            *inpos = newpos;
8496            Py_DECREF(repunicode);
8497            break;
8498        }
8499        /* generate replacement  */
8500        if (PyUnicode_READY(repunicode) == -1) {
8501            Py_DECREF(repunicode);
8502            return -1;
8503        }
8504        repsize = PyUnicode_GET_LENGTH(repunicode);
8505        data = PyUnicode_DATA(repunicode);
8506        kind = PyUnicode_KIND(repunicode);
8507        for (index = 0; index < repsize; index++) {
8508            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8509            x = charmapencode_output(repch, mapping, res, respos);
8510            if (x==enc_EXCEPTION) {
8511                Py_DECREF(repunicode);
8512                return -1;
8513            }
8514            else if (x==enc_FAILED) {
8515                Py_DECREF(repunicode);
8516                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8517                return -1;
8518            }
8519        }
8520        *inpos = newpos;
8521        Py_DECREF(repunicode);
8522    }
8523    return 0;
8524}
8525
8526PyObject *
8527_PyUnicode_EncodeCharmap(PyObject *unicode,
8528                         PyObject *mapping,
8529                         const char *errors)
8530{
8531    /* output object */
8532    PyObject *res = NULL;
8533    /* current input position */
8534    Py_ssize_t inpos = 0;
8535    Py_ssize_t size;
8536    /* current output position */
8537    Py_ssize_t respos = 0;
8538    PyObject *error_handler_obj = NULL;
8539    PyObject *exc = NULL;
8540    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8541    void *data;
8542    int kind;
8543
8544    if (PyUnicode_READY(unicode) == -1)
8545        return NULL;
8546    size = PyUnicode_GET_LENGTH(unicode);
8547    data = PyUnicode_DATA(unicode);
8548    kind = PyUnicode_KIND(unicode);
8549
8550    /* Default to Latin-1 */
8551    if (mapping == NULL)
8552        return unicode_encode_ucs1(unicode, errors, 256);
8553
8554    /* allocate enough for a simple encoding without
8555       replacements, if we need more, we'll resize */
8556    res = PyBytes_FromStringAndSize(NULL, size);
8557    if (res == NULL)
8558        goto onError;
8559    if (size == 0)
8560        return res;
8561
8562    while (inpos<size) {
8563        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8564        /* try to encode it */
8565        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8566        if (x==enc_EXCEPTION) /* error */
8567            goto onError;
8568        if (x==enc_FAILED) { /* unencodable character */
8569            if (charmap_encoding_error(unicode, &inpos, mapping,
8570                                       &exc,
8571                                       &error_handler, &error_handler_obj, errors,
8572                                       &res, &respos)) {
8573                goto onError;
8574            }
8575        }
8576        else
8577            /* done with this character => adjust input position */
8578            ++inpos;
8579    }
8580
8581    /* Resize if we allocated to much */
8582    if (respos<PyBytes_GET_SIZE(res))
8583        if (_PyBytes_Resize(&res, respos) < 0)
8584            goto onError;
8585
8586    Py_XDECREF(exc);
8587    Py_XDECREF(error_handler_obj);
8588    return res;
8589
8590  onError:
8591    Py_XDECREF(res);
8592    Py_XDECREF(exc);
8593    Py_XDECREF(error_handler_obj);
8594    return NULL;
8595}
8596
8597/* Deprecated */
8598PyObject *
8599PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8600                        Py_ssize_t size,
8601                        PyObject *mapping,
8602                        const char *errors)
8603{
8604    PyObject *result;
8605    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8606    if (unicode == NULL)
8607        return NULL;
8608    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8609    Py_DECREF(unicode);
8610    return result;
8611}
8612
8613PyObject *
8614PyUnicode_AsCharmapString(PyObject *unicode,
8615                          PyObject *mapping)
8616{
8617    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8618        PyErr_BadArgument();
8619        return NULL;
8620    }
8621    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8622}
8623
8624/* create or adjust a UnicodeTranslateError */
8625static void
8626make_translate_exception(PyObject **exceptionObject,
8627                         PyObject *unicode,
8628                         Py_ssize_t startpos, Py_ssize_t endpos,
8629                         const char *reason)
8630{
8631    if (*exceptionObject == NULL) {
8632        *exceptionObject = _PyUnicodeTranslateError_Create(
8633            unicode, startpos, endpos, reason);
8634    }
8635    else {
8636        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8637            goto onError;
8638        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8639            goto onError;
8640        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8641            goto onError;
8642        return;
8643      onError:
8644        Py_CLEAR(*exceptionObject);
8645    }
8646}
8647
8648/* error handling callback helper:
8649   build arguments, call the callback and check the arguments,
8650   put the result into newpos and return the replacement string, which
8651   has to be freed by the caller */
8652static PyObject *
8653unicode_translate_call_errorhandler(const char *errors,
8654                                    PyObject **errorHandler,
8655                                    const char *reason,
8656                                    PyObject *unicode, PyObject **exceptionObject,
8657                                    Py_ssize_t startpos, Py_ssize_t endpos,
8658                                    Py_ssize_t *newpos)
8659{
8660    static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
8661
8662    Py_ssize_t i_newpos;
8663    PyObject *restuple;
8664    PyObject *resunicode;
8665
8666    if (*errorHandler == NULL) {
8667        *errorHandler = PyCodec_LookupError(errors);
8668        if (*errorHandler == NULL)
8669            return NULL;
8670    }
8671
8672    make_translate_exception(exceptionObject,
8673                             unicode, startpos, endpos, reason);
8674    if (*exceptionObject == NULL)
8675        return NULL;
8676
8677    restuple = PyObject_CallFunctionObjArgs(
8678        *errorHandler, *exceptionObject, NULL);
8679    if (restuple == NULL)
8680        return NULL;
8681    if (!PyTuple_Check(restuple)) {
8682        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8683        Py_DECREF(restuple);
8684        return NULL;
8685    }
8686    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8687                          &resunicode, &i_newpos)) {
8688        Py_DECREF(restuple);
8689        return NULL;
8690    }
8691    if (i_newpos<0)
8692        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8693    else
8694        *newpos = i_newpos;
8695    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8696        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8697        Py_DECREF(restuple);
8698        return NULL;
8699    }
8700    Py_INCREF(resunicode);
8701    Py_DECREF(restuple);
8702    return resunicode;
8703}
8704
8705/* Lookup the character ch in the mapping and put the result in result,
8706   which must be decrefed by the caller.
8707   Return 0 on success, -1 on error */
8708static int
8709charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8710{
8711    PyObject *w = PyLong_FromLong((long)c);
8712    PyObject *x;
8713
8714    if (w == NULL)
8715        return -1;
8716    x = PyObject_GetItem(mapping, w);
8717    Py_DECREF(w);
8718    if (x == NULL) {
8719        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8720            /* No mapping found means: use 1:1 mapping. */
8721            PyErr_Clear();
8722            *result = NULL;
8723            return 0;
8724        } else
8725            return -1;
8726    }
8727    else if (x == Py_None) {
8728        *result = x;
8729        return 0;
8730    }
8731    else if (PyLong_Check(x)) {
8732        long value = PyLong_AS_LONG(x);
8733        if (value < 0 || value > MAX_UNICODE) {
8734            PyErr_Format(PyExc_ValueError,
8735                         "character mapping must be in range(0x%x)",
8736                         MAX_UNICODE+1);
8737            Py_DECREF(x);
8738            return -1;
8739        }
8740        *result = x;
8741        return 0;
8742    }
8743    else if (PyUnicode_Check(x)) {
8744        *result = x;
8745        return 0;
8746    }
8747    else {
8748        /* wrong return value */
8749        PyErr_SetString(PyExc_TypeError,
8750                        "character mapping must return integer, None or str");
8751        Py_DECREF(x);
8752        return -1;
8753    }
8754}
8755
8756/* lookup the character, write the result into the writer.
8757   Return 1 if the result was written into the writer, return 0 if the mapping
8758   was undefined, raise an exception return -1 on error. */
8759static int
8760charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8761                        _PyUnicodeWriter *writer)
8762{
8763    PyObject *item;
8764
8765    if (charmaptranslate_lookup(ch, mapping, &item))
8766        return -1;
8767
8768    if (item == NULL) {
8769        /* not found => default to 1:1 mapping */
8770        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8771            return -1;
8772        }
8773        return 1;
8774    }
8775
8776    if (item == Py_None) {
8777        Py_DECREF(item);
8778        return 0;
8779    }
8780
8781    if (PyLong_Check(item)) {
8782        long ch = (Py_UCS4)PyLong_AS_LONG(item);
8783        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8784           used it */
8785        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8786            Py_DECREF(item);
8787            return -1;
8788        }
8789        Py_DECREF(item);
8790        return 1;
8791    }
8792
8793    if (!PyUnicode_Check(item)) {
8794        Py_DECREF(item);
8795        return -1;
8796    }
8797
8798    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8799        Py_DECREF(item);
8800        return -1;
8801    }
8802
8803    Py_DECREF(item);
8804    return 1;
8805}
8806
8807static int
8808unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8809                              Py_UCS1 *translate)
8810{
8811    PyObject *item = NULL;
8812    int ret = 0;
8813
8814    if (charmaptranslate_lookup(ch, mapping, &item)) {
8815        return -1;
8816    }
8817
8818    if (item == Py_None) {
8819        /* deletion */
8820        translate[ch] = 0xfe;
8821    }
8822    else if (item == NULL) {
8823        /* not found => default to 1:1 mapping */
8824        translate[ch] = ch;
8825        return 1;
8826    }
8827    else if (PyLong_Check(item)) {
8828        long replace = PyLong_AS_LONG(item);
8829        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8830           used it */
8831        if (127 < replace) {
8832            /* invalid character or character outside ASCII:
8833               skip the fast translate */
8834            goto exit;
8835        }
8836        translate[ch] = (Py_UCS1)replace;
8837    }
8838    else if (PyUnicode_Check(item)) {
8839        Py_UCS4 replace;
8840
8841        if (PyUnicode_READY(item) == -1) {
8842            Py_DECREF(item);
8843            return -1;
8844        }
8845        if (PyUnicode_GET_LENGTH(item) != 1)
8846            goto exit;
8847
8848        replace = PyUnicode_READ_CHAR(item, 0);
8849        if (replace > 127)
8850            goto exit;
8851        translate[ch] = (Py_UCS1)replace;
8852    }
8853    else {
8854        /* not None, NULL, long or unicode */
8855        goto exit;
8856    }
8857    ret = 1;
8858
8859  exit:
8860    Py_DECREF(item);
8861    return ret;
8862}
8863
8864/* Fast path for ascii => ascii translation. Return 1 if the whole string
8865   was translated into writer, return 0 if the input string was partially
8866   translated into writer, raise an exception and return -1 on error. */
8867static int
8868unicode_fast_translate(PyObject *input, PyObject *mapping,
8869                       _PyUnicodeWriter *writer, int ignore,
8870                       Py_ssize_t *input_pos)
8871{
8872    Py_UCS1 ascii_table[128], ch, ch2;
8873    Py_ssize_t len;
8874    Py_UCS1 *in, *end, *out;
8875    int res = 0;
8876
8877    len = PyUnicode_GET_LENGTH(input);
8878
8879    memset(ascii_table, 0xff, 128);
8880
8881    in = PyUnicode_1BYTE_DATA(input);
8882    end = in + len;
8883
8884    assert(PyUnicode_IS_ASCII(writer->buffer));
8885    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8886    out = PyUnicode_1BYTE_DATA(writer->buffer);
8887
8888    for (; in < end; in++) {
8889        ch = *in;
8890        ch2 = ascii_table[ch];
8891        if (ch2 == 0xff) {
8892            int translate = unicode_fast_translate_lookup(mapping, ch,
8893                                                          ascii_table);
8894            if (translate < 0)
8895                return -1;
8896            if (translate == 0)
8897                goto exit;
8898            ch2 = ascii_table[ch];
8899        }
8900        if (ch2 == 0xfe) {
8901            if (ignore)
8902                continue;
8903            goto exit;
8904        }
8905        assert(ch2 < 128);
8906        *out = ch2;
8907        out++;
8908    }
8909    res = 1;
8910
8911exit:
8912    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8913    *input_pos = in - PyUnicode_1BYTE_DATA(input);
8914    return res;
8915}
8916
8917static PyObject *
8918_PyUnicode_TranslateCharmap(PyObject *input,
8919                            PyObject *mapping,
8920                            const char *errors)
8921{
8922    /* input object */
8923    char *data;
8924    Py_ssize_t size, i;
8925    int kind;
8926    /* output buffer */
8927    _PyUnicodeWriter writer;
8928    /* error handler */
8929    char *reason = "character maps to <undefined>";
8930    PyObject *errorHandler = NULL;
8931    PyObject *exc = NULL;
8932    int ignore;
8933    int res;
8934
8935    if (mapping == NULL) {
8936        PyErr_BadArgument();
8937        return NULL;
8938    }
8939
8940    if (PyUnicode_READY(input) == -1)
8941        return NULL;
8942    data = (char*)PyUnicode_DATA(input);
8943    kind = PyUnicode_KIND(input);
8944    size = PyUnicode_GET_LENGTH(input);
8945
8946    if (size == 0)
8947        return PyUnicode_FromObject(input);
8948
8949    /* allocate enough for a simple 1:1 translation without
8950       replacements, if we need more, we'll resize */
8951    _PyUnicodeWriter_Init(&writer);
8952    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8953        goto onError;
8954
8955    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8956
8957    if (PyUnicode_READY(input) == -1)
8958        return NULL;
8959    if (PyUnicode_IS_ASCII(input)) {
8960        res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8961        if (res < 0) {
8962            _PyUnicodeWriter_Dealloc(&writer);
8963            return NULL;
8964        }
8965        if (res == 1)
8966            return _PyUnicodeWriter_Finish(&writer);
8967    }
8968    else {
8969        i = 0;
8970    }
8971
8972    while (i<size) {
8973        /* try to encode it */
8974        int translate;
8975        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8976        Py_ssize_t newpos;
8977        /* startpos for collecting untranslatable chars */
8978        Py_ssize_t collstart;
8979        Py_ssize_t collend;
8980        Py_UCS4 ch;
8981
8982        ch = PyUnicode_READ(kind, data, i);
8983        translate = charmaptranslate_output(ch, mapping, &writer);
8984        if (translate < 0)
8985            goto onError;
8986
8987        if (translate != 0) {
8988            /* it worked => adjust input pointer */
8989            ++i;
8990            continue;
8991        }
8992
8993        /* untranslatable character */
8994        collstart = i;
8995        collend = i+1;
8996
8997        /* find all untranslatable characters */
8998        while (collend < size) {
8999            PyObject *x;
9000            ch = PyUnicode_READ(kind, data, collend);
9001            if (charmaptranslate_lookup(ch, mapping, &x))
9002                goto onError;
9003            Py_XDECREF(x);
9004            if (x != Py_None)
9005                break;
9006            ++collend;
9007        }
9008
9009        if (ignore) {
9010            i = collend;
9011        }
9012        else {
9013            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9014                                                             reason, input, &exc,
9015                                                             collstart, collend, &newpos);
9016            if (repunicode == NULL)
9017                goto onError;
9018            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9019                Py_DECREF(repunicode);
9020                goto onError;
9021            }
9022            Py_DECREF(repunicode);
9023            i = newpos;
9024        }
9025    }
9026    Py_XDECREF(exc);
9027    Py_XDECREF(errorHandler);
9028    return _PyUnicodeWriter_Finish(&writer);
9029
9030  onError:
9031    _PyUnicodeWriter_Dealloc(&writer);
9032    Py_XDECREF(exc);
9033    Py_XDECREF(errorHandler);
9034    return NULL;
9035}
9036
9037/* Deprecated. Use PyUnicode_Translate instead. */
9038PyObject *
9039PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9040                           Py_ssize_t size,
9041                           PyObject *mapping,
9042                           const char *errors)
9043{
9044    PyObject *result;
9045    PyObject *unicode = PyUnicode_FromUnicode(p, size);
9046    if (!unicode)
9047        return NULL;
9048    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9049    Py_DECREF(unicode);
9050    return result;
9051}
9052
9053PyObject *
9054PyUnicode_Translate(PyObject *str,
9055                    PyObject *mapping,
9056                    const char *errors)
9057{
9058    if (ensure_unicode(str) < 0)
9059        return NULL;
9060    return _PyUnicode_TranslateCharmap(str, mapping, errors);
9061}
9062
9063static Py_UCS4
9064fix_decimal_and_space_to_ascii(PyObject *self)
9065{
9066    /* No need to call PyUnicode_READY(self) because this function is only
9067       called as a callback from fixup() which does it already. */
9068    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9069    const int kind = PyUnicode_KIND(self);
9070    void *data = PyUnicode_DATA(self);
9071    Py_UCS4 maxchar = 127, ch, fixed;
9072    int modified = 0;
9073    Py_ssize_t i;
9074
9075    for (i = 0; i < len; ++i) {
9076        ch = PyUnicode_READ(kind, data, i);
9077        fixed = 0;
9078        if (ch > 127) {
9079            if (Py_UNICODE_ISSPACE(ch))
9080                fixed = ' ';
9081            else {
9082                const int decimal = Py_UNICODE_TODECIMAL(ch);
9083                if (decimal >= 0)
9084                    fixed = '0' + decimal;
9085            }
9086            if (fixed != 0) {
9087                modified = 1;
9088                maxchar = Py_MAX(maxchar, fixed);
9089                PyUnicode_WRITE(kind, data, i, fixed);
9090            }
9091            else
9092                maxchar = Py_MAX(maxchar, ch);
9093        }
9094    }
9095
9096    return (modified) ? maxchar : 0;
9097}
9098
9099PyObject *
9100_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9101{
9102    if (!PyUnicode_Check(unicode)) {
9103        PyErr_BadInternalCall();
9104        return NULL;
9105    }
9106    if (PyUnicode_READY(unicode) == -1)
9107        return NULL;
9108    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9109        /* If the string is already ASCII, just return the same string */
9110        Py_INCREF(unicode);
9111        return unicode;
9112    }
9113    return fixup(unicode, fix_decimal_and_space_to_ascii);
9114}
9115
9116PyObject *
9117PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9118                                  Py_ssize_t length)
9119{
9120    PyObject *decimal;
9121    Py_ssize_t i;
9122    Py_UCS4 maxchar;
9123    enum PyUnicode_Kind kind;
9124    void *data;
9125
9126    maxchar = 127;
9127    for (i = 0; i < length; i++) {
9128        Py_UCS4 ch = s[i];
9129        if (ch > 127) {
9130            int decimal = Py_UNICODE_TODECIMAL(ch);
9131            if (decimal >= 0)
9132                ch = '0' + decimal;
9133            maxchar = Py_MAX(maxchar, ch);
9134        }
9135    }
9136
9137    /* Copy to a new string */
9138    decimal = PyUnicode_New(length, maxchar);
9139    if (decimal == NULL)
9140        return decimal;
9141    kind = PyUnicode_KIND(decimal);
9142    data = PyUnicode_DATA(decimal);
9143    /* Iterate over code points */
9144    for (i = 0; i < length; i++) {
9145        Py_UCS4 ch = s[i];
9146        if (ch > 127) {
9147            int decimal = Py_UNICODE_TODECIMAL(ch);
9148            if (decimal >= 0)
9149                ch = '0' + decimal;
9150        }
9151        PyUnicode_WRITE(kind, data, i, ch);
9152    }
9153    return unicode_result(decimal);
9154}
9155/* --- Decimal Encoder ---------------------------------------------------- */
9156
9157int
9158PyUnicode_EncodeDecimal(Py_UNICODE *s,
9159                        Py_ssize_t length,
9160                        char *output,
9161                        const char *errors)
9162{
9163    PyObject *unicode;
9164    Py_ssize_t i;
9165    enum PyUnicode_Kind kind;
9166    void *data;
9167
9168    if (output == NULL) {
9169        PyErr_BadArgument();
9170        return -1;
9171    }
9172
9173    unicode = PyUnicode_FromUnicode(s, length);
9174    if (unicode == NULL)
9175        return -1;
9176
9177    if (PyUnicode_READY(unicode) == -1) {
9178        Py_DECREF(unicode);
9179        return -1;
9180    }
9181    kind = PyUnicode_KIND(unicode);
9182    data = PyUnicode_DATA(unicode);
9183
9184    for (i=0; i < length; ) {
9185        PyObject *exc;
9186        Py_UCS4 ch;
9187        int decimal;
9188        Py_ssize_t startpos;
9189
9190        ch = PyUnicode_READ(kind, data, i);
9191
9192        if (Py_UNICODE_ISSPACE(ch)) {
9193            *output++ = ' ';
9194            i++;
9195            continue;
9196        }
9197        decimal = Py_UNICODE_TODECIMAL(ch);
9198        if (decimal >= 0) {
9199            *output++ = '0' + decimal;
9200            i++;
9201            continue;
9202        }
9203        if (0 < ch && ch < 256) {
9204            *output++ = (char)ch;
9205            i++;
9206            continue;
9207        }
9208
9209        startpos = i;
9210        exc = NULL;
9211        raise_encode_exception(&exc, "decimal", unicode,
9212                               startpos, startpos+1,
9213                               "invalid decimal Unicode string");
9214        Py_XDECREF(exc);
9215        Py_DECREF(unicode);
9216        return -1;
9217    }
9218    /* 0-terminate the output string */
9219    *output++ = '\0';
9220    Py_DECREF(unicode);
9221    return 0;
9222}
9223
9224/* --- Helpers ------------------------------------------------------------ */
9225
/* Helper macro to fix up start/end slice values against a sequence of
   length len: end is clamped to len, negative start/end are taken relative
   to len, and anything still negative becomes 0.

   start and end must be modifiable lvalues (they are assigned to) and all
   three arguments are evaluated more than once.  The body is wrapped in
   do { } while (0) so the macro expands to exactly one statement and stays
   correct inside an unbraced if/else; uses must end with a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9240
9241static Py_ssize_t
9242any_find_slice(PyObject* s1, PyObject* s2,
9243               Py_ssize_t start,
9244               Py_ssize_t end,
9245               int direction)
9246{
9247    int kind1, kind2;
9248    void *buf1, *buf2;
9249    Py_ssize_t len1, len2, result;
9250
9251    kind1 = PyUnicode_KIND(s1);
9252    kind2 = PyUnicode_KIND(s2);
9253    if (kind1 < kind2)
9254        return -1;
9255
9256    len1 = PyUnicode_GET_LENGTH(s1);
9257    len2 = PyUnicode_GET_LENGTH(s2);
9258    ADJUST_INDICES(start, end, len1);
9259    if (end - start < len2)
9260        return -1;
9261
9262    buf1 = PyUnicode_DATA(s1);
9263    buf2 = PyUnicode_DATA(s2);
9264    if (len2 == 1) {
9265        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9266        result = findchar((const char *)buf1 + kind1*start,
9267                          kind1, end - start, ch, direction);
9268        if (result == -1)
9269            return -1;
9270        else
9271            return start + result;
9272    }
9273
9274    if (kind2 != kind1) {
9275        buf2 = _PyUnicode_AsKind(s2, kind1);
9276        if (!buf2)
9277            return -2;
9278    }
9279
9280    if (direction > 0) {
9281        switch (kind1) {
9282        case PyUnicode_1BYTE_KIND:
9283            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9284                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9285            else
9286                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9287            break;
9288        case PyUnicode_2BYTE_KIND:
9289            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9290            break;
9291        case PyUnicode_4BYTE_KIND:
9292            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9293            break;
9294        default:
9295            assert(0); result = -2;
9296        }
9297    }
9298    else {
9299        switch (kind1) {
9300        case PyUnicode_1BYTE_KIND:
9301            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9302                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303            else
9304                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9305            break;
9306        case PyUnicode_2BYTE_KIND:
9307            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9308            break;
9309        case PyUnicode_4BYTE_KIND:
9310            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9311            break;
9312        default:
9313            assert(0); result = -2;
9314        }
9315    }
9316
9317    if (kind2 != kind1)
9318        PyMem_Free(buf2);
9319
9320    return result;
9321}
9322
9323Py_ssize_t
9324_PyUnicode_InsertThousandsGrouping(
9325    PyObject *unicode, Py_ssize_t index,
9326    Py_ssize_t n_buffer,
9327    void *digits, Py_ssize_t n_digits,
9328    Py_ssize_t min_width,
9329    const char *grouping, PyObject *thousands_sep,
9330    Py_UCS4 *maxchar)
9331{
9332    unsigned int kind, thousands_sep_kind;
9333    char *data, *thousands_sep_data;
9334    Py_ssize_t thousands_sep_len;
9335    Py_ssize_t len;
9336
9337    if (unicode != NULL) {
9338        kind = PyUnicode_KIND(unicode);
9339        data = (char *) PyUnicode_DATA(unicode) + index * kind;
9340    }
9341    else {
9342        kind = PyUnicode_1BYTE_KIND;
9343        data = NULL;
9344    }
9345    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9346    thousands_sep_data = PyUnicode_DATA(thousands_sep);
9347    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9348    if (unicode != NULL && thousands_sep_kind != kind) {
9349        if (thousands_sep_kind < kind) {
9350            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9351            if (!thousands_sep_data)
9352                return -1;
9353        }
9354        else {
9355            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9356            if (!data)
9357                return -1;
9358        }
9359    }
9360
9361    switch (kind) {
9362    case PyUnicode_1BYTE_KIND:
9363        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9364            len = asciilib_InsertThousandsGrouping(
9365                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
9366                min_width, grouping,
9367                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9368        else
9369            len = ucs1lib_InsertThousandsGrouping(
9370                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9371                min_width, grouping,
9372                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9373        break;
9374    case PyUnicode_2BYTE_KIND:
9375        len = ucs2lib_InsertThousandsGrouping(
9376            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9377            min_width, grouping,
9378            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9379        break;
9380    case PyUnicode_4BYTE_KIND:
9381        len = ucs4lib_InsertThousandsGrouping(
9382            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9383            min_width, grouping,
9384            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9385        break;
9386    default:
9387        assert(0);
9388        return -1;
9389    }
9390    if (unicode != NULL && thousands_sep_kind != kind) {
9391        if (thousands_sep_kind < kind)
9392            PyMem_Free(thousands_sep_data);
9393        else
9394            PyMem_Free(data);
9395    }
9396    if (unicode == NULL) {
9397        *maxchar = 127;
9398        if (len != n_digits) {
9399            *maxchar = Py_MAX(*maxchar,
9400                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9401        }
9402    }
9403    return len;
9404}
9405
9406
9407Py_ssize_t
9408PyUnicode_Count(PyObject *str,
9409                PyObject *substr,
9410                Py_ssize_t start,
9411                Py_ssize_t end)
9412{
9413    Py_ssize_t result;
9414    int kind1, kind2;
9415    void *buf1 = NULL, *buf2 = NULL;
9416    Py_ssize_t len1, len2;
9417
9418    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9419        return -1;
9420
9421    kind1 = PyUnicode_KIND(str);
9422    kind2 = PyUnicode_KIND(substr);
9423    if (kind1 < kind2)
9424        return 0;
9425
9426    len1 = PyUnicode_GET_LENGTH(str);
9427    len2 = PyUnicode_GET_LENGTH(substr);
9428    ADJUST_INDICES(start, end, len1);
9429    if (end - start < len2)
9430        return 0;
9431
9432    buf1 = PyUnicode_DATA(str);
9433    buf2 = PyUnicode_DATA(substr);
9434    if (kind2 != kind1) {
9435        buf2 = _PyUnicode_AsKind(substr, kind1);
9436        if (!buf2)
9437            goto onError;
9438    }
9439
9440    switch (kind1) {
9441    case PyUnicode_1BYTE_KIND:
9442        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9443            result = asciilib_count(
9444                ((Py_UCS1*)buf1) + start, end - start,
9445                buf2, len2, PY_SSIZE_T_MAX
9446                );
9447        else
9448            result = ucs1lib_count(
9449                ((Py_UCS1*)buf1) + start, end - start,
9450                buf2, len2, PY_SSIZE_T_MAX
9451                );
9452        break;
9453    case PyUnicode_2BYTE_KIND:
9454        result = ucs2lib_count(
9455            ((Py_UCS2*)buf1) + start, end - start,
9456            buf2, len2, PY_SSIZE_T_MAX
9457            );
9458        break;
9459    case PyUnicode_4BYTE_KIND:
9460        result = ucs4lib_count(
9461            ((Py_UCS4*)buf1) + start, end - start,
9462            buf2, len2, PY_SSIZE_T_MAX
9463            );
9464        break;
9465    default:
9466        assert(0); result = 0;
9467    }
9468
9469    if (kind2 != kind1)
9470        PyMem_Free(buf2);
9471
9472    return result;
9473  onError:
9474    if (kind2 != kind1 && buf2)
9475        PyMem_Free(buf2);
9476    return -1;
9477}
9478
9479Py_ssize_t
9480PyUnicode_Find(PyObject *str,
9481               PyObject *substr,
9482               Py_ssize_t start,
9483               Py_ssize_t end,
9484               int direction)
9485{
9486    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9487        return -2;
9488
9489    return any_find_slice(str, substr, start, end, direction);
9490}
9491
9492Py_ssize_t
9493PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9494                   Py_ssize_t start, Py_ssize_t end,
9495                   int direction)
9496{
9497    int kind;
9498    Py_ssize_t result;
9499    if (PyUnicode_READY(str) == -1)
9500        return -2;
9501    if (start < 0 || end < 0) {
9502        PyErr_SetString(PyExc_IndexError, "string index out of range");
9503        return -2;
9504    }
9505    if (end > PyUnicode_GET_LENGTH(str))
9506        end = PyUnicode_GET_LENGTH(str);
9507    if (start >= end)
9508        return -1;
9509    kind = PyUnicode_KIND(str);
9510    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9511                      kind, end-start, ch, direction);
9512    if (result == -1)
9513        return -1;
9514    else
9515        return start + result;
9516}
9517
9518static int
9519tailmatch(PyObject *self,
9520          PyObject *substring,
9521          Py_ssize_t start,
9522          Py_ssize_t end,
9523          int direction)
9524{
9525    int kind_self;
9526    int kind_sub;
9527    void *data_self;
9528    void *data_sub;
9529    Py_ssize_t offset;
9530    Py_ssize_t i;
9531    Py_ssize_t end_sub;
9532
9533    if (PyUnicode_READY(self) == -1 ||
9534        PyUnicode_READY(substring) == -1)
9535        return -1;
9536
9537    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9538    end -= PyUnicode_GET_LENGTH(substring);
9539    if (end < start)
9540        return 0;
9541
9542    if (PyUnicode_GET_LENGTH(substring) == 0)
9543        return 1;
9544
9545    kind_self = PyUnicode_KIND(self);
9546    data_self = PyUnicode_DATA(self);
9547    kind_sub = PyUnicode_KIND(substring);
9548    data_sub = PyUnicode_DATA(substring);
9549    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9550
9551    if (direction > 0)
9552        offset = end;
9553    else
9554        offset = start;
9555
9556    if (PyUnicode_READ(kind_self, data_self, offset) ==
9557        PyUnicode_READ(kind_sub, data_sub, 0) &&
9558        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9559        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9560        /* If both are of the same kind, memcmp is sufficient */
9561        if (kind_self == kind_sub) {
9562            return ! memcmp((char *)data_self +
9563                                (offset * PyUnicode_KIND(substring)),
9564                            data_sub,
9565                            PyUnicode_GET_LENGTH(substring) *
9566                                PyUnicode_KIND(substring));
9567        }
9568        /* otherwise we have to compare each character by first accessing it */
9569        else {
9570            /* We do not need to compare 0 and len(substring)-1 because
9571               the if statement above ensured already that they are equal
9572               when we end up here. */
9573            for (i = 1; i < end_sub; ++i) {
9574                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9575                    PyUnicode_READ(kind_sub, data_sub, i))
9576                    return 0;
9577            }
9578            return 1;
9579        }
9580    }
9581
9582    return 0;
9583}
9584
9585Py_ssize_t
9586PyUnicode_Tailmatch(PyObject *str,
9587                    PyObject *substr,
9588                    Py_ssize_t start,
9589                    Py_ssize_t end,
9590                    int direction)
9591{
9592    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9593        return -1;
9594
9595    return tailmatch(str, substr, start, end, direction);
9596}
9597
9598/* Apply fixfct filter to the Unicode object self and return a
9599   reference to the modified object */
9600
9601static PyObject *
9602fixup(PyObject *self,
9603      Py_UCS4 (*fixfct)(PyObject *s))
9604{
9605    PyObject *u;
9606    Py_UCS4 maxchar_old, maxchar_new = 0;
9607    PyObject *v;
9608
9609    u = _PyUnicode_Copy(self);
9610    if (u == NULL)
9611        return NULL;
9612    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9613
9614    /* fix functions return the new maximum character in a string,
9615       if the kind of the resulting unicode object does not change,
9616       everything is fine.  Otherwise we need to change the string kind
9617       and re-run the fix function. */
9618    maxchar_new = fixfct(u);
9619
9620    if (maxchar_new == 0) {
9621        /* no changes */;
9622        if (PyUnicode_CheckExact(self)) {
9623            Py_DECREF(u);
9624            Py_INCREF(self);
9625            return self;
9626        }
9627        else
9628            return u;
9629    }
9630
9631    maxchar_new = align_maxchar(maxchar_new);
9632
9633    if (maxchar_new == maxchar_old)
9634        return u;
9635
9636    /* In case the maximum character changed, we need to
9637       convert the string to the new category. */
9638    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9639    if (v == NULL) {
9640        Py_DECREF(u);
9641        return NULL;
9642    }
9643    if (maxchar_new > maxchar_old) {
9644        /* If the maxchar increased so that the kind changed, not all
9645           characters are representable anymore and we need to fix the
9646           string again. This only happens in very few cases. */
9647        _PyUnicode_FastCopyCharacters(v, 0,
9648                                      self, 0, PyUnicode_GET_LENGTH(self));
9649        maxchar_old = fixfct(v);
9650        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9651    }
9652    else {
9653        _PyUnicode_FastCopyCharacters(v, 0,
9654                                      u, 0, PyUnicode_GET_LENGTH(self));
9655    }
9656    Py_DECREF(u);
9657    assert(_PyUnicode_CheckConsistency(v, 1));
9658    return v;
9659}
9660
9661static PyObject *
9662ascii_upper_or_lower(PyObject *self, int lower)
9663{
9664    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9665    char *resdata, *data = PyUnicode_DATA(self);
9666    PyObject *res;
9667
9668    res = PyUnicode_New(len, 127);
9669    if (res == NULL)
9670        return NULL;
9671    resdata = PyUnicode_DATA(res);
9672    if (lower)
9673        _Py_bytes_lower(resdata, data, len);
9674    else
9675        _Py_bytes_upper(resdata, data, len);
9676    return res;
9677}
9678
9679static Py_UCS4
9680handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9681{
9682    Py_ssize_t j;
9683    int final_sigma;
9684    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9685    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9686
9687     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9688
9689    where ! is a negation and \p{xxx} is a character with property xxx.
9690    */
9691    for (j = i - 1; j >= 0; j--) {
9692        c = PyUnicode_READ(kind, data, j);
9693        if (!_PyUnicode_IsCaseIgnorable(c))
9694            break;
9695    }
9696    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9697    if (final_sigma) {
9698        for (j = i + 1; j < length; j++) {
9699            c = PyUnicode_READ(kind, data, j);
9700            if (!_PyUnicode_IsCaseIgnorable(c))
9701                break;
9702        }
9703        final_sigma = j == length || !_PyUnicode_IsCased(c);
9704    }
9705    return (final_sigma) ? 0x3C2 : 0x3C3;
9706}
9707
9708static int
9709lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9710           Py_UCS4 c, Py_UCS4 *mapped)
9711{
9712    /* Obscure special case. */
9713    if (c == 0x3A3) {
9714        mapped[0] = handle_capital_sigma(kind, data, length, i);
9715        return 1;
9716    }
9717    return _PyUnicode_ToLowerFull(c, mapped);
9718}
9719
9720static Py_ssize_t
9721do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9722{
9723    Py_ssize_t i, k = 0;
9724    int n_res, j;
9725    Py_UCS4 c, mapped[3];
9726
9727    c = PyUnicode_READ(kind, data, 0);
9728    n_res = _PyUnicode_ToUpperFull(c, mapped);
9729    for (j = 0; j < n_res; j++) {
9730        *maxchar = Py_MAX(*maxchar, mapped[j]);
9731        res[k++] = mapped[j];
9732    }
9733    for (i = 1; i < length; i++) {
9734        c = PyUnicode_READ(kind, data, i);
9735        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9736        for (j = 0; j < n_res; j++) {
9737            *maxchar = Py_MAX(*maxchar, mapped[j]);
9738            res[k++] = mapped[j];
9739        }
9740    }
9741    return k;
9742}
9743
9744static Py_ssize_t
9745do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9746    Py_ssize_t i, k = 0;
9747
9748    for (i = 0; i < length; i++) {
9749        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9750        int n_res, j;
9751        if (Py_UNICODE_ISUPPER(c)) {
9752            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9753        }
9754        else if (Py_UNICODE_ISLOWER(c)) {
9755            n_res = _PyUnicode_ToUpperFull(c, mapped);
9756        }
9757        else {
9758            n_res = 1;
9759            mapped[0] = c;
9760        }
9761        for (j = 0; j < n_res; j++) {
9762            *maxchar = Py_MAX(*maxchar, mapped[j]);
9763            res[k++] = mapped[j];
9764        }
9765    }
9766    return k;
9767}
9768
9769static Py_ssize_t
9770do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9771                  Py_UCS4 *maxchar, int lower)
9772{
9773    Py_ssize_t i, k = 0;
9774
9775    for (i = 0; i < length; i++) {
9776        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9777        int n_res, j;
9778        if (lower)
9779            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9780        else
9781            n_res = _PyUnicode_ToUpperFull(c, mapped);
9782        for (j = 0; j < n_res; j++) {
9783            *maxchar = Py_MAX(*maxchar, mapped[j]);
9784            res[k++] = mapped[j];
9785        }
9786    }
9787    return k;
9788}
9789
9790static Py_ssize_t
9791do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9792{
9793    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9794}
9795
9796static Py_ssize_t
9797do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9798{
9799    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9800}
9801
9802static Py_ssize_t
9803do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9804{
9805    Py_ssize_t i, k = 0;
9806
9807    for (i = 0; i < length; i++) {
9808        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9809        Py_UCS4 mapped[3];
9810        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9811        for (j = 0; j < n_res; j++) {
9812            *maxchar = Py_MAX(*maxchar, mapped[j]);
9813            res[k++] = mapped[j];
9814        }
9815    }
9816    return k;
9817}
9818
9819static Py_ssize_t
9820do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9821{
9822    Py_ssize_t i, k = 0;
9823    int previous_is_cased;
9824
9825    previous_is_cased = 0;
9826    for (i = 0; i < length; i++) {
9827        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9828        Py_UCS4 mapped[3];
9829        int n_res, j;
9830
9831        if (previous_is_cased)
9832            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9833        else
9834            n_res = _PyUnicode_ToTitleFull(c, mapped);
9835
9836        for (j = 0; j < n_res; j++) {
9837            *maxchar = Py_MAX(*maxchar, mapped[j]);
9838            res[k++] = mapped[j];
9839        }
9840
9841        previous_is_cased = _PyUnicode_IsCased(c);
9842    }
9843    return k;
9844}
9845
9846static PyObject *
9847case_operation(PyObject *self,
9848               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9849{
9850    PyObject *res = NULL;
9851    Py_ssize_t length, newlength = 0;
9852    int kind, outkind;
9853    void *data, *outdata;
9854    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9855
9856    assert(PyUnicode_IS_READY(self));
9857
9858    kind = PyUnicode_KIND(self);
9859    data = PyUnicode_DATA(self);
9860    length = PyUnicode_GET_LENGTH(self);
9861    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9862        PyErr_SetString(PyExc_OverflowError, "string is too long");
9863        return NULL;
9864    }
9865    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9866    if (tmp == NULL)
9867        return PyErr_NoMemory();
9868    newlength = perform(kind, data, length, tmp, &maxchar);
9869    res = PyUnicode_New(newlength, maxchar);
9870    if (res == NULL)
9871        goto leave;
9872    tmpend = tmp + newlength;
9873    outdata = PyUnicode_DATA(res);
9874    outkind = PyUnicode_KIND(res);
9875    switch (outkind) {
9876    case PyUnicode_1BYTE_KIND:
9877        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9878        break;
9879    case PyUnicode_2BYTE_KIND:
9880        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9881        break;
9882    case PyUnicode_4BYTE_KIND:
9883        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9884        break;
9885    default:
9886        assert(0);
9887        break;
9888    }
9889  leave:
9890    PyMem_FREE(tmp);
9891    return res;
9892}
9893
9894PyObject *
9895PyUnicode_Join(PyObject *separator, PyObject *seq)
9896{
9897    PyObject *res;
9898    PyObject *fseq;
9899    Py_ssize_t seqlen;
9900    PyObject **items;
9901
9902    fseq = PySequence_Fast(seq, "can only join an iterable");
9903    if (fseq == NULL) {
9904        return NULL;
9905    }
9906
9907    /* NOTE: the following code can't call back into Python code,
9908     * so we are sure that fseq won't be mutated.
9909     */
9910
9911    items = PySequence_Fast_ITEMS(fseq);
9912    seqlen = PySequence_Fast_GET_SIZE(fseq);
9913    res = _PyUnicode_JoinArray(separator, items, seqlen);
9914    Py_DECREF(fseq);
9915    return res;
9916}
9917
9918PyObject *
9919_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9920{
9921    PyObject *res = NULL; /* the result */
9922    PyObject *sep = NULL;
9923    Py_ssize_t seplen;
9924    PyObject *item;
9925    Py_ssize_t sz, i, res_offset;
9926    Py_UCS4 maxchar;
9927    Py_UCS4 item_maxchar;
9928    int use_memcpy;
9929    unsigned char *res_data = NULL, *sep_data = NULL;
9930    PyObject *last_obj;
9931    unsigned int kind = 0;
9932
9933    /* If empty sequence, return u"". */
9934    if (seqlen == 0) {
9935        _Py_RETURN_UNICODE_EMPTY();
9936    }
9937
9938    /* If singleton sequence with an exact Unicode, return that. */
9939    last_obj = NULL;
9940    if (seqlen == 1) {
9941        if (PyUnicode_CheckExact(items[0])) {
9942            res = items[0];
9943            Py_INCREF(res);
9944            return res;
9945        }
9946        seplen = 0;
9947        maxchar = 0;
9948    }
9949    else {
9950        /* Set up sep and seplen */
9951        if (separator == NULL) {
9952            /* fall back to a blank space separator */
9953            sep = PyUnicode_FromOrdinal(' ');
9954            if (!sep)
9955                goto onError;
9956            seplen = 1;
9957            maxchar = 32;
9958        }
9959        else {
9960            if (!PyUnicode_Check(separator)) {
9961                PyErr_Format(PyExc_TypeError,
9962                             "separator: expected str instance,"
9963                             " %.80s found",
9964                             Py_TYPE(separator)->tp_name);
9965                goto onError;
9966            }
9967            if (PyUnicode_READY(separator))
9968                goto onError;
9969            sep = separator;
9970            seplen = PyUnicode_GET_LENGTH(separator);
9971            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9972            /* inc refcount to keep this code path symmetric with the
9973               above case of a blank separator */
9974            Py_INCREF(sep);
9975        }
9976        last_obj = sep;
9977    }
9978
9979    /* There are at least two things to join, or else we have a subclass
9980     * of str in the sequence.
9981     * Do a pre-pass to figure out the total amount of space we'll
9982     * need (sz), and see whether all argument are strings.
9983     */
9984    sz = 0;
9985#ifdef Py_DEBUG
9986    use_memcpy = 0;
9987#else
9988    use_memcpy = 1;
9989#endif
9990    for (i = 0; i < seqlen; i++) {
9991        size_t add_sz;
9992        item = items[i];
9993        if (!PyUnicode_Check(item)) {
9994            PyErr_Format(PyExc_TypeError,
9995                         "sequence item %zd: expected str instance,"
9996                         " %.80s found",
9997                         i, Py_TYPE(item)->tp_name);
9998            goto onError;
9999        }
10000        if (PyUnicode_READY(item) == -1)
10001            goto onError;
10002        add_sz = PyUnicode_GET_LENGTH(item);
10003        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10004        maxchar = Py_MAX(maxchar, item_maxchar);
10005        if (i != 0) {
10006            add_sz += seplen;
10007        }
10008        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10009            PyErr_SetString(PyExc_OverflowError,
10010                            "join() result is too long for a Python string");
10011            goto onError;
10012        }
10013        sz += add_sz;
10014        if (use_memcpy && last_obj != NULL) {
10015            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10016                use_memcpy = 0;
10017        }
10018        last_obj = item;
10019    }
10020
10021    res = PyUnicode_New(sz, maxchar);
10022    if (res == NULL)
10023        goto onError;
10024
10025    /* Catenate everything. */
10026#ifdef Py_DEBUG
10027    use_memcpy = 0;
10028#else
10029    if (use_memcpy) {
10030        res_data = PyUnicode_1BYTE_DATA(res);
10031        kind = PyUnicode_KIND(res);
10032        if (seplen != 0)
10033            sep_data = PyUnicode_1BYTE_DATA(sep);
10034    }
10035#endif
10036    if (use_memcpy) {
10037        for (i = 0; i < seqlen; ++i) {
10038            Py_ssize_t itemlen;
10039            item = items[i];
10040
10041            /* Copy item, and maybe the separator. */
10042            if (i && seplen != 0) {
10043                memcpy(res_data,
10044                          sep_data,
10045                          kind * seplen);
10046                res_data += kind * seplen;
10047            }
10048
10049            itemlen = PyUnicode_GET_LENGTH(item);
10050            if (itemlen != 0) {
10051                memcpy(res_data,
10052                          PyUnicode_DATA(item),
10053                          kind * itemlen);
10054                res_data += kind * itemlen;
10055            }
10056        }
10057        assert(res_data == PyUnicode_1BYTE_DATA(res)
10058                           + kind * PyUnicode_GET_LENGTH(res));
10059    }
10060    else {
10061        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10062            Py_ssize_t itemlen;
10063            item = items[i];
10064
10065            /* Copy item, and maybe the separator. */
10066            if (i && seplen != 0) {
10067                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10068                res_offset += seplen;
10069            }
10070
10071            itemlen = PyUnicode_GET_LENGTH(item);
10072            if (itemlen != 0) {
10073                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10074                res_offset += itemlen;
10075            }
10076        }
10077        assert(res_offset == PyUnicode_GET_LENGTH(res));
10078    }
10079
10080    Py_XDECREF(sep);
10081    assert(_PyUnicode_CheckConsistency(res, 1));
10082    return res;
10083
10084  onError:
10085    Py_XDECREF(sep);
10086    Py_XDECREF(res);
10087    return NULL;
10088}
10089
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive code units of the buffer `data`, beginning at unit index
   `start`.  `kind` selects the unit width (1, 2 or 4 byte kind) and must
   not be PyUnicode_WCHAR_KIND.

   Every parameter is parenthesized in the expansion so that a compound
   argument (for instance a conditional expression passed as `value`) keeps
   its meaning; in particular the unsigned char cast in the 1-byte branch
   applies to the whole `value` expression.

   NOTE: this is a macro.  In the 2- and 4-byte branches `value` and
   `length` are evaluated once per stored unit, so pass expressions without
   side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10113
10114void
10115_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10116                    Py_UCS4 fill_char)
10117{
10118    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10119    const void *data = PyUnicode_DATA(unicode);
10120    assert(PyUnicode_IS_READY(unicode));
10121    assert(unicode_modifiable(unicode));
10122    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10123    assert(start >= 0);
10124    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10125    FILL(kind, data, fill_char, start, length);
10126}
10127
10128Py_ssize_t
10129PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10130               Py_UCS4 fill_char)
10131{
10132    Py_ssize_t maxlen;
10133
10134    if (!PyUnicode_Check(unicode)) {
10135        PyErr_BadInternalCall();
10136        return -1;
10137    }
10138    if (PyUnicode_READY(unicode) == -1)
10139        return -1;
10140    if (unicode_check_modifiable(unicode))
10141        return -1;
10142
10143    if (start < 0) {
10144        PyErr_SetString(PyExc_IndexError, "string index out of range");
10145        return -1;
10146    }
10147    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10148        PyErr_SetString(PyExc_ValueError,
10149                         "fill character is bigger than "
10150                         "the string maximum character");
10151        return -1;
10152    }
10153
10154    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10155    length = Py_MIN(maxlen, length);
10156    if (length <= 0)
10157        return 0;
10158
10159    _PyUnicode_FastFill(unicode, start, length, fill_char);
10160    return length;
10161}
10162
10163static PyObject *
10164pad(PyObject *self,
10165    Py_ssize_t left,
10166    Py_ssize_t right,
10167    Py_UCS4 fill)
10168{
10169    PyObject *u;
10170    Py_UCS4 maxchar;
10171    int kind;
10172    void *data;
10173
10174    if (left < 0)
10175        left = 0;
10176    if (right < 0)
10177        right = 0;
10178
10179    if (left == 0 && right == 0)
10180        return unicode_result_unchanged(self);
10181
10182    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10183        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10184        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10185        return NULL;
10186    }
10187    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10188    maxchar = Py_MAX(maxchar, fill);
10189    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10190    if (!u)
10191        return NULL;
10192
10193    kind = PyUnicode_KIND(u);
10194    data = PyUnicode_DATA(u);
10195    if (left)
10196        FILL(kind, data, fill, 0, left);
10197    if (right)
10198        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10199    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10200    assert(_PyUnicode_CheckConsistency(u, 1));
10201    return u;
10202}
10203
10204PyObject *
10205PyUnicode_Splitlines(PyObject *string, int keepends)
10206{
10207    PyObject *list;
10208
10209    if (ensure_unicode(string) < 0)
10210        return NULL;
10211
10212    switch (PyUnicode_KIND(string)) {
10213    case PyUnicode_1BYTE_KIND:
10214        if (PyUnicode_IS_ASCII(string))
10215            list = asciilib_splitlines(
10216                string, PyUnicode_1BYTE_DATA(string),
10217                PyUnicode_GET_LENGTH(string), keepends);
10218        else
10219            list = ucs1lib_splitlines(
10220                string, PyUnicode_1BYTE_DATA(string),
10221                PyUnicode_GET_LENGTH(string), keepends);
10222        break;
10223    case PyUnicode_2BYTE_KIND:
10224        list = ucs2lib_splitlines(
10225            string, PyUnicode_2BYTE_DATA(string),
10226            PyUnicode_GET_LENGTH(string), keepends);
10227        break;
10228    case PyUnicode_4BYTE_KIND:
10229        list = ucs4lib_splitlines(
10230            string, PyUnicode_4BYTE_DATA(string),
10231            PyUnicode_GET_LENGTH(string), keepends);
10232        break;
10233    default:
10234        assert(0);
10235        list = 0;
10236    }
10237    return list;
10238}
10239
10240static PyObject *
10241split(PyObject *self,
10242      PyObject *substring,
10243      Py_ssize_t maxcount)
10244{
10245    int kind1, kind2;
10246    void *buf1, *buf2;
10247    Py_ssize_t len1, len2;
10248    PyObject* out;
10249
10250    if (maxcount < 0)
10251        maxcount = PY_SSIZE_T_MAX;
10252
10253    if (PyUnicode_READY(self) == -1)
10254        return NULL;
10255
10256    if (substring == NULL)
10257        switch (PyUnicode_KIND(self)) {
10258        case PyUnicode_1BYTE_KIND:
10259            if (PyUnicode_IS_ASCII(self))
10260                return asciilib_split_whitespace(
10261                    self,  PyUnicode_1BYTE_DATA(self),
10262                    PyUnicode_GET_LENGTH(self), maxcount
10263                    );
10264            else
10265                return ucs1lib_split_whitespace(
10266                    self,  PyUnicode_1BYTE_DATA(self),
10267                    PyUnicode_GET_LENGTH(self), maxcount
10268                    );
10269        case PyUnicode_2BYTE_KIND:
10270            return ucs2lib_split_whitespace(
10271                self,  PyUnicode_2BYTE_DATA(self),
10272                PyUnicode_GET_LENGTH(self), maxcount
10273                );
10274        case PyUnicode_4BYTE_KIND:
10275            return ucs4lib_split_whitespace(
10276                self,  PyUnicode_4BYTE_DATA(self),
10277                PyUnicode_GET_LENGTH(self), maxcount
10278                );
10279        default:
10280            assert(0);
10281            return NULL;
10282        }
10283
10284    if (PyUnicode_READY(substring) == -1)
10285        return NULL;
10286
10287    kind1 = PyUnicode_KIND(self);
10288    kind2 = PyUnicode_KIND(substring);
10289    len1 = PyUnicode_GET_LENGTH(self);
10290    len2 = PyUnicode_GET_LENGTH(substring);
10291    if (kind1 < kind2 || len1 < len2) {
10292        out = PyList_New(1);
10293        if (out == NULL)
10294            return NULL;
10295        Py_INCREF(self);
10296        PyList_SET_ITEM(out, 0, self);
10297        return out;
10298    }
10299    buf1 = PyUnicode_DATA(self);
10300    buf2 = PyUnicode_DATA(substring);
10301    if (kind2 != kind1) {
10302        buf2 = _PyUnicode_AsKind(substring, kind1);
10303        if (!buf2)
10304            return NULL;
10305    }
10306
10307    switch (kind1) {
10308    case PyUnicode_1BYTE_KIND:
10309        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10310            out = asciilib_split(
10311                self,  buf1, len1, buf2, len2, maxcount);
10312        else
10313            out = ucs1lib_split(
10314                self,  buf1, len1, buf2, len2, maxcount);
10315        break;
10316    case PyUnicode_2BYTE_KIND:
10317        out = ucs2lib_split(
10318            self,  buf1, len1, buf2, len2, maxcount);
10319        break;
10320    case PyUnicode_4BYTE_KIND:
10321        out = ucs4lib_split(
10322            self,  buf1, len1, buf2, len2, maxcount);
10323        break;
10324    default:
10325        out = NULL;
10326    }
10327    if (kind2 != kind1)
10328        PyMem_Free(buf2);
10329    return out;
10330}
10331
10332static PyObject *
10333rsplit(PyObject *self,
10334       PyObject *substring,
10335       Py_ssize_t maxcount)
10336{
10337    int kind1, kind2;
10338    void *buf1, *buf2;
10339    Py_ssize_t len1, len2;
10340    PyObject* out;
10341
10342    if (maxcount < 0)
10343        maxcount = PY_SSIZE_T_MAX;
10344
10345    if (PyUnicode_READY(self) == -1)
10346        return NULL;
10347
10348    if (substring == NULL)
10349        switch (PyUnicode_KIND(self)) {
10350        case PyUnicode_1BYTE_KIND:
10351            if (PyUnicode_IS_ASCII(self))
10352                return asciilib_rsplit_whitespace(
10353                    self,  PyUnicode_1BYTE_DATA(self),
10354                    PyUnicode_GET_LENGTH(self), maxcount
10355                    );
10356            else
10357                return ucs1lib_rsplit_whitespace(
10358                    self,  PyUnicode_1BYTE_DATA(self),
10359                    PyUnicode_GET_LENGTH(self), maxcount
10360                    );
10361        case PyUnicode_2BYTE_KIND:
10362            return ucs2lib_rsplit_whitespace(
10363                self,  PyUnicode_2BYTE_DATA(self),
10364                PyUnicode_GET_LENGTH(self), maxcount
10365                );
10366        case PyUnicode_4BYTE_KIND:
10367            return ucs4lib_rsplit_whitespace(
10368                self,  PyUnicode_4BYTE_DATA(self),
10369                PyUnicode_GET_LENGTH(self), maxcount
10370                );
10371        default:
10372            assert(0);
10373            return NULL;
10374        }
10375
10376    if (PyUnicode_READY(substring) == -1)
10377        return NULL;
10378
10379    kind1 = PyUnicode_KIND(self);
10380    kind2 = PyUnicode_KIND(substring);
10381    len1 = PyUnicode_GET_LENGTH(self);
10382    len2 = PyUnicode_GET_LENGTH(substring);
10383    if (kind1 < kind2 || len1 < len2) {
10384        out = PyList_New(1);
10385        if (out == NULL)
10386            return NULL;
10387        Py_INCREF(self);
10388        PyList_SET_ITEM(out, 0, self);
10389        return out;
10390    }
10391    buf1 = PyUnicode_DATA(self);
10392    buf2 = PyUnicode_DATA(substring);
10393    if (kind2 != kind1) {
10394        buf2 = _PyUnicode_AsKind(substring, kind1);
10395        if (!buf2)
10396            return NULL;
10397    }
10398
10399    switch (kind1) {
10400    case PyUnicode_1BYTE_KIND:
10401        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10402            out = asciilib_rsplit(
10403                self,  buf1, len1, buf2, len2, maxcount);
10404        else
10405            out = ucs1lib_rsplit(
10406                self,  buf1, len1, buf2, len2, maxcount);
10407        break;
10408    case PyUnicode_2BYTE_KIND:
10409        out = ucs2lib_rsplit(
10410            self,  buf1, len1, buf2, len2, maxcount);
10411        break;
10412    case PyUnicode_4BYTE_KIND:
10413        out = ucs4lib_rsplit(
10414            self,  buf1, len1, buf2, len2, maxcount);
10415        break;
10416    default:
10417        out = NULL;
10418    }
10419    if (kind2 != kind1)
10420        PyMem_Free(buf2);
10421    return out;
10422}
10423
10424static Py_ssize_t
10425anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10426            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10427{
10428    switch (kind) {
10429    case PyUnicode_1BYTE_KIND:
10430        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10431            return asciilib_find(buf1, len1, buf2, len2, offset);
10432        else
10433            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10434    case PyUnicode_2BYTE_KIND:
10435        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10436    case PyUnicode_4BYTE_KIND:
10437        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10438    }
10439    assert(0);
10440    return -1;
10441}
10442
10443static Py_ssize_t
10444anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10445             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10446{
10447    switch (kind) {
10448    case PyUnicode_1BYTE_KIND:
10449        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10450            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10451        else
10452            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10453    case PyUnicode_2BYTE_KIND:
10454        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10455    case PyUnicode_4BYTE_KIND:
10456        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10457    }
10458    assert(0);
10459    return 0;
10460}
10461
10462static void
10463replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10464                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10465{
10466    int kind = PyUnicode_KIND(u);
10467    void *data = PyUnicode_DATA(u);
10468    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10469    if (kind == PyUnicode_1BYTE_KIND) {
10470        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10471                                      (Py_UCS1 *)data + len,
10472                                      u1, u2, maxcount);
10473    }
10474    else if (kind == PyUnicode_2BYTE_KIND) {
10475        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10476                                      (Py_UCS2 *)data + len,
10477                                      u1, u2, maxcount);
10478    }
10479    else {
10480        assert(kind == PyUnicode_4BYTE_KIND);
10481        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10482                                      (Py_UCS4 *)data + len,
10483                                      u1, u2, maxcount);
10484    }
10485}
10486
10487static PyObject *
10488replace(PyObject *self, PyObject *str1,
10489        PyObject *str2, Py_ssize_t maxcount)
10490{
10491    PyObject *u;
10492    char *sbuf = PyUnicode_DATA(self);
10493    char *buf1 = PyUnicode_DATA(str1);
10494    char *buf2 = PyUnicode_DATA(str2);
10495    int srelease = 0, release1 = 0, release2 = 0;
10496    int skind = PyUnicode_KIND(self);
10497    int kind1 = PyUnicode_KIND(str1);
10498    int kind2 = PyUnicode_KIND(str2);
10499    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10500    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10501    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10502    int mayshrink;
10503    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10504
10505    if (maxcount < 0)
10506        maxcount = PY_SSIZE_T_MAX;
10507    else if (maxcount == 0 || slen == 0)
10508        goto nothing;
10509
10510    if (str1 == str2)
10511        goto nothing;
10512
10513    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10514    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10515    if (maxchar < maxchar_str1)
10516        /* substring too wide to be present */
10517        goto nothing;
10518    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10519    /* Replacing str1 with str2 may cause a maxchar reduction in the
10520       result string. */
10521    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10522    maxchar = Py_MAX(maxchar, maxchar_str2);
10523
10524    if (len1 == len2) {
10525        /* same length */
10526        if (len1 == 0)
10527            goto nothing;
10528        if (len1 == 1) {
10529            /* replace characters */
10530            Py_UCS4 u1, u2;
10531            Py_ssize_t pos;
10532
10533            u1 = PyUnicode_READ(kind1, buf1, 0);
10534            pos = findchar(sbuf, skind, slen, u1, 1);
10535            if (pos < 0)
10536                goto nothing;
10537            u2 = PyUnicode_READ(kind2, buf2, 0);
10538            u = PyUnicode_New(slen, maxchar);
10539            if (!u)
10540                goto error;
10541
10542            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10543            replace_1char_inplace(u, pos, u1, u2, maxcount);
10544        }
10545        else {
10546            int rkind = skind;
10547            char *res;
10548            Py_ssize_t i;
10549
10550            if (kind1 < rkind) {
10551                /* widen substring */
10552                buf1 = _PyUnicode_AsKind(str1, rkind);
10553                if (!buf1) goto error;
10554                release1 = 1;
10555            }
10556            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10557            if (i < 0)
10558                goto nothing;
10559            if (rkind > kind2) {
10560                /* widen replacement */
10561                buf2 = _PyUnicode_AsKind(str2, rkind);
10562                if (!buf2) goto error;
10563                release2 = 1;
10564            }
10565            else if (rkind < kind2) {
10566                /* widen self and buf1 */
10567                rkind = kind2;
10568                if (release1) PyMem_Free(buf1);
10569                release1 = 0;
10570                sbuf = _PyUnicode_AsKind(self, rkind);
10571                if (!sbuf) goto error;
10572                srelease = 1;
10573                buf1 = _PyUnicode_AsKind(str1, rkind);
10574                if (!buf1) goto error;
10575                release1 = 1;
10576            }
10577            u = PyUnicode_New(slen, maxchar);
10578            if (!u)
10579                goto error;
10580            assert(PyUnicode_KIND(u) == rkind);
10581            res = PyUnicode_DATA(u);
10582
10583            memcpy(res, sbuf, rkind * slen);
10584            /* change everything in-place, starting with this one */
10585            memcpy(res + rkind * i,
10586                   buf2,
10587                   rkind * len2);
10588            i += len1;
10589
10590            while ( --maxcount > 0) {
10591                i = anylib_find(rkind, self,
10592                                sbuf+rkind*i, slen-i,
10593                                str1, buf1, len1, i);
10594                if (i == -1)
10595                    break;
10596                memcpy(res + rkind * i,
10597                       buf2,
10598                       rkind * len2);
10599                i += len1;
10600            }
10601        }
10602    }
10603    else {
10604        Py_ssize_t n, i, j, ires;
10605        Py_ssize_t new_size;
10606        int rkind = skind;
10607        char *res;
10608
10609        if (kind1 < rkind) {
10610            /* widen substring */
10611            buf1 = _PyUnicode_AsKind(str1, rkind);
10612            if (!buf1) goto error;
10613            release1 = 1;
10614        }
10615        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10616        if (n == 0)
10617            goto nothing;
10618        if (kind2 < rkind) {
10619            /* widen replacement */
10620            buf2 = _PyUnicode_AsKind(str2, rkind);
10621            if (!buf2) goto error;
10622            release2 = 1;
10623        }
10624        else if (kind2 > rkind) {
10625            /* widen self and buf1 */
10626            rkind = kind2;
10627            sbuf = _PyUnicode_AsKind(self, rkind);
10628            if (!sbuf) goto error;
10629            srelease = 1;
10630            if (release1) PyMem_Free(buf1);
10631            release1 = 0;
10632            buf1 = _PyUnicode_AsKind(str1, rkind);
10633            if (!buf1) goto error;
10634            release1 = 1;
10635        }
10636        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10637           PyUnicode_GET_LENGTH(str1))); */
10638        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10639                PyErr_SetString(PyExc_OverflowError,
10640                                "replace string is too long");
10641                goto error;
10642        }
10643        new_size = slen + n * (len2 - len1);
10644        if (new_size == 0) {
10645            _Py_INCREF_UNICODE_EMPTY();
10646            if (!unicode_empty)
10647                goto error;
10648            u = unicode_empty;
10649            goto done;
10650        }
10651        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10652            PyErr_SetString(PyExc_OverflowError,
10653                            "replace string is too long");
10654            goto error;
10655        }
10656        u = PyUnicode_New(new_size, maxchar);
10657        if (!u)
10658            goto error;
10659        assert(PyUnicode_KIND(u) == rkind);
10660        res = PyUnicode_DATA(u);
10661        ires = i = 0;
10662        if (len1 > 0) {
10663            while (n-- > 0) {
10664                /* look for next match */
10665                j = anylib_find(rkind, self,
10666                                sbuf + rkind * i, slen-i,
10667                                str1, buf1, len1, i);
10668                if (j == -1)
10669                    break;
10670                else if (j > i) {
10671                    /* copy unchanged part [i:j] */
10672                    memcpy(res + rkind * ires,
10673                           sbuf + rkind * i,
10674                           rkind * (j-i));
10675                    ires += j - i;
10676                }
10677                /* copy substitution string */
10678                if (len2 > 0) {
10679                    memcpy(res + rkind * ires,
10680                           buf2,
10681                           rkind * len2);
10682                    ires += len2;
10683                }
10684                i = j + len1;
10685            }
10686            if (i < slen)
10687                /* copy tail [i:] */
10688                memcpy(res + rkind * ires,
10689                       sbuf + rkind * i,
10690                       rkind * (slen-i));
10691        }
10692        else {
10693            /* interleave */
10694            while (n > 0) {
10695                memcpy(res + rkind * ires,
10696                       buf2,
10697                       rkind * len2);
10698                ires += len2;
10699                if (--n <= 0)
10700                    break;
10701                memcpy(res + rkind * ires,
10702                       sbuf + rkind * i,
10703                       rkind);
10704                ires++;
10705                i++;
10706            }
10707            memcpy(res + rkind * ires,
10708                   sbuf + rkind * i,
10709                   rkind * (slen-i));
10710        }
10711    }
10712
10713    if (mayshrink) {
10714        unicode_adjust_maxchar(&u);
10715        if (u == NULL)
10716            goto error;
10717    }
10718
10719  done:
10720    if (srelease)
10721        PyMem_FREE(sbuf);
10722    if (release1)
10723        PyMem_FREE(buf1);
10724    if (release2)
10725        PyMem_FREE(buf2);
10726    assert(_PyUnicode_CheckConsistency(u, 1));
10727    return u;
10728
10729  nothing:
10730    /* nothing to replace; return original string (when possible) */
10731    if (srelease)
10732        PyMem_FREE(sbuf);
10733    if (release1)
10734        PyMem_FREE(buf1);
10735    if (release2)
10736        PyMem_FREE(buf2);
10737    return unicode_result_unchanged(self);
10738
10739  error:
10740    if (srelease && sbuf)
10741        PyMem_FREE(sbuf);
10742    if (release1 && buf1)
10743        PyMem_FREE(buf1);
10744    if (release2 && buf2)
10745        PyMem_FREE(buf2);
10746    return NULL;
10747}
10748
10749/* --- Unicode Object Methods --------------------------------------------- */
10750
10751PyDoc_STRVAR(title__doc__,
10752             "S.title() -> str\n\
10753\n\
10754Return a titlecased version of S, i.e. words start with title case\n\
10755characters, all remaining cased characters have lower case.");
10756
10757static PyObject*
10758unicode_title(PyObject *self)
10759{
10760    if (PyUnicode_READY(self) == -1)
10761        return NULL;
10762    return case_operation(self, do_title);
10763}
10764
10765PyDoc_STRVAR(capitalize__doc__,
10766             "S.capitalize() -> str\n\
10767\n\
10768Return a capitalized version of S, i.e. make the first character\n\
10769have upper case and the rest lower case.");
10770
10771static PyObject*
10772unicode_capitalize(PyObject *self)
10773{
10774    if (PyUnicode_READY(self) == -1)
10775        return NULL;
10776    if (PyUnicode_GET_LENGTH(self) == 0)
10777        return unicode_result_unchanged(self);
10778    return case_operation(self, do_capitalize);
10779}
10780
10781PyDoc_STRVAR(casefold__doc__,
10782             "S.casefold() -> str\n\
10783\n\
10784Return a version of S suitable for caseless comparisons.");
10785
10786static PyObject *
10787unicode_casefold(PyObject *self)
10788{
10789    if (PyUnicode_READY(self) == -1)
10790        return NULL;
10791    if (PyUnicode_IS_ASCII(self))
10792        return ascii_upper_or_lower(self, 1);
10793    return case_operation(self, do_casefold);
10794}
10795
10796
10797/* Argument converter. Accepts a single Unicode character. */
10798
10799static int
10800convert_uc(PyObject *obj, void *addr)
10801{
10802    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10803
10804    if (!PyUnicode_Check(obj)) {
10805        PyErr_Format(PyExc_TypeError,
10806                     "The fill character must be a unicode character, "
10807                     "not %.100s", Py_TYPE(obj)->tp_name);
10808        return 0;
10809    }
10810    if (PyUnicode_READY(obj) < 0)
10811        return 0;
10812    if (PyUnicode_GET_LENGTH(obj) != 1) {
10813        PyErr_SetString(PyExc_TypeError,
10814                        "The fill character must be exactly one character long");
10815        return 0;
10816    }
10817    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10818    return 1;
10819}
10820
10821PyDoc_STRVAR(center__doc__,
10822             "S.center(width[, fillchar]) -> str\n\
10823\n\
10824Return S centered in a string of length width. Padding is\n\
10825done using the specified fill character (default is a space)");
10826
10827static PyObject *
10828unicode_center(PyObject *self, PyObject *args)
10829{
10830    Py_ssize_t marg, left;
10831    Py_ssize_t width;
10832    Py_UCS4 fillchar = ' ';
10833
10834    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10835        return NULL;
10836
10837    if (PyUnicode_READY(self) == -1)
10838        return NULL;
10839
10840    if (PyUnicode_GET_LENGTH(self) >= width)
10841        return unicode_result_unchanged(self);
10842
10843    marg = width - PyUnicode_GET_LENGTH(self);
10844    left = marg / 2 + (marg & width & 1);
10845
10846    return pad(self, left, marg - left, fillchar);
10847}
10848
10849/* This function assumes that str1 and str2 are readied by the caller. */
10850
10851static int
10852unicode_compare(PyObject *str1, PyObject *str2)
10853{
10854#define COMPARE(TYPE1, TYPE2) \
10855    do { \
10856        TYPE1* p1 = (TYPE1 *)data1; \
10857        TYPE2* p2 = (TYPE2 *)data2; \
10858        TYPE1* end = p1 + len; \
10859        Py_UCS4 c1, c2; \
10860        for (; p1 != end; p1++, p2++) { \
10861            c1 = *p1; \
10862            c2 = *p2; \
10863            if (c1 != c2) \
10864                return (c1 < c2) ? -1 : 1; \
10865        } \
10866    } \
10867    while (0)
10868
10869    int kind1, kind2;
10870    void *data1, *data2;
10871    Py_ssize_t len1, len2, len;
10872
10873    kind1 = PyUnicode_KIND(str1);
10874    kind2 = PyUnicode_KIND(str2);
10875    data1 = PyUnicode_DATA(str1);
10876    data2 = PyUnicode_DATA(str2);
10877    len1 = PyUnicode_GET_LENGTH(str1);
10878    len2 = PyUnicode_GET_LENGTH(str2);
10879    len = Py_MIN(len1, len2);
10880
10881    switch(kind1) {
10882    case PyUnicode_1BYTE_KIND:
10883    {
10884        switch(kind2) {
10885        case PyUnicode_1BYTE_KIND:
10886        {
10887            int cmp = memcmp(data1, data2, len);
10888            /* normalize result of memcmp() into the range [-1; 1] */
10889            if (cmp < 0)
10890                return -1;
10891            if (cmp > 0)
10892                return 1;
10893            break;
10894        }
10895        case PyUnicode_2BYTE_KIND:
10896            COMPARE(Py_UCS1, Py_UCS2);
10897            break;
10898        case PyUnicode_4BYTE_KIND:
10899            COMPARE(Py_UCS1, Py_UCS4);
10900            break;
10901        default:
10902            assert(0);
10903        }
10904        break;
10905    }
10906    case PyUnicode_2BYTE_KIND:
10907    {
10908        switch(kind2) {
10909        case PyUnicode_1BYTE_KIND:
10910            COMPARE(Py_UCS2, Py_UCS1);
10911            break;
10912        case PyUnicode_2BYTE_KIND:
10913        {
10914            COMPARE(Py_UCS2, Py_UCS2);
10915            break;
10916        }
10917        case PyUnicode_4BYTE_KIND:
10918            COMPARE(Py_UCS2, Py_UCS4);
10919            break;
10920        default:
10921            assert(0);
10922        }
10923        break;
10924    }
10925    case PyUnicode_4BYTE_KIND:
10926    {
10927        switch(kind2) {
10928        case PyUnicode_1BYTE_KIND:
10929            COMPARE(Py_UCS4, Py_UCS1);
10930            break;
10931        case PyUnicode_2BYTE_KIND:
10932            COMPARE(Py_UCS4, Py_UCS2);
10933            break;
10934        case PyUnicode_4BYTE_KIND:
10935        {
10936#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10937            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10938            /* normalize result of wmemcmp() into the range [-1; 1] */
10939            if (cmp < 0)
10940                return -1;
10941            if (cmp > 0)
10942                return 1;
10943#else
10944            COMPARE(Py_UCS4, Py_UCS4);
10945#endif
10946            break;
10947        }
10948        default:
10949            assert(0);
10950        }
10951        break;
10952    }
10953    default:
10954        assert(0);
10955    }
10956
10957    if (len1 == len2)
10958        return 0;
10959    if (len1 < len2)
10960        return -1;
10961    else
10962        return 1;
10963
10964#undef COMPARE
10965}
10966
10967static int
10968unicode_compare_eq(PyObject *str1, PyObject *str2)
10969{
10970    int kind;
10971    void *data1, *data2;
10972    Py_ssize_t len;
10973    int cmp;
10974
10975    len = PyUnicode_GET_LENGTH(str1);
10976    if (PyUnicode_GET_LENGTH(str2) != len)
10977        return 0;
10978    kind = PyUnicode_KIND(str1);
10979    if (PyUnicode_KIND(str2) != kind)
10980        return 0;
10981    data1 = PyUnicode_DATA(str1);
10982    data2 = PyUnicode_DATA(str2);
10983
10984    cmp = memcmp(data1, data2, len * kind);
10985    return (cmp == 0);
10986}
10987
10988
10989int
10990PyUnicode_Compare(PyObject *left, PyObject *right)
10991{
10992    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10993        if (PyUnicode_READY(left) == -1 ||
10994            PyUnicode_READY(right) == -1)
10995            return -1;
10996
10997        /* a string is equal to itself */
10998        if (left == right)
10999            return 0;
11000
11001        return unicode_compare(left, right);
11002    }
11003    PyErr_Format(PyExc_TypeError,
11004                 "Can't compare %.100s and %.100s",
11005                 left->ob_type->tp_name,
11006                 right->ob_type->tp_name);
11007    return -1;
11008}
11009
11010int
11011PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11012{
11013    Py_ssize_t i;
11014    int kind;
11015    Py_UCS4 chr;
11016    const unsigned char *ustr = (const unsigned char *)str;
11017
11018    assert(_PyUnicode_CHECK(uni));
11019    if (!PyUnicode_IS_READY(uni)) {
11020        const wchar_t *ws = _PyUnicode_WSTR(uni);
11021        /* Compare Unicode string and source character set string */
11022        for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11023            if (chr != ustr[i])
11024                return (chr < ustr[i]) ? -1 : 1;
11025        }
11026        /* This check keeps Python strings that end in '\0' from comparing equal
11027         to C strings identical up to that point. */
11028        if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11029            return 1; /* uni is longer */
11030        if (ustr[i])
11031            return -1; /* str is longer */
11032        return 0;
11033    }
11034    kind = PyUnicode_KIND(uni);
11035    if (kind == PyUnicode_1BYTE_KIND) {
11036        const void *data = PyUnicode_1BYTE_DATA(uni);
11037        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11038        size_t len, len2 = strlen(str);
11039        int cmp;
11040
11041        len = Py_MIN(len1, len2);
11042        cmp = memcmp(data, str, len);
11043        if (cmp != 0) {
11044            if (cmp < 0)
11045                return -1;
11046            else
11047                return 1;
11048        }
11049        if (len1 > len2)
11050            return 1; /* uni is longer */
11051        if (len1 < len2)
11052            return -1; /* str is longer */
11053        return 0;
11054    }
11055    else {
11056        void *data = PyUnicode_DATA(uni);
11057        /* Compare Unicode string and source character set string */
11058        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11059            if (chr != (unsigned char)str[i])
11060                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11061        /* This check keeps Python strings that end in '\0' from comparing equal
11062         to C strings identical up to that point. */
11063        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11064            return 1; /* uni is longer */
11065        if (str[i])
11066            return -1; /* str is longer */
11067        return 0;
11068    }
11069}
11070
11071static int
11072non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11073{
11074    size_t i, len;
11075    const wchar_t *p;
11076    len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11077    if (strlen(str) != len)
11078        return 0;
11079    p = _PyUnicode_WSTR(unicode);
11080    assert(p);
11081    for (i = 0; i < len; i++) {
11082        unsigned char c = (unsigned char)str[i];
11083        if (c >= 128 || p[i] != (wchar_t)c)
11084            return 0;
11085    }
11086    return 1;
11087}
11088
11089int
11090_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11091{
11092    size_t len;
11093    assert(_PyUnicode_CHECK(unicode));
11094    assert(str);
11095#ifndef NDEBUG
11096    for (const char *p = str; *p; p++) {
11097        assert((unsigned char)*p < 128);
11098    }
11099#endif
11100    if (PyUnicode_READY(unicode) == -1) {
11101        /* Memory error or bad data */
11102        PyErr_Clear();
11103        return non_ready_unicode_equal_to_ascii_string(unicode, str);
11104    }
11105    if (!PyUnicode_IS_ASCII(unicode))
11106        return 0;
11107    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11108    return strlen(str) == len &&
11109           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11110}
11111
11112int
11113_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11114{
11115    PyObject *right_uni;
11116    Py_hash_t hash;
11117
11118    assert(_PyUnicode_CHECK(left));
11119    assert(right->string);
11120#ifndef NDEBUG
11121    for (const char *p = right->string; *p; p++) {
11122        assert((unsigned char)*p < 128);
11123    }
11124#endif
11125
11126    if (PyUnicode_READY(left) == -1) {
11127        /* memory error or bad data */
11128        PyErr_Clear();
11129        return non_ready_unicode_equal_to_ascii_string(left, right->string);
11130    }
11131
11132    if (!PyUnicode_IS_ASCII(left))
11133        return 0;
11134
11135    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11136    if (right_uni == NULL) {
11137        /* memory error or bad data */
11138        PyErr_Clear();
11139        return _PyUnicode_EqualToASCIIString(left, right->string);
11140    }
11141
11142    if (left == right_uni)
11143        return 1;
11144
11145    if (PyUnicode_CHECK_INTERNED(left))
11146        return 0;
11147
11148    assert(_PyUnicode_HASH(right_uni) != 1);
11149    hash = _PyUnicode_HASH(left);
11150    if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11151        return 0;
11152
11153    return unicode_compare_eq(left, right_uni);
11154}
11155
11156#define TEST_COND(cond)                         \
11157    ((cond) ? Py_True : Py_False)
11158
11159PyObject *
11160PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11161{
11162    int result;
11163    PyObject *v;
11164
11165    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11166        Py_RETURN_NOTIMPLEMENTED;
11167
11168    if (PyUnicode_READY(left) == -1 ||
11169        PyUnicode_READY(right) == -1)
11170        return NULL;
11171
11172    if (left == right) {
11173        switch (op) {
11174        case Py_EQ:
11175        case Py_LE:
11176        case Py_GE:
11177            /* a string is equal to itself */
11178            v = Py_True;
11179            break;
11180        case Py_NE:
11181        case Py_LT:
11182        case Py_GT:
11183            v = Py_False;
11184            break;
11185        default:
11186            PyErr_BadArgument();
11187            return NULL;
11188        }
11189    }
11190    else if (op == Py_EQ || op == Py_NE) {
11191        result = unicode_compare_eq(left, right);
11192        result ^= (op == Py_NE);
11193        v = TEST_COND(result);
11194    }
11195    else {
11196        result = unicode_compare(left, right);
11197
11198        /* Convert the return value to a Boolean */
11199        switch (op) {
11200        case Py_LE:
11201            v = TEST_COND(result <= 0);
11202            break;
11203        case Py_GE:
11204            v = TEST_COND(result >= 0);
11205            break;
11206        case Py_LT:
11207            v = TEST_COND(result == -1);
11208            break;
11209        case Py_GT:
11210            v = TEST_COND(result == 1);
11211            break;
11212        default:
11213            PyErr_BadArgument();
11214            return NULL;
11215        }
11216    }
11217    Py_INCREF(v);
11218    return v;
11219}
11220
/* Non-static entry point that simply forwards both arguments to
   unicode_eq() and returns its result unchanged.
   NOTE(review): unicode_eq() is not defined in this part of the file
   (presumably it comes from "stringlib/eq.h"); check there for the
   requirements it places on `aa` and `bb`. */
int
_PyUnicode_EQ(PyObject *aa, PyObject *bb)
{
    return unicode_eq(aa, bb);
}
11226
11227int
11228PyUnicode_Contains(PyObject *str, PyObject *substr)
11229{
11230    int kind1, kind2;
11231    void *buf1, *buf2;
11232    Py_ssize_t len1, len2;
11233    int result;
11234
11235    if (!PyUnicode_Check(substr)) {
11236        PyErr_Format(PyExc_TypeError,
11237                     "'in <string>' requires string as left operand, not %.100s",
11238                     Py_TYPE(substr)->tp_name);
11239        return -1;
11240    }
11241    if (PyUnicode_READY(substr) == -1)
11242        return -1;
11243    if (ensure_unicode(str) < 0)
11244        return -1;
11245
11246    kind1 = PyUnicode_KIND(str);
11247    kind2 = PyUnicode_KIND(substr);
11248    if (kind1 < kind2)
11249        return 0;
11250    len1 = PyUnicode_GET_LENGTH(str);
11251    len2 = PyUnicode_GET_LENGTH(substr);
11252    if (len1 < len2)
11253        return 0;
11254    buf1 = PyUnicode_DATA(str);
11255    buf2 = PyUnicode_DATA(substr);
11256    if (len2 == 1) {
11257        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11258        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11259        return result;
11260    }
11261    if (kind2 != kind1) {
11262        buf2 = _PyUnicode_AsKind(substr, kind1);
11263        if (!buf2)
11264            return -1;
11265    }
11266
11267    switch (kind1) {
11268    case PyUnicode_1BYTE_KIND:
11269        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11270        break;
11271    case PyUnicode_2BYTE_KIND:
11272        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11273        break;
11274    case PyUnicode_4BYTE_KIND:
11275        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11276        break;
11277    default:
11278        result = -1;
11279        assert(0);
11280    }
11281
11282    if (kind2 != kind1)
11283        PyMem_Free(buf2);
11284
11285    return result;
11286}
11287
11288/* Concat to string or Unicode object giving a new Unicode object. */
11289
11290PyObject *
11291PyUnicode_Concat(PyObject *left, PyObject *right)
11292{
11293    PyObject *result;
11294    Py_UCS4 maxchar, maxchar2;
11295    Py_ssize_t left_len, right_len, new_len;
11296
11297    if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11298        return NULL;
11299
11300    /* Shortcuts */
11301    if (left == unicode_empty)
11302        return PyUnicode_FromObject(right);
11303    if (right == unicode_empty)
11304        return PyUnicode_FromObject(left);
11305
11306    left_len = PyUnicode_GET_LENGTH(left);
11307    right_len = PyUnicode_GET_LENGTH(right);
11308    if (left_len > PY_SSIZE_T_MAX - right_len) {
11309        PyErr_SetString(PyExc_OverflowError,
11310                        "strings are too large to concat");
11311        return NULL;
11312    }
11313    new_len = left_len + right_len;
11314
11315    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11316    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11317    maxchar = Py_MAX(maxchar, maxchar2);
11318
11319    /* Concat the two Unicode strings */
11320    result = PyUnicode_New(new_len, maxchar);
11321    if (result == NULL)
11322        return NULL;
11323    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11324    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11325    assert(_PyUnicode_CheckConsistency(result, 1));
11326    return result;
11327}
11328
/* Append `right` to the string referenced by *p_left and store the
   result back into *p_left.

   p_left: address of a reference to the left operand.  If p_left itself
           is NULL the function only makes sure an exception is set and
           returns.
   right:  string to append; its reference count is only touched in the
           "left is empty" shortcut, where a new reference is taken.

   On every failure path reached through the `error` label, *p_left is
   cleared with Py_CLEAR(), i.e. the reference is released and *p_left
   is set to NULL. */
void
PyUnicode_Append(PyObject **p_left, PyObject *right)
{
    PyObject *left, *res;
    Py_UCS4 maxchar, maxchar2;
    Py_ssize_t left_len, right_len, new_len;

    if (p_left == NULL) {
        /* Nothing to clear in this case; just report the misuse unless
           an exception is already pending. */
        if (!PyErr_Occurred())
            PyErr_BadInternalCall();
        return;
    }
    left = *p_left;
    if (right == NULL || left == NULL
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        /* Keep an already pending exception rather than replacing it. */
        if (!PyErr_Occurred())
            PyErr_BadInternalCall();
        goto error;
    }

    if (PyUnicode_READY(left) == -1)
        goto error;
    if (PyUnicode_READY(right) == -1)
        goto error;

    /* Shortcuts */
    if (left == unicode_empty) {
        /* empty + right: hand out a new reference to `right` in place of
           the reference to the empty string. */
        Py_DECREF(left);
        Py_INCREF(right);
        *p_left = right;
        return;
    }
    /* left + empty: *p_left is left as it is. */
    if (right == unicode_empty)
        return;

    left_len = PyUnicode_GET_LENGTH(left);
    right_len = PyUnicode_GET_LENGTH(right);
    /* Checked before the addition so that new_len cannot overflow. */
    if (left_len > PY_SSIZE_T_MAX - right_len) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        goto error;
    }
    new_len = left_len + right_len;

    /* The in-place path is taken only when unicode_modifiable() accepts
       `left`, `right` is an exact str whose kind is not wider than the
       kind of `left`, and the ascii += non-ascii case below is excluded. */
    if (unicode_modifiable(left)
        && PyUnicode_CheckExact(right)
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
           to change the structure size, but characters are stored just after
           the structure, and so it requires to move all characters which is
           not so different than duplicating the string. */
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
    {
        /* append inplace */
        if (unicode_resize(p_left, new_len) != 0)
            goto error;

        /* copy 'right' into the newly allocated area of 'left' */
        /* *p_left is used here instead of `left`; presumably
           unicode_resize() may have replaced the object -- confirm
           against its definition. */
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
    }
    else {
        /* The new string must be able to store the widest character of
           either operand. */
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
        maxchar = Py_MAX(maxchar, maxchar2);

        /* Concat the two Unicode strings */
        res = PyUnicode_New(new_len, maxchar);
        if (res == NULL)
            goto error;
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
        /* The reference to the old left operand is replaced by the
           reference to the freshly built string. */
        Py_DECREF(left);
        *p_left = res;
    }
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
    return;

error:
    /* Release the left operand and leave NULL behind for the caller. */
    Py_CLEAR(*p_left);
}
11409
/* Same as PyUnicode_Append(pleft, right), but additionally releases one
   reference to `right` afterwards.  Py_XDECREF is used, so a NULL
   `right` is tolerated here. */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);
    Py_XDECREF(right);
}
11416
11417/*
11418Wraps stringlib_parse_args_finds() and additionally ensures that the
11419first argument is a unicode object.
11420*/
11421
11422static inline int
11423parse_args_finds_unicode(const char * function_name, PyObject *args,
11424                         PyObject **substring,
11425                         Py_ssize_t *start, Py_ssize_t *end)
11426{
11427    if(stringlib_parse_args_finds(function_name, args, substring,
11428                                  start, end)) {
11429        if (ensure_unicode(*substring) < 0)
11430            return 0;
11431        return 1;
11432    }
11433    return 0;
11434}
11435
11436PyDoc_STRVAR(count__doc__,
11437             "S.count(sub[, start[, end]]) -> int\n\
11438\n\
11439Return the number of non-overlapping occurrences of substring sub in\n\
11440string S[start:end].  Optional arguments start and end are\n\
11441interpreted as in slice notation.");
11442
11443static PyObject *
11444unicode_count(PyObject *self, PyObject *args)
11445{
11446    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11447    Py_ssize_t start = 0;
11448    Py_ssize_t end = PY_SSIZE_T_MAX;
11449    PyObject *result;
11450    int kind1, kind2;
11451    void *buf1, *buf2;
11452    Py_ssize_t len1, len2, iresult;
11453
11454    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11455        return NULL;
11456
11457    kind1 = PyUnicode_KIND(self);
11458    kind2 = PyUnicode_KIND(substring);
11459    if (kind1 < kind2)
11460        return PyLong_FromLong(0);
11461
11462    len1 = PyUnicode_GET_LENGTH(self);
11463    len2 = PyUnicode_GET_LENGTH(substring);
11464    ADJUST_INDICES(start, end, len1);
11465    if (end - start < len2)
11466        return PyLong_FromLong(0);
11467
11468    buf1 = PyUnicode_DATA(self);
11469    buf2 = PyUnicode_DATA(substring);
11470    if (kind2 != kind1) {
11471        buf2 = _PyUnicode_AsKind(substring, kind1);
11472        if (!buf2)
11473            return NULL;
11474    }
11475    switch (kind1) {
11476    case PyUnicode_1BYTE_KIND:
11477        iresult = ucs1lib_count(
11478            ((Py_UCS1*)buf1) + start, end - start,
11479            buf2, len2, PY_SSIZE_T_MAX
11480            );
11481        break;
11482    case PyUnicode_2BYTE_KIND:
11483        iresult = ucs2lib_count(
11484            ((Py_UCS2*)buf1) + start, end - start,
11485            buf2, len2, PY_SSIZE_T_MAX
11486            );
11487        break;
11488    case PyUnicode_4BYTE_KIND:
11489        iresult = ucs4lib_count(
11490            ((Py_UCS4*)buf1) + start, end - start,
11491            buf2, len2, PY_SSIZE_T_MAX
11492            );
11493        break;
11494    default:
11495        assert(0); iresult = 0;
11496    }
11497
11498    result = PyLong_FromSsize_t(iresult);
11499
11500    if (kind2 != kind1)
11501        PyMem_Free(buf2);
11502
11503    return result;
11504}
11505
11506PyDoc_STRVAR(encode__doc__,
11507             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
11508\n\
11509Encode S using the codec registered for encoding. Default encoding\n\
11510is 'utf-8'. errors may be given to set a different error\n\
11511handling scheme. Default is 'strict' meaning that encoding errors raise\n\
11512a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11513'xmlcharrefreplace' as well as any other name registered with\n\
11514codecs.register_error that can handle UnicodeEncodeErrors.");
11515
11516static PyObject *
11517unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
11518{
11519    static char *kwlist[] = {"encoding", "errors", 0};
11520    char *encoding = NULL;
11521    char *errors = NULL;
11522
11523    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11524                                     kwlist, &encoding, &errors))
11525        return NULL;
11526    return PyUnicode_AsEncodedString(self, encoding, errors);
11527}
11528
PyDoc_STRVAR(expandtabs__doc__,
             "S.expandtabs(tabsize=8) -> str\n\
\n\
Return a copy of S where all tab characters are expanded using spaces.\n\
If tabsize is not given, a tab size of 8 characters is assumed.");

/* Implementation of str.expandtabs(tabsize=8).

   Works in two passes over the source characters: the first computes the
   length of the result, the second allocates the result and fills it.
   Both passes track `line_pos`, the column within the current line, which
   is reset after '\n' and '\r'.  A tab advances to the next multiple of
   tabsize; when tabsize <= 0 tabs contribute nothing to the output.

   Returns the result of unicode_result_unchanged(self) when the string
   contains no tab at all, NULL with OverflowError when the result length
   would exceed PY_SSIZE_T_MAX, and NULL on argument or allocation
   failure. */
static PyObject*
unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
{
    Py_ssize_t i, j, line_pos, src_len, incr;
    Py_UCS4 ch;
    PyObject *u;
    void *src_data, *dest_data;
    static char *kwlist[] = {"tabsize", 0};
    int tabsize = 8;
    int kind;
    int found;

    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
                                     kwlist, &tabsize))
        return NULL;

    if (PyUnicode_READY(self) == -1)
        return NULL;

    /* First pass: determine size of output string */
    /* i indexes the source, j accumulates the output length. */
    src_len = PyUnicode_GET_LENGTH(self);
    i = j = line_pos = 0;
    kind = PyUnicode_KIND(self);
    src_data = PyUnicode_DATA(self);
    found = 0;
    for (; i < src_len; i++) {
        ch = PyUnicode_READ(kind, src_data, i);
        if (ch == '\t') {
            found = 1;
            if (tabsize > 0) {
                /* Number of columns up to the next tab stop (1..tabsize). */
                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
                if (j > PY_SSIZE_T_MAX - incr)
                    goto overflow;
                line_pos += incr;
                j += incr;
            }
        }
        else {
            if (j > PY_SSIZE_T_MAX - 1)
                goto overflow;
            line_pos++;
            j++;
            /* A line break restarts the column count. */
            if (ch == '\n' || ch == '\r')
                line_pos = 0;
        }
    }
    /* No tab seen: nothing to expand. */
    if (!found)
        return unicode_result_unchanged(self);

    /* Second pass: create output string and fill it */
    /* The result uses the max char value of self, and `kind` (taken from
       self) is reused for writing into it below. */
    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
    if (!u)
        return NULL;
    dest_data = PyUnicode_DATA(u);

    i = j = line_pos = 0;

    for (; i < src_len; i++) {
        ch = PyUnicode_READ(kind, src_data, i);
        if (ch == '\t') {
            if (tabsize > 0) {
                incr = tabsize - (line_pos % tabsize);
                line_pos += incr;
                /* FILL is defined elsewhere in this file; presumably it
                   stores `incr` copies of ' ' starting at index j. */
                FILL(kind, dest_data, ' ', j, incr);
                j += incr;
            }
        }
        else {
            line_pos++;
            PyUnicode_WRITE(kind, dest_data, j, ch);
            j++;
            if (ch == '\n' || ch == '\r')
                line_pos = 0;
        }
    }
    /* Both passes must agree on the output length. */
    assert (j == PyUnicode_GET_LENGTH(u));
    return unicode_result(u);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11617
11618PyDoc_STRVAR(find__doc__,
11619             "S.find(sub[, start[, end]]) -> int\n\
11620\n\
11621Return the lowest index in S where substring sub is found,\n\
11622such that sub is contained within S[start:end].  Optional\n\
11623arguments start and end are interpreted as in slice notation.\n\
11624\n\
11625Return -1 on failure.");
11626
11627static PyObject *
11628unicode_find(PyObject *self, PyObject *args)
11629{
11630    /* initialize variables to prevent gcc warning */
11631    PyObject *substring = NULL;
11632    Py_ssize_t start = 0;
11633    Py_ssize_t end = 0;
11634    Py_ssize_t result;
11635
11636    if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11637        return NULL;
11638
11639    if (PyUnicode_READY(self) == -1)
11640        return NULL;
11641
11642    result = any_find_slice(self, substring, start, end, 1);
11643
11644    if (result == -2)
11645        return NULL;
11646
11647    return PyLong_FromSsize_t(result);
11648}
11649
11650static PyObject *
11651unicode_getitem(PyObject *self, Py_ssize_t index)
11652{
11653    void *data;
11654    enum PyUnicode_Kind kind;
11655    Py_UCS4 ch;
11656
11657    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11658        PyErr_BadArgument();
11659        return NULL;
11660    }
11661    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11662        PyErr_SetString(PyExc_IndexError, "string index out of range");
11663        return NULL;
11664    }
11665    kind = PyUnicode_KIND(self);
11666    data = PyUnicode_DATA(self);
11667    ch = PyUnicode_READ(kind, data, index);
11668    return unicode_char(ch);
11669}
11670
11671/* Believe it or not, this produces the same value for ASCII strings
11672   as bytes_hash(). */
11673static Py_hash_t
11674unicode_hash(PyObject *self)
11675{
11676    Py_ssize_t len;
11677    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11678
11679#ifdef Py_DEBUG
11680    assert(_Py_HashSecret_Initialized);
11681#endif
11682    if (_PyUnicode_HASH(self) != -1)
11683        return _PyUnicode_HASH(self);
11684    if (PyUnicode_READY(self) == -1)
11685        return -1;
11686    len = PyUnicode_GET_LENGTH(self);
11687    /*
11688      We make the hash of the empty string be 0, rather than using
11689      (prefix ^ suffix), since this slightly obfuscates the hash secret
11690    */
11691    if (len == 0) {
11692        _PyUnicode_HASH(self) = 0;
11693        return 0;
11694    }
11695    x = _Py_HashBytes(PyUnicode_DATA(self),
11696                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11697    _PyUnicode_HASH(self) = x;
11698    return x;
11699}
11700
11701PyDoc_STRVAR(index__doc__,
11702             "S.index(sub[, start[, end]]) -> int\n\
11703\n\
11704Like S.find() but raise ValueError when the substring is not found.");
11705
11706static PyObject *
11707unicode_index(PyObject *self, PyObject *args)
11708{
11709    /* initialize variables to prevent gcc warning */
11710    Py_ssize_t result;
11711    PyObject *substring = NULL;
11712    Py_ssize_t start = 0;
11713    Py_ssize_t end = 0;
11714
11715    if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11716        return NULL;
11717
11718    if (PyUnicode_READY(self) == -1)
11719        return NULL;
11720
11721    result = any_find_slice(self, substring, start, end, 1);
11722
11723    if (result == -2)
11724        return NULL;
11725
11726    if (result < 0) {
11727        PyErr_SetString(PyExc_ValueError, "substring not found");
11728        return NULL;
11729    }
11730
11731    return PyLong_FromSsize_t(result);
11732}
11733
11734PyDoc_STRVAR(islower__doc__,
11735             "S.islower() -> bool\n\
11736\n\
11737Return True if all cased characters in S are lowercase and there is\n\
11738at least one cased character in S, False otherwise.");
11739
11740static PyObject*
11741unicode_islower(PyObject *self)
11742{
11743    Py_ssize_t i, length;
11744    int kind;
11745    void *data;
11746    int cased;
11747
11748    if (PyUnicode_READY(self) == -1)
11749        return NULL;
11750    length = PyUnicode_GET_LENGTH(self);
11751    kind = PyUnicode_KIND(self);
11752    data = PyUnicode_DATA(self);
11753
11754    /* Shortcut for single character strings */
11755    if (length == 1)
11756        return PyBool_FromLong(
11757            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11758
11759    /* Special case for empty strings */
11760    if (length == 0)
11761        return PyBool_FromLong(0);
11762
11763    cased = 0;
11764    for (i = 0; i < length; i++) {
11765        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11766
11767        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11768            return PyBool_FromLong(0);
11769        else if (!cased && Py_UNICODE_ISLOWER(ch))
11770            cased = 1;
11771    }
11772    return PyBool_FromLong(cased);
11773}
11774
11775PyDoc_STRVAR(isupper__doc__,
11776             "S.isupper() -> bool\n\
11777\n\
11778Return True if all cased characters in S are uppercase and there is\n\
11779at least one cased character in S, False otherwise.");
11780
11781static PyObject*
11782unicode_isupper(PyObject *self)
11783{
11784    Py_ssize_t i, length;
11785    int kind;
11786    void *data;
11787    int cased;
11788
11789    if (PyUnicode_READY(self) == -1)
11790        return NULL;
11791    length = PyUnicode_GET_LENGTH(self);
11792    kind = PyUnicode_KIND(self);
11793    data = PyUnicode_DATA(self);
11794
11795    /* Shortcut for single character strings */
11796    if (length == 1)
11797        return PyBool_FromLong(
11798            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11799
11800    /* Special case for empty strings */
11801    if (length == 0)
11802        return PyBool_FromLong(0);
11803
11804    cased = 0;
11805    for (i = 0; i < length; i++) {
11806        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11807
11808        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11809            return PyBool_FromLong(0);
11810        else if (!cased && Py_UNICODE_ISUPPER(ch))
11811            cased = 1;
11812    }
11813    return PyBool_FromLong(cased);
11814}
11815
11816PyDoc_STRVAR(istitle__doc__,
11817             "S.istitle() -> bool\n\
11818\n\
11819Return True if S is a titlecased string and there is at least one\n\
11820character in S, i.e. upper- and titlecase characters may only\n\
11821follow uncased characters and lowercase characters only cased ones.\n\
11822Return False otherwise.");
11823
11824static PyObject*
11825unicode_istitle(PyObject *self)
11826{
11827    Py_ssize_t i, length;
11828    int kind;
11829    void *data;
11830    int cased, previous_is_cased;
11831
11832    if (PyUnicode_READY(self) == -1)
11833        return NULL;
11834    length = PyUnicode_GET_LENGTH(self);
11835    kind = PyUnicode_KIND(self);
11836    data = PyUnicode_DATA(self);
11837
11838    /* Shortcut for single character strings */
11839    if (length == 1) {
11840        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11841        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11842                               (Py_UNICODE_ISUPPER(ch) != 0));
11843    }
11844
11845    /* Special case for empty strings */
11846    if (length == 0)
11847        return PyBool_FromLong(0);
11848
11849    cased = 0;
11850    previous_is_cased = 0;
11851    for (i = 0; i < length; i++) {
11852        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11853
11854        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11855            if (previous_is_cased)
11856                return PyBool_FromLong(0);
11857            previous_is_cased = 1;
11858            cased = 1;
11859        }
11860        else if (Py_UNICODE_ISLOWER(ch)) {
11861            if (!previous_is_cased)
11862                return PyBool_FromLong(0);
11863            previous_is_cased = 1;
11864            cased = 1;
11865        }
11866        else
11867            previous_is_cased = 0;
11868    }
11869    return PyBool_FromLong(cased);
11870}
11871
11872PyDoc_STRVAR(isspace__doc__,
11873             "S.isspace() -> bool\n\
11874\n\
11875Return True if all characters in S are whitespace\n\
11876and there is at least one character in S, False otherwise.");
11877
11878static PyObject*
11879unicode_isspace(PyObject *self)
11880{
11881    Py_ssize_t i, length;
11882    int kind;
11883    void *data;
11884
11885    if (PyUnicode_READY(self) == -1)
11886        return NULL;
11887    length = PyUnicode_GET_LENGTH(self);
11888    kind = PyUnicode_KIND(self);
11889    data = PyUnicode_DATA(self);
11890
11891    /* Shortcut for single character strings */
11892    if (length == 1)
11893        return PyBool_FromLong(
11894            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11895
11896    /* Special case for empty strings */
11897    if (length == 0)
11898        return PyBool_FromLong(0);
11899
11900    for (i = 0; i < length; i++) {
11901        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11902        if (!Py_UNICODE_ISSPACE(ch))
11903            return PyBool_FromLong(0);
11904    }
11905    return PyBool_FromLong(1);
11906}
11907
11908PyDoc_STRVAR(isalpha__doc__,
11909             "S.isalpha() -> bool\n\
11910\n\
11911Return True if all characters in S are alphabetic\n\
11912and there is at least one character in S, False otherwise.");
11913
11914static PyObject*
11915unicode_isalpha(PyObject *self)
11916{
11917    Py_ssize_t i, length;
11918    int kind;
11919    void *data;
11920
11921    if (PyUnicode_READY(self) == -1)
11922        return NULL;
11923    length = PyUnicode_GET_LENGTH(self);
11924    kind = PyUnicode_KIND(self);
11925    data = PyUnicode_DATA(self);
11926
11927    /* Shortcut for single character strings */
11928    if (length == 1)
11929        return PyBool_FromLong(
11930            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11931
11932    /* Special case for empty strings */
11933    if (length == 0)
11934        return PyBool_FromLong(0);
11935
11936    for (i = 0; i < length; i++) {
11937        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11938            return PyBool_FromLong(0);
11939    }
11940    return PyBool_FromLong(1);
11941}
11942
11943PyDoc_STRVAR(isalnum__doc__,
11944             "S.isalnum() -> bool\n\
11945\n\
11946Return True if all characters in S are alphanumeric\n\
11947and there is at least one character in S, False otherwise.");
11948
11949static PyObject*
11950unicode_isalnum(PyObject *self)
11951{
11952    int kind;
11953    void *data;
11954    Py_ssize_t len, i;
11955
11956    if (PyUnicode_READY(self) == -1)
11957        return NULL;
11958
11959    kind = PyUnicode_KIND(self);
11960    data = PyUnicode_DATA(self);
11961    len = PyUnicode_GET_LENGTH(self);
11962
11963    /* Shortcut for single character strings */
11964    if (len == 1) {
11965        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11966        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11967    }
11968
11969    /* Special case for empty strings */
11970    if (len == 0)
11971        return PyBool_FromLong(0);
11972
11973    for (i = 0; i < len; i++) {
11974        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11975        if (!Py_UNICODE_ISALNUM(ch))
11976            return PyBool_FromLong(0);
11977    }
11978    return PyBool_FromLong(1);
11979}
11980
11981PyDoc_STRVAR(isdecimal__doc__,
11982             "S.isdecimal() -> bool\n\
11983\n\
11984Return True if there are only decimal characters in S,\n\
11985False otherwise.");
11986
11987static PyObject*
11988unicode_isdecimal(PyObject *self)
11989{
11990    Py_ssize_t i, length;
11991    int kind;
11992    void *data;
11993
11994    if (PyUnicode_READY(self) == -1)
11995        return NULL;
11996    length = PyUnicode_GET_LENGTH(self);
11997    kind = PyUnicode_KIND(self);
11998    data = PyUnicode_DATA(self);
11999
12000    /* Shortcut for single character strings */
12001    if (length == 1)
12002        return PyBool_FromLong(
12003            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12004
12005    /* Special case for empty strings */
12006    if (length == 0)
12007        return PyBool_FromLong(0);
12008
12009    for (i = 0; i < length; i++) {
12010        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12011            return PyBool_FromLong(0);
12012    }
12013    return PyBool_FromLong(1);
12014}
12015
12016PyDoc_STRVAR(isdigit__doc__,
12017             "S.isdigit() -> bool\n\
12018\n\
12019Return True if all characters in S are digits\n\
12020and there is at least one character in S, False otherwise.");
12021
12022static PyObject*
12023unicode_isdigit(PyObject *self)
12024{
12025    Py_ssize_t i, length;
12026    int kind;
12027    void *data;
12028
12029    if (PyUnicode_READY(self) == -1)
12030        return NULL;
12031    length = PyUnicode_GET_LENGTH(self);
12032    kind = PyUnicode_KIND(self);
12033    data = PyUnicode_DATA(self);
12034
12035    /* Shortcut for single character strings */
12036    if (length == 1) {
12037        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12038        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12039    }
12040
12041    /* Special case for empty strings */
12042    if (length == 0)
12043        return PyBool_FromLong(0);
12044
12045    for (i = 0; i < length; i++) {
12046        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12047            return PyBool_FromLong(0);
12048    }
12049    return PyBool_FromLong(1);
12050}
12051
12052PyDoc_STRVAR(isnumeric__doc__,
12053             "S.isnumeric() -> bool\n\
12054\n\
12055Return True if there are only numeric characters in S,\n\
12056False otherwise.");
12057
12058static PyObject*
12059unicode_isnumeric(PyObject *self)
12060{
12061    Py_ssize_t i, length;
12062    int kind;
12063    void *data;
12064
12065    if (PyUnicode_READY(self) == -1)
12066        return NULL;
12067    length = PyUnicode_GET_LENGTH(self);
12068    kind = PyUnicode_KIND(self);
12069    data = PyUnicode_DATA(self);
12070
12071    /* Shortcut for single character strings */
12072    if (length == 1)
12073        return PyBool_FromLong(
12074            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12075
12076    /* Special case for empty strings */
12077    if (length == 0)
12078        return PyBool_FromLong(0);
12079
12080    for (i = 0; i < length; i++) {
12081        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12082            return PyBool_FromLong(0);
12083    }
12084    return PyBool_FromLong(1);
12085}
12086
12087int
12088PyUnicode_IsIdentifier(PyObject *self)
12089{
12090    int kind;
12091    void *data;
12092    Py_ssize_t i;
12093    Py_UCS4 first;
12094
12095    if (PyUnicode_READY(self) == -1) {
12096        Py_FatalError("identifier not ready");
12097        return 0;
12098    }
12099
12100    /* Special case for empty strings */
12101    if (PyUnicode_GET_LENGTH(self) == 0)
12102        return 0;
12103    kind = PyUnicode_KIND(self);
12104    data = PyUnicode_DATA(self);
12105
12106    /* PEP 3131 says that the first character must be in
12107       XID_Start and subsequent characters in XID_Continue,
12108       and for the ASCII range, the 2.x rules apply (i.e
12109       start with letters and underscore, continue with
12110       letters, digits, underscore). However, given the current
12111       definition of XID_Start and XID_Continue, it is sufficient
12112       to check just for these, except that _ must be allowed
12113       as starting an identifier.  */
12114    first = PyUnicode_READ(kind, data, 0);
12115    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12116        return 0;
12117
12118    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12119        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12120            return 0;
12121    return 1;
12122}
12123
PyDoc_STRVAR(isidentifier__doc__,
             "S.isidentifier() -> bool\n\
\n\
Return True if S is a valid identifier according\n\
to the language definition.\n\
\n\
Use keyword.iskeyword() to test for reserved identifiers\n\
such as \"def\" and \"class\".\n");

/* Implementation of str.isidentifier(): converts the int result of
   PyUnicode_IsIdentifier() into a bool object. */
static PyObject*
unicode_isidentifier(PyObject *self)
{
    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
}
12138
12139PyDoc_STRVAR(isprintable__doc__,
12140             "S.isprintable() -> bool\n\
12141\n\
12142Return True if all characters in S are considered\n\
12143printable in repr() or S is empty, False otherwise.");
12144
12145static PyObject*
12146unicode_isprintable(PyObject *self)
12147{
12148    Py_ssize_t i, length;
12149    int kind;
12150    void *data;
12151
12152    if (PyUnicode_READY(self) == -1)
12153        return NULL;
12154    length = PyUnicode_GET_LENGTH(self);
12155    kind = PyUnicode_KIND(self);
12156    data = PyUnicode_DATA(self);
12157
12158    /* Shortcut for single character strings */
12159    if (length == 1)
12160        return PyBool_FromLong(
12161            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12162
12163    for (i = 0; i < length; i++) {
12164        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12165            Py_RETURN_FALSE;
12166        }
12167    }
12168    Py_RETURN_TRUE;
12169}
12170
PyDoc_STRVAR(join__doc__,
             "S.join(iterable) -> str\n\
\n\
Return a string which is the concatenation of the strings in the\n\
iterable.  The separator between elements is S.");

/* Implementation of str.join(): `self` is passed as the first argument
   and `data` as the second argument of PyUnicode_Join(), whose result is
   returned unchanged. */
static PyObject*
unicode_join(PyObject *self, PyObject *data)
{
    return PyUnicode_Join(self, data);
}
12182
/* Return the length of `self` as reported by PyUnicode_GET_LENGTH(), or
   -1 when readying the string fails. */
static Py_ssize_t
unicode_length(PyObject *self)
{
    if (PyUnicode_READY(self) == -1)
        return -1;
    return PyUnicode_GET_LENGTH(self);
}
12190
12191PyDoc_STRVAR(ljust__doc__,
12192             "S.ljust(width[, fillchar]) -> str\n\
12193\n\
12194Return S left-justified in a Unicode string of length width. Padding is\n\
12195done using the specified fill character (default is a space).");
12196
12197static PyObject *
12198unicode_ljust(PyObject *self, PyObject *args)
12199{
12200    Py_ssize_t width;
12201    Py_UCS4 fillchar = ' ';
12202
12203    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
12204        return NULL;
12205
12206    if (PyUnicode_READY(self) == -1)
12207        return NULL;
12208
12209    if (PyUnicode_GET_LENGTH(self) >= width)
12210        return unicode_result_unchanged(self);
12211
12212    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12213}
12214
12215PyDoc_STRVAR(lower__doc__,
12216             "S.lower() -> str\n\
12217\n\
12218Return a copy of the string S converted to lowercase.");
12219
12220static PyObject*
12221unicode_lower(PyObject *self)
12222{
12223    if (PyUnicode_READY(self) == -1)
12224        return NULL;
12225    if (PyUnicode_IS_ASCII(self))
12226        return ascii_upper_or_lower(self, 1);
12227    return case_operation(self, do_lower);
12228}
12229
/* Strip modes.  The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Each entry is a format string of the form "|O:<method name>". */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the three leading characters
   "|O:" of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
12238
12239/* externally visible for str.strip(unicode) */
12240PyObject *
12241_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12242{
12243    void *data;
12244    int kind;
12245    Py_ssize_t i, j, len;
12246    BLOOM_MASK sepmask;
12247    Py_ssize_t seplen;
12248
12249    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12250        return NULL;
12251
12252    kind = PyUnicode_KIND(self);
12253    data = PyUnicode_DATA(self);
12254    len = PyUnicode_GET_LENGTH(self);
12255    seplen = PyUnicode_GET_LENGTH(sepobj);
12256    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12257                              PyUnicode_DATA(sepobj),
12258                              seplen);
12259
12260    i = 0;
12261    if (striptype != RIGHTSTRIP) {
12262        while (i < len) {
12263            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12264            if (!BLOOM(sepmask, ch))
12265                break;
12266            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12267                break;
12268            i++;
12269        }
12270    }
12271
12272    j = len;
12273    if (striptype != LEFTSTRIP) {
12274        j--;
12275        while (j >= i) {
12276            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12277            if (!BLOOM(sepmask, ch))
12278                break;
12279            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12280                break;
12281            j--;
12282        }
12283
12284        j++;
12285    }
12286
12287    return PyUnicode_Substring(self, i, j);
12288}
12289
12290PyObject*
12291PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12292{
12293    unsigned char *data;
12294    int kind;
12295    Py_ssize_t length;
12296
12297    if (PyUnicode_READY(self) == -1)
12298        return NULL;
12299
12300    length = PyUnicode_GET_LENGTH(self);
12301    end = Py_MIN(end, length);
12302
12303    if (start == 0 && end == length)
12304        return unicode_result_unchanged(self);
12305
12306    if (start < 0 || end < 0) {
12307        PyErr_SetString(PyExc_IndexError, "string index out of range");
12308        return NULL;
12309    }
12310    if (start >= length || end < start)
12311        _Py_RETURN_UNICODE_EMPTY();
12312
12313    length = end - start;
12314    if (PyUnicode_IS_ASCII(self)) {
12315        data = PyUnicode_1BYTE_DATA(self);
12316        return _PyUnicode_FromASCII((char*)(data + start), length);
12317    }
12318    else {
12319        kind = PyUnicode_KIND(self);
12320        data = PyUnicode_1BYTE_DATA(self);
12321        return PyUnicode_FromKindAndData(kind,
12322                                         data + kind * start,
12323                                         length);
12324    }
12325}
12326
12327static PyObject *
12328do_strip(PyObject *self, int striptype)
12329{
12330    Py_ssize_t len, i, j;
12331
12332    if (PyUnicode_READY(self) == -1)
12333        return NULL;
12334
12335    len = PyUnicode_GET_LENGTH(self);
12336
12337    if (PyUnicode_IS_ASCII(self)) {
12338        Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12339
12340        i = 0;
12341        if (striptype != RIGHTSTRIP) {
12342            while (i < len) {
12343                Py_UCS1 ch = data[i];
12344                if (!_Py_ascii_whitespace[ch])
12345                    break;
12346                i++;
12347            }
12348        }
12349
12350        j = len;
12351        if (striptype != LEFTSTRIP) {
12352            j--;
12353            while (j >= i) {
12354                Py_UCS1 ch = data[j];
12355                if (!_Py_ascii_whitespace[ch])
12356                    break;
12357                j--;
12358            }
12359            j++;
12360        }
12361    }
12362    else {
12363        int kind = PyUnicode_KIND(self);
12364        void *data = PyUnicode_DATA(self);
12365
12366        i = 0;
12367        if (striptype != RIGHTSTRIP) {
12368            while (i < len) {
12369                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12370                if (!Py_UNICODE_ISSPACE(ch))
12371                    break;
12372                i++;
12373            }
12374        }
12375
12376        j = len;
12377        if (striptype != LEFTSTRIP) {
12378            j--;
12379            while (j >= i) {
12380                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12381                if (!Py_UNICODE_ISSPACE(ch))
12382                    break;
12383                j--;
12384            }
12385            j++;
12386        }
12387    }
12388
12389    return PyUnicode_Substring(self, i, j);
12390}
12391
12392
12393static PyObject *
12394do_argstrip(PyObject *self, int striptype, PyObject *args)
12395{
12396    PyObject *sep = NULL;
12397
12398    if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
12399        return NULL;
12400
12401    if (sep != NULL && sep != Py_None) {
12402        if (PyUnicode_Check(sep))
12403            return _PyUnicode_XStrip(self, striptype, sep);
12404        else {
12405            PyErr_Format(PyExc_TypeError,
12406                         "%s arg must be None or str",
12407                         STRIPNAME(striptype));
12408            return NULL;
12409        }
12410    }
12411
12412    return do_strip(self, striptype);
12413}
12414
12415
12416PyDoc_STRVAR(strip__doc__,
12417             "S.strip([chars]) -> str\n\
12418\n\
12419Return a copy of the string S with leading and trailing\n\
12420whitespace removed.\n\
12421If chars is given and not None, remove characters in chars instead.");
12422
12423static PyObject *
12424unicode_strip(PyObject *self, PyObject *args)
12425{
12426    if (PyTuple_GET_SIZE(args) == 0)
12427        return do_strip(self, BOTHSTRIP); /* Common case */
12428    else
12429        return do_argstrip(self, BOTHSTRIP, args);
12430}
12431
12432
12433PyDoc_STRVAR(lstrip__doc__,
12434             "S.lstrip([chars]) -> str\n\
12435\n\
12436Return a copy of the string S with leading whitespace removed.\n\
12437If chars is given and not None, remove characters in chars instead.");
12438
12439static PyObject *
12440unicode_lstrip(PyObject *self, PyObject *args)
12441{
12442    if (PyTuple_GET_SIZE(args) == 0)
12443        return do_strip(self, LEFTSTRIP); /* Common case */
12444    else
12445        return do_argstrip(self, LEFTSTRIP, args);
12446}
12447
12448
12449PyDoc_STRVAR(rstrip__doc__,
12450             "S.rstrip([chars]) -> str\n\
12451\n\
12452Return a copy of the string S with trailing whitespace removed.\n\
12453If chars is given and not None, remove characters in chars instead.");
12454
12455static PyObject *
12456unicode_rstrip(PyObject *self, PyObject *args)
12457{
12458    if (PyTuple_GET_SIZE(args) == 0)
12459        return do_strip(self, RIGHTSTRIP); /* Common case */
12460    else
12461        return do_argstrip(self, RIGHTSTRIP, args);
12462}
12463
12464
12465static PyObject*
12466unicode_repeat(PyObject *str, Py_ssize_t len)
12467{
12468    PyObject *u;
12469    Py_ssize_t nchars, n;
12470
12471    if (len < 1)
12472        _Py_RETURN_UNICODE_EMPTY();
12473
12474    /* no repeat, return original string */
12475    if (len == 1)
12476        return unicode_result_unchanged(str);
12477
12478    if (PyUnicode_READY(str) == -1)
12479        return NULL;
12480
12481    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12482        PyErr_SetString(PyExc_OverflowError,
12483                        "repeated string is too long");
12484        return NULL;
12485    }
12486    nchars = len * PyUnicode_GET_LENGTH(str);
12487
12488    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12489    if (!u)
12490        return NULL;
12491    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12492
12493    if (PyUnicode_GET_LENGTH(str) == 1) {
12494        const int kind = PyUnicode_KIND(str);
12495        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12496        if (kind == PyUnicode_1BYTE_KIND) {
12497            void *to = PyUnicode_DATA(u);
12498            memset(to, (unsigned char)fill_char, len);
12499        }
12500        else if (kind == PyUnicode_2BYTE_KIND) {
12501            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12502            for (n = 0; n < len; ++n)
12503                ucs2[n] = fill_char;
12504        } else {
12505            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12506            assert(kind == PyUnicode_4BYTE_KIND);
12507            for (n = 0; n < len; ++n)
12508                ucs4[n] = fill_char;
12509        }
12510    }
12511    else {
12512        /* number of characters copied this far */
12513        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12514        const Py_ssize_t char_size = PyUnicode_KIND(str);
12515        char *to = (char *) PyUnicode_DATA(u);
12516        memcpy(to, PyUnicode_DATA(str),
12517                  PyUnicode_GET_LENGTH(str) * char_size);
12518        while (done < nchars) {
12519            n = (done <= nchars-done) ? done : nchars-done;
12520            memcpy(to + (done * char_size), to, n * char_size);
12521            done += n;
12522        }
12523    }
12524
12525    assert(_PyUnicode_CheckConsistency(u, 1));
12526    return u;
12527}
12528
12529PyObject *
12530PyUnicode_Replace(PyObject *str,
12531                  PyObject *substr,
12532                  PyObject *replstr,
12533                  Py_ssize_t maxcount)
12534{
12535    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12536            ensure_unicode(replstr) < 0)
12537        return NULL;
12538    return replace(str, substr, replstr, maxcount);
12539}
12540
12541PyDoc_STRVAR(replace__doc__,
12542             "S.replace(old, new[, count]) -> str\n\
12543\n\
12544Return a copy of S with all occurrences of substring\n\
12545old replaced by new.  If the optional argument count is\n\
12546given, only the first count occurrences are replaced.");
12547
12548static PyObject*
12549unicode_replace(PyObject *self, PyObject *args)
12550{
12551    PyObject *str1;
12552    PyObject *str2;
12553    Py_ssize_t maxcount = -1;
12554
12555    if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
12556        return NULL;
12557    if (PyUnicode_READY(self) == -1)
12558        return NULL;
12559    return replace(self, str1, str2, maxcount);
12560}
12561
12562static PyObject *
12563unicode_repr(PyObject *unicode)
12564{
12565    PyObject *repr;
12566    Py_ssize_t isize;
12567    Py_ssize_t osize, squote, dquote, i, o;
12568    Py_UCS4 max, quote;
12569    int ikind, okind, unchanged;
12570    void *idata, *odata;
12571
12572    if (PyUnicode_READY(unicode) == -1)
12573        return NULL;
12574
12575    isize = PyUnicode_GET_LENGTH(unicode);
12576    idata = PyUnicode_DATA(unicode);
12577
12578    /* Compute length of output, quote characters, and
12579       maximum character */
12580    osize = 0;
12581    max = 127;
12582    squote = dquote = 0;
12583    ikind = PyUnicode_KIND(unicode);
12584    for (i = 0; i < isize; i++) {
12585        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12586        Py_ssize_t incr = 1;
12587        switch (ch) {
12588        case '\'': squote++; break;
12589        case '"':  dquote++; break;
12590        case '\\': case '\t': case '\r': case '\n':
12591            incr = 2;
12592            break;
12593        default:
12594            /* Fast-path ASCII */
12595            if (ch < ' ' || ch == 0x7f)
12596                incr = 4; /* \xHH */
12597            else if (ch < 0x7f)
12598                ;
12599            else if (Py_UNICODE_ISPRINTABLE(ch))
12600                max = ch > max ? ch : max;
12601            else if (ch < 0x100)
12602                incr = 4; /* \xHH */
12603            else if (ch < 0x10000)
12604                incr = 6; /* \uHHHH */
12605            else
12606                incr = 10; /* \uHHHHHHHH */
12607        }
12608        if (osize > PY_SSIZE_T_MAX - incr) {
12609            PyErr_SetString(PyExc_OverflowError,
12610                            "string is too long to generate repr");
12611            return NULL;
12612        }
12613        osize += incr;
12614    }
12615
12616    quote = '\'';
12617    unchanged = (osize == isize);
12618    if (squote) {
12619        unchanged = 0;
12620        if (dquote)
12621            /* Both squote and dquote present. Use squote,
12622               and escape them */
12623            osize += squote;
12624        else
12625            quote = '"';
12626    }
12627    osize += 2;   /* quotes */
12628
12629    repr = PyUnicode_New(osize, max);
12630    if (repr == NULL)
12631        return NULL;
12632    okind = PyUnicode_KIND(repr);
12633    odata = PyUnicode_DATA(repr);
12634
12635    PyUnicode_WRITE(okind, odata, 0, quote);
12636    PyUnicode_WRITE(okind, odata, osize-1, quote);
12637    if (unchanged) {
12638        _PyUnicode_FastCopyCharacters(repr, 1,
12639                                      unicode, 0,
12640                                      isize);
12641    }
12642    else {
12643        for (i = 0, o = 1; i < isize; i++) {
12644            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12645
12646            /* Escape quotes and backslashes */
12647            if ((ch == quote) || (ch == '\\')) {
12648                PyUnicode_WRITE(okind, odata, o++, '\\');
12649                PyUnicode_WRITE(okind, odata, o++, ch);
12650                continue;
12651            }
12652
12653            /* Map special whitespace to '\t', \n', '\r' */
12654            if (ch == '\t') {
12655                PyUnicode_WRITE(okind, odata, o++, '\\');
12656                PyUnicode_WRITE(okind, odata, o++, 't');
12657            }
12658            else if (ch == '\n') {
12659                PyUnicode_WRITE(okind, odata, o++, '\\');
12660                PyUnicode_WRITE(okind, odata, o++, 'n');
12661            }
12662            else if (ch == '\r') {
12663                PyUnicode_WRITE(okind, odata, o++, '\\');
12664                PyUnicode_WRITE(okind, odata, o++, 'r');
12665            }
12666
12667            /* Map non-printable US ASCII to '\xhh' */
12668            else if (ch < ' ' || ch == 0x7F) {
12669                PyUnicode_WRITE(okind, odata, o++, '\\');
12670                PyUnicode_WRITE(okind, odata, o++, 'x');
12671                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12672                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12673            }
12674
12675            /* Copy ASCII characters as-is */
12676            else if (ch < 0x7F) {
12677                PyUnicode_WRITE(okind, odata, o++, ch);
12678            }
12679
12680            /* Non-ASCII characters */
12681            else {
12682                /* Map Unicode whitespace and control characters
12683                   (categories Z* and C* except ASCII space)
12684                */
12685                if (!Py_UNICODE_ISPRINTABLE(ch)) {
12686                    PyUnicode_WRITE(okind, odata, o++, '\\');
12687                    /* Map 8-bit characters to '\xhh' */
12688                    if (ch <= 0xff) {
12689                        PyUnicode_WRITE(okind, odata, o++, 'x');
12690                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12691                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12692                    }
12693                    /* Map 16-bit characters to '\uxxxx' */
12694                    else if (ch <= 0xffff) {
12695                        PyUnicode_WRITE(okind, odata, o++, 'u');
12696                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12697                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12698                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12699                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12700                    }
12701                    /* Map 21-bit characters to '\U00xxxxxx' */
12702                    else {
12703                        PyUnicode_WRITE(okind, odata, o++, 'U');
12704                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12705                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12706                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12707                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12708                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12709                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12710                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12711                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12712                    }
12713                }
12714                /* Copy characters as-is */
12715                else {
12716                    PyUnicode_WRITE(okind, odata, o++, ch);
12717                }
12718            }
12719        }
12720    }
12721    /* Closing quote already added at the beginning */
12722    assert(_PyUnicode_CheckConsistency(repr, 1));
12723    return repr;
12724}
12725
12726PyDoc_STRVAR(rfind__doc__,
12727             "S.rfind(sub[, start[, end]]) -> int\n\
12728\n\
12729Return the highest index in S where substring sub is found,\n\
12730such that sub is contained within S[start:end].  Optional\n\
12731arguments start and end are interpreted as in slice notation.\n\
12732\n\
12733Return -1 on failure.");
12734
12735static PyObject *
12736unicode_rfind(PyObject *self, PyObject *args)
12737{
12738    /* initialize variables to prevent gcc warning */
12739    PyObject *substring = NULL;
12740    Py_ssize_t start = 0;
12741    Py_ssize_t end = 0;
12742    Py_ssize_t result;
12743
12744    if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12745        return NULL;
12746
12747    if (PyUnicode_READY(self) == -1)
12748        return NULL;
12749
12750    result = any_find_slice(self, substring, start, end, -1);
12751
12752    if (result == -2)
12753        return NULL;
12754
12755    return PyLong_FromSsize_t(result);
12756}
12757
12758PyDoc_STRVAR(rindex__doc__,
12759             "S.rindex(sub[, start[, end]]) -> int\n\
12760\n\
12761Like S.rfind() but raise ValueError when the substring is not found.");
12762
12763static PyObject *
12764unicode_rindex(PyObject *self, PyObject *args)
12765{
12766    /* initialize variables to prevent gcc warning */
12767    PyObject *substring = NULL;
12768    Py_ssize_t start = 0;
12769    Py_ssize_t end = 0;
12770    Py_ssize_t result;
12771
12772    if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12773        return NULL;
12774
12775    if (PyUnicode_READY(self) == -1)
12776        return NULL;
12777
12778    result = any_find_slice(self, substring, start, end, -1);
12779
12780    if (result == -2)
12781        return NULL;
12782
12783    if (result < 0) {
12784        PyErr_SetString(PyExc_ValueError, "substring not found");
12785        return NULL;
12786    }
12787
12788    return PyLong_FromSsize_t(result);
12789}
12790
12791PyDoc_STRVAR(rjust__doc__,
12792             "S.rjust(width[, fillchar]) -> str\n\
12793\n\
12794Return S right-justified in a string of length width. Padding is\n\
12795done using the specified fill character (default is a space).");
12796
12797static PyObject *
12798unicode_rjust(PyObject *self, PyObject *args)
12799{
12800    Py_ssize_t width;
12801    Py_UCS4 fillchar = ' ';
12802
12803    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12804        return NULL;
12805
12806    if (PyUnicode_READY(self) == -1)
12807        return NULL;
12808
12809    if (PyUnicode_GET_LENGTH(self) >= width)
12810        return unicode_result_unchanged(self);
12811
12812    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12813}
12814
12815PyObject *
12816PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12817{
12818    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12819        return NULL;
12820
12821    return split(s, sep, maxsplit);
12822}
12823
12824PyDoc_STRVAR(split__doc__,
12825             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12826\n\
12827Return a list of the words in S, using sep as the\n\
12828delimiter string.  If maxsplit is given, at most maxsplit\n\
12829splits are done. If sep is not specified or is None, any\n\
12830whitespace string is a separator and empty strings are\n\
12831removed from the result.");
12832
12833static PyObject*
12834unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12835{
12836    static char *kwlist[] = {"sep", "maxsplit", 0};
12837    PyObject *substring = Py_None;
12838    Py_ssize_t maxcount = -1;
12839
12840    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12841                                     kwlist, &substring, &maxcount))
12842        return NULL;
12843
12844    if (substring == Py_None)
12845        return split(self, NULL, maxcount);
12846
12847    if (PyUnicode_Check(substring))
12848        return split(self, substring, maxcount);
12849
12850    PyErr_Format(PyExc_TypeError,
12851                 "must be str or None, not %.100s",
12852                 Py_TYPE(substring)->tp_name);
12853    return NULL;
12854}
12855
12856PyObject *
12857PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12858{
12859    PyObject* out;
12860    int kind1, kind2;
12861    void *buf1, *buf2;
12862    Py_ssize_t len1, len2;
12863
12864    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12865        return NULL;
12866
12867    kind1 = PyUnicode_KIND(str_obj);
12868    kind2 = PyUnicode_KIND(sep_obj);
12869    len1 = PyUnicode_GET_LENGTH(str_obj);
12870    len2 = PyUnicode_GET_LENGTH(sep_obj);
12871    if (kind1 < kind2 || len1 < len2) {
12872        _Py_INCREF_UNICODE_EMPTY();
12873        if (!unicode_empty)
12874            out = NULL;
12875        else {
12876            out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12877            Py_DECREF(unicode_empty);
12878        }
12879        return out;
12880    }
12881    buf1 = PyUnicode_DATA(str_obj);
12882    buf2 = PyUnicode_DATA(sep_obj);
12883    if (kind2 != kind1) {
12884        buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12885        if (!buf2)
12886            return NULL;
12887    }
12888
12889    switch (kind1) {
12890    case PyUnicode_1BYTE_KIND:
12891        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12892            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12893        else
12894            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12895        break;
12896    case PyUnicode_2BYTE_KIND:
12897        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12898        break;
12899    case PyUnicode_4BYTE_KIND:
12900        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12901        break;
12902    default:
12903        assert(0);
12904        out = 0;
12905    }
12906
12907    if (kind2 != kind1)
12908        PyMem_Free(buf2);
12909
12910    return out;
12911}
12912
12913
12914PyObject *
12915PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12916{
12917    PyObject* out;
12918    int kind1, kind2;
12919    void *buf1, *buf2;
12920    Py_ssize_t len1, len2;
12921
12922    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12923        return NULL;
12924
12925    kind1 = PyUnicode_KIND(str_obj);
12926    kind2 = PyUnicode_KIND(sep_obj);
12927    len1 = PyUnicode_GET_LENGTH(str_obj);
12928    len2 = PyUnicode_GET_LENGTH(sep_obj);
12929    if (kind1 < kind2 || len1 < len2) {
12930        _Py_INCREF_UNICODE_EMPTY();
12931        if (!unicode_empty)
12932            out = NULL;
12933        else {
12934            out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12935            Py_DECREF(unicode_empty);
12936        }
12937        return out;
12938    }
12939    buf1 = PyUnicode_DATA(str_obj);
12940    buf2 = PyUnicode_DATA(sep_obj);
12941    if (kind2 != kind1) {
12942        buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12943        if (!buf2)
12944            return NULL;
12945    }
12946
12947    switch (kind1) {
12948    case PyUnicode_1BYTE_KIND:
12949        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12950            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12951        else
12952            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12953        break;
12954    case PyUnicode_2BYTE_KIND:
12955        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12956        break;
12957    case PyUnicode_4BYTE_KIND:
12958        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12959        break;
12960    default:
12961        assert(0);
12962        out = 0;
12963    }
12964
12965    if (kind2 != kind1)
12966        PyMem_Free(buf2);
12967
12968    return out;
12969}
12970
12971PyDoc_STRVAR(partition__doc__,
12972             "S.partition(sep) -> (head, sep, tail)\n\
12973\n\
12974Search for the separator sep in S, and return the part before it,\n\
12975the separator itself, and the part after it.  If the separator is not\n\
12976found, return S and two empty strings.");
12977
12978static PyObject*
12979unicode_partition(PyObject *self, PyObject *separator)
12980{
12981    return PyUnicode_Partition(self, separator);
12982}
12983
12984PyDoc_STRVAR(rpartition__doc__,
12985             "S.rpartition(sep) -> (head, sep, tail)\n\
12986\n\
12987Search for the separator sep in S, starting at the end of S, and return\n\
12988the part before it, the separator itself, and the part after it.  If the\n\
12989separator is not found, return two empty strings and S.");
12990
12991static PyObject*
12992unicode_rpartition(PyObject *self, PyObject *separator)
12993{
12994    return PyUnicode_RPartition(self, separator);
12995}
12996
12997PyObject *
12998PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12999{
13000    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13001        return NULL;
13002
13003    return rsplit(s, sep, maxsplit);
13004}
13005
13006PyDoc_STRVAR(rsplit__doc__,
13007             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
13008\n\
13009Return a list of the words in S, using sep as the\n\
13010delimiter string, starting at the end of the string and\n\
13011working to the front.  If maxsplit is given, at most maxsplit\n\
13012splits are done. If sep is not specified, any whitespace string\n\
13013is a separator.");
13014
13015static PyObject*
13016unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
13017{
13018    static char *kwlist[] = {"sep", "maxsplit", 0};
13019    PyObject *substring = Py_None;
13020    Py_ssize_t maxcount = -1;
13021
13022    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
13023                                     kwlist, &substring, &maxcount))
13024        return NULL;
13025
13026    if (substring == Py_None)
13027        return rsplit(self, NULL, maxcount);
13028
13029    if (PyUnicode_Check(substring))
13030        return rsplit(self, substring, maxcount);
13031
13032    PyErr_Format(PyExc_TypeError,
13033                 "must be str or None, not %.100s",
13034                 Py_TYPE(substring)->tp_name);
13035    return NULL;
13036}
13037
13038PyDoc_STRVAR(splitlines__doc__,
13039             "S.splitlines([keepends]) -> list of strings\n\
13040\n\
13041Return a list of the lines in S, breaking at line boundaries.\n\
13042Line breaks are not included in the resulting list unless keepends\n\
13043is given and true.");
13044
13045static PyObject*
13046unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
13047{
13048    static char *kwlist[] = {"keepends", 0};
13049    int keepends = 0;
13050
13051    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
13052                                     kwlist, &keepends))
13053        return NULL;
13054
13055    return PyUnicode_Splitlines(self, keepends);
13056}
13057
13058static
13059PyObject *unicode_str(PyObject *self)
13060{
13061    return unicode_result_unchanged(self);
13062}
13063
13064PyDoc_STRVAR(swapcase__doc__,
13065             "S.swapcase() -> str\n\
13066\n\
13067Return a copy of S with uppercase characters converted to lowercase\n\
13068and vice versa.");
13069
13070static PyObject*
13071unicode_swapcase(PyObject *self)
13072{
13073    if (PyUnicode_READY(self) == -1)
13074        return NULL;
13075    return case_operation(self, do_swapcase);
13076}
13077
13078/*[clinic input]
13079
13080@staticmethod
13081str.maketrans as unicode_maketrans
13082
13083  x: object
13084
13085  y: unicode=NULL
13086
13087  z: unicode=NULL
13088
13089  /
13090
13091Return a translation table usable for str.translate().
13092
13093If there is only one argument, it must be a dictionary mapping Unicode
13094ordinals (integers) or characters to Unicode ordinals, strings or None.
13095Character keys will be then converted to ordinals.
13096If there are two arguments, they must be strings of equal length, and
13097in the resulting dictionary, each character in x will be mapped to the
13098character at the same position in y. If there is a third argument, it
13099must be a string, whose characters will be mapped to None in the result.
13100[clinic start generated code]*/
13101
13102static PyObject *
13103unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13104/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13105{
13106    PyObject *new = NULL, *key, *value;
13107    Py_ssize_t i = 0;
13108    int res;
13109
13110    new = PyDict_New();
13111    if (!new)
13112        return NULL;
13113    if (y != NULL) {
13114        int x_kind, y_kind, z_kind;
13115        void *x_data, *y_data, *z_data;
13116
13117        /* x must be a string too, of equal length */
13118        if (!PyUnicode_Check(x)) {
13119            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13120                            "be a string if there is a second argument");
13121            goto err;
13122        }
13123        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13124            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13125                            "arguments must have equal length");
13126            goto err;
13127        }
13128        /* create entries for translating chars in x to those in y */
13129        x_kind = PyUnicode_KIND(x);
13130        y_kind = PyUnicode_KIND(y);
13131        x_data = PyUnicode_DATA(x);
13132        y_data = PyUnicode_DATA(y);
13133        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13134            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13135            if (!key)
13136                goto err;
13137            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13138            if (!value) {
13139                Py_DECREF(key);
13140                goto err;
13141            }
13142            res = PyDict_SetItem(new, key, value);
13143            Py_DECREF(key);
13144            Py_DECREF(value);
13145            if (res < 0)
13146                goto err;
13147        }
13148        /* create entries for deleting chars in z */
13149        if (z != NULL) {
13150            z_kind = PyUnicode_KIND(z);
13151            z_data = PyUnicode_DATA(z);
13152            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13153                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13154                if (!key)
13155                    goto err;
13156                res = PyDict_SetItem(new, key, Py_None);
13157                Py_DECREF(key);
13158                if (res < 0)
13159                    goto err;
13160            }
13161        }
13162    } else {
13163        int kind;
13164        void *data;
13165
13166        /* x must be a dict */
13167        if (!PyDict_CheckExact(x)) {
13168            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13169                            "to maketrans it must be a dict");
13170            goto err;
13171        }
13172        /* copy entries into the new dict, converting string keys to int keys */
13173        while (PyDict_Next(x, &i, &key, &value)) {
13174            if (PyUnicode_Check(key)) {
13175                /* convert string keys to integer keys */
13176                PyObject *newkey;
13177                if (PyUnicode_GET_LENGTH(key) != 1) {
13178                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
13179                                    "table must be of length 1");
13180                    goto err;
13181                }
13182                kind = PyUnicode_KIND(key);
13183                data = PyUnicode_DATA(key);
13184                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13185                if (!newkey)
13186                    goto err;
13187                res = PyDict_SetItem(new, newkey, value);
13188                Py_DECREF(newkey);
13189                if (res < 0)
13190                    goto err;
13191            } else if (PyLong_Check(key)) {
13192                /* just keep integer keys */
13193                if (PyDict_SetItem(new, key, value) < 0)
13194                    goto err;
13195            } else {
13196                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13197                                "be strings or integers");
13198                goto err;
13199            }
13200        }
13201    }
13202    return new;
13203  err:
13204    Py_DECREF(new);
13205    return NULL;
13206}
13207
13208PyDoc_STRVAR(translate__doc__,
13209             "S.translate(table) -> str\n\
13210\n\
13211Return a copy of the string S in which each character has been mapped\n\
13212through the given translation table. The table must implement\n\
13213lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13214mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13215this operation raises LookupError, the character is left untouched.\n\
13216Characters mapped to None are deleted.");
13217
13218static PyObject*
13219unicode_translate(PyObject *self, PyObject *table)
13220{
13221    return _PyUnicode_TranslateCharmap(self, table, "ignore");
13222}
13223
13224PyDoc_STRVAR(upper__doc__,
13225             "S.upper() -> str\n\
13226\n\
13227Return a copy of S converted to uppercase.");
13228
13229static PyObject*
13230unicode_upper(PyObject *self)
13231{
13232    if (PyUnicode_READY(self) == -1)
13233        return NULL;
13234    if (PyUnicode_IS_ASCII(self))
13235        return ascii_upper_or_lower(self, 0);
13236    return case_operation(self, do_upper);
13237}
13238
13239PyDoc_STRVAR(zfill__doc__,
13240             "S.zfill(width) -> str\n\
13241\n\
13242Pad a numeric string S with zeros on the left, to fill a field\n\
13243of the specified width. The string S is never truncated.");
13244
13245static PyObject *
13246unicode_zfill(PyObject *self, PyObject *args)
13247{
13248    Py_ssize_t fill;
13249    PyObject *u;
13250    Py_ssize_t width;
13251    int kind;
13252    void *data;
13253    Py_UCS4 chr;
13254
13255    if (!PyArg_ParseTuple(args, "n:zfill", &width))
13256        return NULL;
13257
13258    if (PyUnicode_READY(self) == -1)
13259        return NULL;
13260
13261    if (PyUnicode_GET_LENGTH(self) >= width)
13262        return unicode_result_unchanged(self);
13263
13264    fill = width - PyUnicode_GET_LENGTH(self);
13265
13266    u = pad(self, fill, 0, '0');
13267
13268    if (u == NULL)
13269        return NULL;
13270
13271    kind = PyUnicode_KIND(u);
13272    data = PyUnicode_DATA(u);
13273    chr = PyUnicode_READ(kind, data, fill);
13274
13275    if (chr == '+' || chr == '-') {
13276        /* move sign to beginning of string */
13277        PyUnicode_WRITE(kind, data, 0, chr);
13278        PyUnicode_WRITE(kind, data, fill, '0');
13279    }
13280
13281    assert(_PyUnicode_CheckConsistency(u, 1));
13282    return u;
13283}
13284
#if 0
/* Compiled out by the surrounding #if 0.  When enabled, this is a thin
   method wrapper that returns
   PyUnicode_TransformDecimalAndSpaceToASCII(self) unchanged. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13292
13293PyDoc_STRVAR(startswith__doc__,
13294             "S.startswith(prefix[, start[, end]]) -> bool\n\
13295\n\
13296Return True if S starts with the specified prefix, False otherwise.\n\
13297With optional start, test S beginning at that position.\n\
13298With optional end, stop comparing S at that position.\n\
13299prefix can also be a tuple of strings to try.");
13300
13301static PyObject *
13302unicode_startswith(PyObject *self,
13303                   PyObject *args)
13304{
13305    PyObject *subobj;
13306    PyObject *substring;
13307    Py_ssize_t start = 0;
13308    Py_ssize_t end = PY_SSIZE_T_MAX;
13309    int result;
13310
13311    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13312        return NULL;
13313    if (PyTuple_Check(subobj)) {
13314        Py_ssize_t i;
13315        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13316            substring = PyTuple_GET_ITEM(subobj, i);
13317            if (!PyUnicode_Check(substring)) {
13318                PyErr_Format(PyExc_TypeError,
13319                             "tuple for startswith must only contain str, "
13320                             "not %.100s",
13321                             Py_TYPE(substring)->tp_name);
13322                return NULL;
13323            }
13324            result = tailmatch(self, substring, start, end, -1);
13325            if (result == -1)
13326                return NULL;
13327            if (result) {
13328                Py_RETURN_TRUE;
13329            }
13330        }
13331        /* nothing matched */
13332        Py_RETURN_FALSE;
13333    }
13334    if (!PyUnicode_Check(subobj)) {
13335        PyErr_Format(PyExc_TypeError,
13336                     "startswith first arg must be str or "
13337                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13338        return NULL;
13339    }
13340    result = tailmatch(self, subobj, start, end, -1);
13341    if (result == -1)
13342        return NULL;
13343    return PyBool_FromLong(result);
13344}
13345
13346
13347PyDoc_STRVAR(endswith__doc__,
13348             "S.endswith(suffix[, start[, end]]) -> bool\n\
13349\n\
13350Return True if S ends with the specified suffix, False otherwise.\n\
13351With optional start, test S beginning at that position.\n\
13352With optional end, stop comparing S at that position.\n\
13353suffix can also be a tuple of strings to try.");
13354
13355static PyObject *
13356unicode_endswith(PyObject *self,
13357                 PyObject *args)
13358{
13359    PyObject *subobj;
13360    PyObject *substring;
13361    Py_ssize_t start = 0;
13362    Py_ssize_t end = PY_SSIZE_T_MAX;
13363    int result;
13364
13365    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13366        return NULL;
13367    if (PyTuple_Check(subobj)) {
13368        Py_ssize_t i;
13369        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13370            substring = PyTuple_GET_ITEM(subobj, i);
13371            if (!PyUnicode_Check(substring)) {
13372                PyErr_Format(PyExc_TypeError,
13373                             "tuple for endswith must only contain str, "
13374                             "not %.100s",
13375                             Py_TYPE(substring)->tp_name);
13376                return NULL;
13377            }
13378            result = tailmatch(self, substring, start, end, +1);
13379            if (result == -1)
13380                return NULL;
13381            if (result) {
13382                Py_RETURN_TRUE;
13383            }
13384        }
13385        Py_RETURN_FALSE;
13386    }
13387    if (!PyUnicode_Check(subobj)) {
13388        PyErr_Format(PyExc_TypeError,
13389                     "endswith first arg must be str or "
13390                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13391        return NULL;
13392    }
13393    result = tailmatch(self, subobj, start, end, +1);
13394    if (result == -1)
13395        return NULL;
13396    return PyBool_FromLong(result);
13397}
13398
13399static inline void
13400_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13401{
13402    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13403    writer->data = PyUnicode_DATA(writer->buffer);
13404
13405    if (!writer->readonly) {
13406        writer->kind = PyUnicode_KIND(writer->buffer);
13407        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13408    }
13409    else {
13410        /* use a value smaller than PyUnicode_1BYTE_KIND() so
13411           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13412        writer->kind = PyUnicode_WCHAR_KIND;
13413        assert(writer->kind <= PyUnicode_1BYTE_KIND);
13414
13415        /* Copy-on-write mode: set buffer size to 0 so
13416         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13417         * next write. */
13418        writer->size = 0;
13419    }
13420}
13421
13422void
13423_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13424{
13425    memset(writer, 0, sizeof(*writer));
13426
13427    /* ASCII is the bare minimum */
13428    writer->min_char = 127;
13429
13430    /* use a value smaller than PyUnicode_1BYTE_KIND() so
13431       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13432    writer->kind = PyUnicode_WCHAR_KIND;
13433    assert(writer->kind <= PyUnicode_1BYTE_KIND);
13434}
13435
/* Out-of-line part of buffer preparation (the assertion below expects to be
   reached through the _PyUnicodeWriter_Prepare macro).

   Makes writer->buffer able to receive `length` more characters at
   writer->pos, each no larger than `maxchar`:
     - no buffer yet:           allocate one with PyUnicode_New();
     - buffer too short:        allocate a wider copy (also done for a
                                read-only buffer) or resize it in place;
     - long enough, too narrow: allocate a wider copy of the same size.
   The cached writer fields are then refreshed by _PyUnicodeWriter_Update().

   Returns 0 on success.  Returns -1 after PyErr_NoMemory() when
   pos + length would overflow, or when an allocation helper returns NULL
   (presumably with its own exception set). */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* refuse requests whose end position does not fit in Py_ssize_t */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* never allocate narrower than the configured minimum character */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* first allocation */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* existing buffer is too short (always the case for a read-only
           buffer, whose cached size is 0) */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen: a fresh buffer receives the characters
               written so far, then the old buffer is released */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* same character width: resize the existing buffer */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* long enough but too narrow: widen at the current size */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* re-derive maxchar/data/kind/size from the (possibly new) buffer */
    _PyUnicodeWriter_Update(writer);
    return 0;

/* OVERALLOCATE_FACTOR is defined outside this block; it is not needed
   after this function */
#undef OVERALLOCATE_FACTOR
}
13512
13513int
13514_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13515                                     enum PyUnicode_Kind kind)
13516{
13517    Py_UCS4 maxchar;
13518
13519    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13520    assert(writer->kind < kind);
13521
13522    switch (kind)
13523    {
13524    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13525    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13526    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13527    default:
13528        assert(0 && "invalid kind");
13529        return -1;
13530    }
13531
13532    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13533}
13534
13535static inline int
13536_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13537{
13538    assert(ch <= MAX_UNICODE);
13539    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13540        return -1;
13541    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13542    writer->pos++;
13543    return 0;
13544}
13545
13546int
13547_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13548{
13549    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13550}
13551
13552int
13553_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13554{
13555    Py_UCS4 maxchar;
13556    Py_ssize_t len;
13557
13558    if (PyUnicode_READY(str) == -1)
13559        return -1;
13560    len = PyUnicode_GET_LENGTH(str);
13561    if (len == 0)
13562        return 0;
13563    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13564    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13565        if (writer->buffer == NULL && !writer->overallocate) {
13566            assert(_PyUnicode_CheckConsistency(str, 1));
13567            writer->readonly = 1;
13568            Py_INCREF(str);
13569            writer->buffer = str;
13570            _PyUnicodeWriter_Update(writer);
13571            writer->pos += len;
13572            return 0;
13573        }
13574        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13575            return -1;
13576    }
13577    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13578                                  str, 0, len);
13579    writer->pos += len;
13580    return 0;
13581}
13582
13583int
13584_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13585                                Py_ssize_t start, Py_ssize_t end)
13586{
13587    Py_UCS4 maxchar;
13588    Py_ssize_t len;
13589
13590    if (PyUnicode_READY(str) == -1)
13591        return -1;
13592
13593    assert(0 <= start);
13594    assert(end <= PyUnicode_GET_LENGTH(str));
13595    assert(start <= end);
13596
13597    if (end == 0)
13598        return 0;
13599
13600    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13601        return _PyUnicodeWriter_WriteStr(writer, str);
13602
13603    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13604        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13605    else
13606        maxchar = writer->maxchar;
13607    len = end - start;
13608
13609    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13610        return -1;
13611
13612    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13613                                  str, start, len);
13614    writer->pos += len;
13615    return 0;
13616}
13617
13618int
13619_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13620                                  const char *ascii, Py_ssize_t len)
13621{
13622    if (len == -1)
13623        len = strlen(ascii);
13624
13625    assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13626
13627    if (writer->buffer == NULL && !writer->overallocate) {
13628        PyObject *str;
13629
13630        str = _PyUnicode_FromASCII(ascii, len);
13631        if (str == NULL)
13632            return -1;
13633
13634        writer->readonly = 1;
13635        writer->buffer = str;
13636        _PyUnicodeWriter_Update(writer);
13637        writer->pos += len;
13638        return 0;
13639    }
13640
13641    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13642        return -1;
13643
13644    switch (writer->kind)
13645    {
13646    case PyUnicode_1BYTE_KIND:
13647    {
13648        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13649        Py_UCS1 *data = writer->data;
13650
13651        memcpy(data + writer->pos, str, len);
13652        break;
13653    }
13654    case PyUnicode_2BYTE_KIND:
13655    {
13656        _PyUnicode_CONVERT_BYTES(
13657            Py_UCS1, Py_UCS2,
13658            ascii, ascii + len,
13659            (Py_UCS2 *)writer->data + writer->pos);
13660        break;
13661    }
13662    case PyUnicode_4BYTE_KIND:
13663    {
13664        _PyUnicode_CONVERT_BYTES(
13665            Py_UCS1, Py_UCS4,
13666            ascii, ascii + len,
13667            (Py_UCS4 *)writer->data + writer->pos);
13668        break;
13669    }
13670    default:
13671        assert(0);
13672    }
13673
13674    writer->pos += len;
13675    return 0;
13676}
13677
13678int
13679_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13680                                   const char *str, Py_ssize_t len)
13681{
13682    Py_UCS4 maxchar;
13683
13684    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13685    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13686        return -1;
13687    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13688    writer->pos += len;
13689    return 0;
13690}
13691
13692PyObject *
13693_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13694{
13695    PyObject *str;
13696
13697    if (writer->pos == 0) {
13698        Py_CLEAR(writer->buffer);
13699        _Py_RETURN_UNICODE_EMPTY();
13700    }
13701
13702    str = writer->buffer;
13703    writer->buffer = NULL;
13704
13705    if (writer->readonly) {
13706        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13707        return str;
13708    }
13709
13710    if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13711        PyObject *str2;
13712        str2 = resize_compact(str, writer->pos);
13713        if (str2 == NULL) {
13714            Py_DECREF(str);
13715            return NULL;
13716        }
13717        str = str2;
13718    }
13719
13720    assert(_PyUnicode_CheckConsistency(str, 1));
13721    return unicode_result_ready(str);
13722}
13723
13724void
13725_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13726{
13727    Py_CLEAR(writer->buffer);
13728}
13729
13730#include "stringlib/unicode_format.h"
13731
/* Docstring referenced by the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring referenced by the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13743
13744static PyObject *
13745unicode__format__(PyObject* self, PyObject* args)
13746{
13747    PyObject *format_spec;
13748    _PyUnicodeWriter writer;
13749    int ret;
13750
13751    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13752        return NULL;
13753
13754    if (PyUnicode_READY(self) == -1)
13755        return NULL;
13756    _PyUnicodeWriter_Init(&writer);
13757    ret = _PyUnicode_FormatAdvancedWriter(&writer,
13758                                          self, format_spec, 0,
13759                                          PyUnicode_GET_LENGTH(format_spec));
13760    if (ret == -1) {
13761        _PyUnicodeWriter_Dealloc(&writer);
13762        return NULL;
13763    }
13764    return _PyUnicodeWriter_Finish(&writer);
13765}
13766
13767PyDoc_STRVAR(p_format__doc__,
13768             "S.__format__(format_spec) -> str\n\
13769\n\
13770Return a formatted version of S as described by format_spec.");
13771
13772static PyObject *
13773unicode__sizeof__(PyObject *v)
13774{
13775    Py_ssize_t size;
13776
13777    /* If it's a compact object, account for base structure +
13778       character data. */
13779    if (PyUnicode_IS_COMPACT_ASCII(v))
13780        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13781    else if (PyUnicode_IS_COMPACT(v))
13782        size = sizeof(PyCompactUnicodeObject) +
13783            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13784    else {
13785        /* If it is a two-block object, account for base object, and
13786           for character block if present. */
13787        size = sizeof(PyUnicodeObject);
13788        if (_PyUnicode_DATA_ANY(v))
13789            size += (PyUnicode_GET_LENGTH(v) + 1) *
13790                PyUnicode_KIND(v);
13791    }
13792    /* If the wstr pointer is present, account for it unless it is shared
13793       with the data pointer. Check if the data is not shared. */
13794    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13795        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13796    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13797        size += PyUnicode_UTF8_LENGTH(v) + 1;
13798
13799    return PyLong_FromSsize_t(size);
13800}
13801
13802PyDoc_STRVAR(sizeof__doc__,
13803             "S.__sizeof__() -> size of S in memory, in bytes");
13804
13805static PyObject *
13806unicode_getnewargs(PyObject *v)
13807{
13808    PyObject *copy = _PyUnicode_Copy(v);
13809    if (!copy)
13810        return NULL;
13811    return Py_BuildValue("(N)", copy);
13812}
13813
/* Method table for the unicode_* implementations in this file.

   Each entry gives the Python-visible method name, the C function, the
   calling-convention flags (METH_NOARGS, METH_O, METH_VARARGS, optionally
   with METH_KEYWORDS) and, where provided, the docstring.  The table ends
   with the {NULL, NULL} sentinel.  Presumably installed as the tp_methods
   of the str type -- the type object is outside this block. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs,
     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines,
     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* entry for maketrans comes from a macro defined outside this block */
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* no docstring is supplied for __getnewargs__ */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13871
13872static PyObject *
13873unicode_mod(PyObject *v, PyObject *w)
13874{
13875    if (!PyUnicode_Check(v))
13876        Py_RETURN_NOTIMPLEMENTED;
13877    return PyUnicode_Format(v, w);
13878}
13879
/* Number-protocol slots.  Only nb_remainder is filled in (with
   unicode_mod, which forwards to PyUnicode_Format()); the slots listed
   before it are 0 and the ones after it are left to default
   initialisation. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13886
/* Sequence-protocol slots: length, concatenation, repetition, item access
   and containment are provided; the slice and item-assignment slots
   are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13897
13898static PyObject*
13899unicode_subscript(PyObject* self, PyObject* item)
13900{
13901    if (PyUnicode_READY(self) == -1)
13902        return NULL;
13903
13904    if (PyIndex_Check(item)) {
13905        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13906        if (i == -1 && PyErr_Occurred())
13907            return NULL;
13908        if (i < 0)
13909            i += PyUnicode_GET_LENGTH(self);
13910        return unicode_getitem(self, i);
13911    } else if (PySlice_Check(item)) {
13912        Py_ssize_t start, stop, step, slicelength, cur, i;
13913        PyObject *result;
13914        void *src_data, *dest_data;
13915        int src_kind, dest_kind;
13916        Py_UCS4 ch, max_char, kind_limit;
13917
13918        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13919                                 &start, &stop, &step, &slicelength) < 0) {
13920            return NULL;
13921        }
13922
13923        if (slicelength <= 0) {
13924            _Py_RETURN_UNICODE_EMPTY();
13925        } else if (start == 0 && step == 1 &&
13926                   slicelength == PyUnicode_GET_LENGTH(self)) {
13927            return unicode_result_unchanged(self);
13928        } else if (step == 1) {
13929            return PyUnicode_Substring(self,
13930                                       start, start + slicelength);
13931        }
13932        /* General case */
13933        src_kind = PyUnicode_KIND(self);
13934        src_data = PyUnicode_DATA(self);
13935        if (!PyUnicode_IS_ASCII(self)) {
13936            kind_limit = kind_maxchar_limit(src_kind);
13937            max_char = 0;
13938            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13939                ch = PyUnicode_READ(src_kind, src_data, cur);
13940                if (ch > max_char) {
13941                    max_char = ch;
13942                    if (max_char >= kind_limit)
13943                        break;
13944                }
13945            }
13946        }
13947        else
13948            max_char = 127;
13949        result = PyUnicode_New(slicelength, max_char);
13950        if (result == NULL)
13951            return NULL;
13952        dest_kind = PyUnicode_KIND(result);
13953        dest_data = PyUnicode_DATA(result);
13954
13955        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13956            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13957            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13958        }
13959        assert(_PyUnicode_CheckConsistency(result, 1));
13960        return result;
13961    } else {
13962        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13963        return NULL;
13964    }
13965}
13966
/* Mapping-protocol slots: length and subscription (unicode_subscript) are
   provided; the item-assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13972
13973
13974/* Helpers for PyUnicode_Format() */
13975
/* State shared by the PyUnicode_Format() helpers while one format string
   is being processed. */
struct unicode_formatter_t {
    /* The argument object: returned as is by unicode_format_getnextarg()
       when arglen is negative, indexed as a tuple otherwise. */
    PyObject *args;
    /* NOTE(review): presumably non-zero when this struct owns a reference
       to args -- confirm against the code that sets it. */
    int args_owned;
    /* arglen: number of available arguments (negative when args is used
       directly instead of being indexed); argidx: index of the next
       argument to hand out, incremented on each fetch. */
    Py_ssize_t arglen, argidx;
    /* NOTE(review): presumably the mapping used for %(name)s lookups --
       not used in the visible code, confirm. */
    PyObject *dict;

    /* NOTE(review): the fmt* fields presumably describe the format string
       being scanned (kind, remaining count, position, raw data, owning
       object) -- they are not used in the visible code, confirm. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Writer state embedded in the context. */
    _PyUnicodeWriter writer;
};
13989
/* Settings of one conversion specifier, as consumed by the
   PyUnicode_Format() helpers. */
struct unicode_format_arg_t {
    /* Conversion character; formatfloat() passes it to
       PyOS_double_to_string() as the format code. */
    Py_UCS4 ch;
    /* Bitmask of F_* flags; formatfloat() tests F_ALT. */
    int flags;
    /* NOTE(review): presumably the minimum field width -- not used in the
       visible code, confirm. */
    Py_ssize_t width;
    /* Precision; formatfloat() replaces a negative value with 6. */
    int prec;
    /* NOTE(review): meaning not visible in this chunk -- confirm. */
    int sign;
};
13997
13998static PyObject *
13999unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14000{
14001    Py_ssize_t argidx = ctx->argidx;
14002
14003    if (argidx < ctx->arglen) {
14004        ctx->argidx++;
14005        if (ctx->arglen < 0)
14006            return ctx->args;
14007        else
14008            return PyTuple_GetItem(ctx->args, argidx);
14009    }
14010    PyErr_SetString(PyExc_TypeError,
14011                    "not enough arguments for format string");
14012    return NULL;
14013}
14014
14015/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14016
14017/* Format a float into the writer if the writer is not NULL, or into *p_output
14018   otherwise.
14019
14020   Return 0 on success, raise an exception and return -1 on error. */
14021static int
14022formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14023            PyObject **p_output,
14024            _PyUnicodeWriter *writer)
14025{
14026    char *p;
14027    double x;
14028    Py_ssize_t len;
14029    int prec;
14030    int dtoa_flags;
14031
14032    x = PyFloat_AsDouble(v);
14033    if (x == -1.0 && PyErr_Occurred())
14034        return -1;
14035
14036    prec = arg->prec;
14037    if (prec < 0)
14038        prec = 6;
14039
14040    if (arg->flags & F_ALT)
14041        dtoa_flags = Py_DTSF_ALT;
14042    else
14043        dtoa_flags = 0;
14044    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14045    if (p == NULL)
14046        return -1;
14047    len = strlen(p);
14048    if (writer) {
14049        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14050            PyMem_Free(p);
14051            return -1;
14052        }
14053    }
14054    else
14055        *p_output = _PyUnicode_FromASCII(p, len);
14056    PyMem_Free(p);
14057    return 0;
14058}
14059
14060/* formatlong() emulates the format codes d, u, o, x and X, and
14061 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14062 * Python's regular ints.
14063 * Return value:  a new PyUnicodeObject*, or NULL if error.
14064 *     The output string is of the form
14065 *         "-"? ("0x" | "0X")? digit+
14066 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14067 *         set in flags.  The case of hex digits will be correct,
14068 *     There will be at least prec digits, zero-filled on the left if
14069 *         necessary to get that many.
 * val          object to be converted
 * alt          non-zero if F_ALT is set; when zero, the base marker of
 *              o, x and X conversions is removed
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
14074 *
14075 * CAUTION:  o, x and X conversions on regular ints can never
14076 * produce a '-' sign, but can for Python's unbounded ints.
14077 */
14078PyObject *
14079_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14080{
14081    PyObject *result = NULL;
14082    char *buf;
14083    Py_ssize_t i;
14084    int sign;           /* 1 if '-', else 0 */
14085    int len;            /* number of characters */
14086    Py_ssize_t llen;
14087    int numdigits;      /* len == numnondigits + numdigits */
14088    int numnondigits = 0;
14089
14090    /* Avoid exceeding SSIZE_T_MAX */
14091    if (prec > INT_MAX-3) {
14092        PyErr_SetString(PyExc_OverflowError,
14093                        "precision too large");
14094        return NULL;
14095    }
14096
14097    assert(PyLong_Check(val));
14098
14099    switch (type) {
14100    default:
14101        assert(!"'type' not in [diuoxX]");
14102    case 'd':
14103    case 'i':
14104    case 'u':
14105        /* int and int subclasses should print numerically when a numeric */
14106        /* format code is used (see issue18780) */
14107        result = PyNumber_ToBase(val, 10);
14108        break;
14109    case 'o':
14110        numnondigits = 2;
14111        result = PyNumber_ToBase(val, 8);
14112        break;
14113    case 'x':
14114    case 'X':
14115        numnondigits = 2;
14116        result = PyNumber_ToBase(val, 16);
14117        break;
14118    }
14119    if (!result)
14120        return NULL;
14121
14122    assert(unicode_modifiable(result));
14123    assert(PyUnicode_IS_READY(result));
14124    assert(PyUnicode_IS_ASCII(result));
14125
14126    /* To modify the string in-place, there can only be one reference. */
14127    if (Py_REFCNT(result) != 1) {
14128        Py_DECREF(result);
14129        PyErr_BadInternalCall();
14130        return NULL;
14131    }
14132    buf = PyUnicode_DATA(result);
14133    llen = PyUnicode_GET_LENGTH(result);
14134    if (llen > INT_MAX) {
14135        Py_DECREF(result);
14136        PyErr_SetString(PyExc_ValueError,
14137                        "string too large in _PyUnicode_FormatLong");
14138        return NULL;
14139    }
14140    len = (int)llen;
14141    sign = buf[0] == '-';
14142    numnondigits += sign;
14143    numdigits = len - numnondigits;
14144    assert(numdigits > 0);
14145
14146    /* Get rid of base marker unless F_ALT */
14147    if (((alt) == 0 &&
14148        (type == 'o' || type == 'x' || type == 'X'))) {
14149        assert(buf[sign] == '0');
14150        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14151               buf[sign+1] == 'o');
14152        numnondigits -= 2;
14153        buf += 2;
14154        len -= 2;
14155        if (sign)
14156            buf[0] = '-';
14157        assert(len == numnondigits + numdigits);
14158        assert(numdigits > 0);
14159    }
14160
14161    /* Fill with leading zeroes to meet minimum width. */
14162    if (prec > numdigits) {
14163        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14164                                numnondigits + prec);
14165        char *b1;
14166        if (!r1) {
14167            Py_DECREF(result);
14168            return NULL;
14169        }
14170        b1 = PyBytes_AS_STRING(r1);
14171        for (i = 0; i < numnondigits; ++i)
14172            *b1++ = *buf++;
14173        for (i = 0; i < prec - numdigits; i++)
14174            *b1++ = '0';
14175        for (i = 0; i < numdigits; i++)
14176            *b1++ = *buf++;
14177        *b1 = '\0';
14178        Py_DECREF(result);
14179        result = r1;
14180        buf = PyBytes_AS_STRING(result);
14181        len = numnondigits + prec;
14182    }
14183
14184    /* Fix up case for hex conversions. */
14185    if (type == 'X') {
14186        /* Need to convert all lower case letters to upper case.
14187           and need to convert 0x to 0X (and -0x to -0X). */
14188        for (i = 0; i < len; i++)
14189            if (buf[i] >= 'a' && buf[i] <= 'x')
14190                buf[i] -= 'a'-'A';
14191    }
14192    if (!PyUnicode_Check(result)
14193        || buf != PyUnicode_DATA(result)) {
14194        PyObject *unicode;
14195        unicode = _PyUnicode_FromASCII(buf, len);
14196        Py_DECREF(result);
14197        result = unicode;
14198    }
14199    else if (len != PyUnicode_GET_LENGTH(result)) {
14200        if (PyUnicode_Resize(&result, len) < 0)
14201            Py_CLEAR(result);
14202    }
14203    return result;
14204}
14205
14206/* Format an integer or a float as an integer.
14207 * Return 1 if the number has been formatted into the writer,
14208 *        0 if the number has been formatted into *p_output
14209 *       -1 and raise an exception on error */
14210static int
14211mainformatlong(PyObject *v,
14212               struct unicode_format_arg_t *arg,
14213               PyObject **p_output,
14214               _PyUnicodeWriter *writer)
14215{
14216    PyObject *iobj, *res;
14217    char type = (char)arg->ch;
14218
14219    if (!PyNumber_Check(v))
14220        goto wrongtype;
14221
14222    /* make sure number is a type of integer for o, x, and X */
14223    if (!PyLong_Check(v)) {
14224        if (type == 'o' || type == 'x' || type == 'X') {
14225            iobj = PyNumber_Index(v);
14226            if (iobj == NULL) {
14227                if (PyErr_ExceptionMatches(PyExc_TypeError))
14228                    goto wrongtype;
14229                return -1;
14230            }
14231        }
14232        else {
14233            iobj = PyNumber_Long(v);
14234            if (iobj == NULL ) {
14235                if (PyErr_ExceptionMatches(PyExc_TypeError))
14236                    goto wrongtype;
14237                return -1;
14238            }
14239        }
14240        assert(PyLong_Check(iobj));
14241    }
14242    else {
14243        iobj = v;
14244        Py_INCREF(iobj);
14245    }
14246
14247    if (PyLong_CheckExact(v)
14248        && arg->width == -1 && arg->prec == -1
14249        && !(arg->flags & (F_SIGN | F_BLANK))
14250        && type != 'X')
14251    {
14252        /* Fast path */
14253        int alternate = arg->flags & F_ALT;
14254        int base;
14255
14256        switch(type)
14257        {
14258            default:
14259                assert(0 && "'type' not in [diuoxX]");
14260            case 'd':
14261            case 'i':
14262            case 'u':
14263                base = 10;
14264                break;
14265            case 'o':
14266                base = 8;
14267                break;
14268            case 'x':
14269            case 'X':
14270                base = 16;
14271                break;
14272        }
14273
14274        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14275            Py_DECREF(iobj);
14276            return -1;
14277        }
14278        Py_DECREF(iobj);
14279        return 1;
14280    }
14281
14282    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14283    Py_DECREF(iobj);
14284    if (res == NULL)
14285        return -1;
14286    *p_output = res;
14287    return 0;
14288
14289wrongtype:
14290    switch(type)
14291    {
14292        case 'o':
14293        case 'x':
14294        case 'X':
14295            PyErr_Format(PyExc_TypeError,
14296                    "%%%c format: an integer is required, "
14297                    "not %.200s",
14298                    type, Py_TYPE(v)->tp_name);
14299            break;
14300        default:
14301            PyErr_Format(PyExc_TypeError,
14302                    "%%%c format: a number is required, "
14303                    "not %.200s",
14304                    type, Py_TYPE(v)->tp_name);
14305            break;
14306    }
14307    return -1;
14308}
14309
14310static Py_UCS4
14311formatchar(PyObject *v)
14312{
14313    /* presume that the buffer is at least 3 characters long */
14314    if (PyUnicode_Check(v)) {
14315        if (PyUnicode_GET_LENGTH(v) == 1) {
14316            return PyUnicode_READ_CHAR(v, 0);
14317        }
14318        goto onError;
14319    }
14320    else {
14321        PyObject *iobj;
14322        long x;
14323        /* make sure number is a type of integer */
14324        if (!PyLong_Check(v)) {
14325            iobj = PyNumber_Index(v);
14326            if (iobj == NULL) {
14327                goto onError;
14328            }
14329            x = PyLong_AsLong(iobj);
14330            Py_DECREF(iobj);
14331        }
14332        else {
14333            x = PyLong_AsLong(v);
14334        }
14335        if (x == -1 && PyErr_Occurred())
14336            goto onError;
14337
14338        if (x < 0 || x > MAX_UNICODE) {
14339            PyErr_SetString(PyExc_OverflowError,
14340                            "%c arg not in range(0x110000)");
14341            return (Py_UCS4) -1;
14342        }
14343
14344        return (Py_UCS4) x;
14345    }
14346
14347  onError:
14348    PyErr_SetString(PyExc_TypeError,
14349                    "%c requires int or char");
14350    return (Py_UCS4) -1;
14351}
14352
14353/* Parse options of an argument: flags, width, precision.
14354   Handle also "%(name)" syntax.
14355
14356   Return 0 if the argument has been formatted into arg->str.
14357   Return 1 if the argument has been written into ctx->writer,
14358   Raise an exception and return -1 on error. */
14359static int
14360unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14361                         struct unicode_format_arg_t *arg)
14362{
14363#define FORMAT_READ(ctx) \
14364        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14365
14366    PyObject *v;
14367
14368    if (arg->ch == '(') {
14369        /* Get argument value from a dictionary. Example: "%(name)s". */
14370        Py_ssize_t keystart;
14371        Py_ssize_t keylen;
14372        PyObject *key;
14373        int pcount = 1;
14374
14375        if (ctx->dict == NULL) {
14376            PyErr_SetString(PyExc_TypeError,
14377                            "format requires a mapping");
14378            return -1;
14379        }
14380        ++ctx->fmtpos;
14381        --ctx->fmtcnt;
14382        keystart = ctx->fmtpos;
14383        /* Skip over balanced parentheses */
14384        while (pcount > 0 && --ctx->fmtcnt >= 0) {
14385            arg->ch = FORMAT_READ(ctx);
14386            if (arg->ch == ')')
14387                --pcount;
14388            else if (arg->ch == '(')
14389                ++pcount;
14390            ctx->fmtpos++;
14391        }
14392        keylen = ctx->fmtpos - keystart - 1;
14393        if (ctx->fmtcnt < 0 || pcount > 0) {
14394            PyErr_SetString(PyExc_ValueError,
14395                            "incomplete format key");
14396            return -1;
14397        }
14398        key = PyUnicode_Substring(ctx->fmtstr,
14399                                  keystart, keystart + keylen);
14400        if (key == NULL)
14401            return -1;
14402        if (ctx->args_owned) {
14403            ctx->args_owned = 0;
14404            Py_DECREF(ctx->args);
14405        }
14406        ctx->args = PyObject_GetItem(ctx->dict, key);
14407        Py_DECREF(key);
14408        if (ctx->args == NULL)
14409            return -1;
14410        ctx->args_owned = 1;
14411        ctx->arglen = -1;
14412        ctx->argidx = -2;
14413    }
14414
14415    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14416    while (--ctx->fmtcnt >= 0) {
14417        arg->ch = FORMAT_READ(ctx);
14418        ctx->fmtpos++;
14419        switch (arg->ch) {
14420        case '-': arg->flags |= F_LJUST; continue;
14421        case '+': arg->flags |= F_SIGN; continue;
14422        case ' ': arg->flags |= F_BLANK; continue;
14423        case '#': arg->flags |= F_ALT; continue;
14424        case '0': arg->flags |= F_ZERO; continue;
14425        }
14426        break;
14427    }
14428
14429    /* Parse width. Example: "%10s" => width=10 */
14430    if (arg->ch == '*') {
14431        v = unicode_format_getnextarg(ctx);
14432        if (v == NULL)
14433            return -1;
14434        if (!PyLong_Check(v)) {
14435            PyErr_SetString(PyExc_TypeError,
14436                            "* wants int");
14437            return -1;
14438        }
14439        arg->width = PyLong_AsSsize_t(v);
14440        if (arg->width == -1 && PyErr_Occurred())
14441            return -1;
14442        if (arg->width < 0) {
14443            arg->flags |= F_LJUST;
14444            arg->width = -arg->width;
14445        }
14446        if (--ctx->fmtcnt >= 0) {
14447            arg->ch = FORMAT_READ(ctx);
14448            ctx->fmtpos++;
14449        }
14450    }
14451    else if (arg->ch >= '0' && arg->ch <= '9') {
14452        arg->width = arg->ch - '0';
14453        while (--ctx->fmtcnt >= 0) {
14454            arg->ch = FORMAT_READ(ctx);
14455            ctx->fmtpos++;
14456            if (arg->ch < '0' || arg->ch > '9')
14457                break;
14458            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14459               mixing signed and unsigned comparison. Since arg->ch is between
14460               '0' and '9', casting to int is safe. */
14461            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14462                PyErr_SetString(PyExc_ValueError,
14463                                "width too big");
14464                return -1;
14465            }
14466            arg->width = arg->width*10 + (arg->ch - '0');
14467        }
14468    }
14469
14470    /* Parse precision. Example: "%.3f" => prec=3 */
14471    if (arg->ch == '.') {
14472        arg->prec = 0;
14473        if (--ctx->fmtcnt >= 0) {
14474            arg->ch = FORMAT_READ(ctx);
14475            ctx->fmtpos++;
14476        }
14477        if (arg->ch == '*') {
14478            v = unicode_format_getnextarg(ctx);
14479            if (v == NULL)
14480                return -1;
14481            if (!PyLong_Check(v)) {
14482                PyErr_SetString(PyExc_TypeError,
14483                                "* wants int");
14484                return -1;
14485            }
14486            arg->prec = _PyLong_AsInt(v);
14487            if (arg->prec == -1 && PyErr_Occurred())
14488                return -1;
14489            if (arg->prec < 0)
14490                arg->prec = 0;
14491            if (--ctx->fmtcnt >= 0) {
14492                arg->ch = FORMAT_READ(ctx);
14493                ctx->fmtpos++;
14494            }
14495        }
14496        else if (arg->ch >= '0' && arg->ch <= '9') {
14497            arg->prec = arg->ch - '0';
14498            while (--ctx->fmtcnt >= 0) {
14499                arg->ch = FORMAT_READ(ctx);
14500                ctx->fmtpos++;
14501                if (arg->ch < '0' || arg->ch > '9')
14502                    break;
14503                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14504                    PyErr_SetString(PyExc_ValueError,
14505                                    "precision too big");
14506                    return -1;
14507                }
14508                arg->prec = arg->prec*10 + (arg->ch - '0');
14509            }
14510        }
14511    }
14512
14513    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14514    if (ctx->fmtcnt >= 0) {
14515        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14516            if (--ctx->fmtcnt >= 0) {
14517                arg->ch = FORMAT_READ(ctx);
14518                ctx->fmtpos++;
14519            }
14520        }
14521    }
14522    if (ctx->fmtcnt < 0) {
14523        PyErr_SetString(PyExc_ValueError,
14524                        "incomplete format");
14525        return -1;
14526    }
14527    return 0;
14528
14529#undef FORMAT_READ
14530}
14531
14532/* Format one argument. Supported conversion specifiers:
14533
14534   - "s", "r", "a": any type
14535   - "i", "d", "u": int or float
14536   - "o", "x", "X": int
14537   - "e", "E", "f", "F", "g", "G": float
14538   - "c": int or str (1 character)
14539
14540   When possible, the output is written directly into the Unicode writer
14541   (ctx->writer). A string is created when padding is required.
14542
14543   Return 0 if the argument has been formatted into *p_str,
14544          1 if the argument has been written into ctx->writer,
14545         -1 on error. */
14546static int
14547unicode_format_arg_format(struct unicode_formatter_t *ctx,
14548                          struct unicode_format_arg_t *arg,
14549                          PyObject **p_str)
14550{
14551    PyObject *v;
14552    _PyUnicodeWriter *writer = &ctx->writer;
14553
14554    if (ctx->fmtcnt == 0)
14555        ctx->writer.overallocate = 0;
14556
14557    if (arg->ch == '%') {
14558        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
14559            return -1;
14560        return 1;
14561    }
14562
14563    v = unicode_format_getnextarg(ctx);
14564    if (v == NULL)
14565        return -1;
14566
14567
14568    switch (arg->ch) {
14569    case 's':
14570    case 'r':
14571    case 'a':
14572        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14573            /* Fast path */
14574            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14575                return -1;
14576            return 1;
14577        }
14578
14579        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14580            *p_str = v;
14581            Py_INCREF(*p_str);
14582        }
14583        else {
14584            if (arg->ch == 's')
14585                *p_str = PyObject_Str(v);
14586            else if (arg->ch == 'r')
14587                *p_str = PyObject_Repr(v);
14588            else
14589                *p_str = PyObject_ASCII(v);
14590        }
14591        break;
14592
14593    case 'i':
14594    case 'd':
14595    case 'u':
14596    case 'o':
14597    case 'x':
14598    case 'X':
14599    {
14600        int ret = mainformatlong(v, arg, p_str, writer);
14601        if (ret != 0)
14602            return ret;
14603        arg->sign = 1;
14604        break;
14605    }
14606
14607    case 'e':
14608    case 'E':
14609    case 'f':
14610    case 'F':
14611    case 'g':
14612    case 'G':
14613        if (arg->width == -1 && arg->prec == -1
14614            && !(arg->flags & (F_SIGN | F_BLANK)))
14615        {
14616            /* Fast path */
14617            if (formatfloat(v, arg, NULL, writer) == -1)
14618                return -1;
14619            return 1;
14620        }
14621
14622        arg->sign = 1;
14623        if (formatfloat(v, arg, p_str, NULL) == -1)
14624            return -1;
14625        break;
14626
14627    case 'c':
14628    {
14629        Py_UCS4 ch = formatchar(v);
14630        if (ch == (Py_UCS4) -1)
14631            return -1;
14632        if (arg->width == -1 && arg->prec == -1) {
14633            /* Fast path */
14634            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14635                return -1;
14636            return 1;
14637        }
14638        *p_str = PyUnicode_FromOrdinal(ch);
14639        break;
14640    }
14641
14642    default:
14643        PyErr_Format(PyExc_ValueError,
14644                     "unsupported format character '%c' (0x%x) "
14645                     "at index %zd",
14646                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14647                     (int)arg->ch,
14648                     ctx->fmtpos - 1);
14649        return -1;
14650    }
14651    if (*p_str == NULL)
14652        return -1;
14653    assert (PyUnicode_Check(*p_str));
14654    return 0;
14655}
14656
14657static int
14658unicode_format_arg_output(struct unicode_formatter_t *ctx,
14659                          struct unicode_format_arg_t *arg,
14660                          PyObject *str)
14661{
14662    Py_ssize_t len;
14663    enum PyUnicode_Kind kind;
14664    void *pbuf;
14665    Py_ssize_t pindex;
14666    Py_UCS4 signchar;
14667    Py_ssize_t buflen;
14668    Py_UCS4 maxchar;
14669    Py_ssize_t sublen;
14670    _PyUnicodeWriter *writer = &ctx->writer;
14671    Py_UCS4 fill;
14672
14673    fill = ' ';
14674    if (arg->sign && arg->flags & F_ZERO)
14675        fill = '0';
14676
14677    if (PyUnicode_READY(str) == -1)
14678        return -1;
14679
14680    len = PyUnicode_GET_LENGTH(str);
14681    if ((arg->width == -1 || arg->width <= len)
14682        && (arg->prec == -1 || arg->prec >= len)
14683        && !(arg->flags & (F_SIGN | F_BLANK)))
14684    {
14685        /* Fast path */
14686        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14687            return -1;
14688        return 0;
14689    }
14690
14691    /* Truncate the string for "s", "r" and "a" formats
14692       if the precision is set */
14693    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14694        if (arg->prec >= 0 && len > arg->prec)
14695            len = arg->prec;
14696    }
14697
14698    /* Adjust sign and width */
14699    kind = PyUnicode_KIND(str);
14700    pbuf = PyUnicode_DATA(str);
14701    pindex = 0;
14702    signchar = '\0';
14703    if (arg->sign) {
14704        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14705        if (ch == '-' || ch == '+') {
14706            signchar = ch;
14707            len--;
14708            pindex++;
14709        }
14710        else if (arg->flags & F_SIGN)
14711            signchar = '+';
14712        else if (arg->flags & F_BLANK)
14713            signchar = ' ';
14714        else
14715            arg->sign = 0;
14716    }
14717    if (arg->width < len)
14718        arg->width = len;
14719
14720    /* Prepare the writer */
14721    maxchar = writer->maxchar;
14722    if (!(arg->flags & F_LJUST)) {
14723        if (arg->sign) {
14724            if ((arg->width-1) > len)
14725                maxchar = Py_MAX(maxchar, fill);
14726        }
14727        else {
14728            if (arg->width > len)
14729                maxchar = Py_MAX(maxchar, fill);
14730        }
14731    }
14732    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14733        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14734        maxchar = Py_MAX(maxchar, strmaxchar);
14735    }
14736
14737    buflen = arg->width;
14738    if (arg->sign && len == arg->width)
14739        buflen++;
14740    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14741        return -1;
14742
14743    /* Write the sign if needed */
14744    if (arg->sign) {
14745        if (fill != ' ') {
14746            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14747            writer->pos += 1;
14748        }
14749        if (arg->width > len)
14750            arg->width--;
14751    }
14752
14753    /* Write the numeric prefix for "x", "X" and "o" formats
14754       if the alternate form is used.
14755       For example, write "0x" for the "%#x" format. */
14756    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14757        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14758        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14759        if (fill != ' ') {
14760            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14761            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14762            writer->pos += 2;
14763            pindex += 2;
14764        }
14765        arg->width -= 2;
14766        if (arg->width < 0)
14767            arg->width = 0;
14768        len -= 2;
14769    }
14770
14771    /* Pad left with the fill character if needed */
14772    if (arg->width > len && !(arg->flags & F_LJUST)) {
14773        sublen = arg->width - len;
14774        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14775        writer->pos += sublen;
14776        arg->width = len;
14777    }
14778
14779    /* If padding with spaces: write sign if needed and/or numeric prefix if
14780       the alternate form is used */
14781    if (fill == ' ') {
14782        if (arg->sign) {
14783            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14784            writer->pos += 1;
14785        }
14786        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14787            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14788            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14789            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14790            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14791            writer->pos += 2;
14792            pindex += 2;
14793        }
14794    }
14795
14796    /* Write characters */
14797    if (len) {
14798        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14799                                      str, pindex, len);
14800        writer->pos += len;
14801    }
14802
14803    /* Pad right with the fill character if needed */
14804    if (arg->width > len) {
14805        sublen = arg->width - len;
14806        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14807        writer->pos += sublen;
14808    }
14809    return 0;
14810}
14811
14812/* Helper of PyUnicode_Format(): format one arg.
14813   Return 0 on success, raise an exception and return -1 on error. */
14814static int
14815unicode_format_arg(struct unicode_formatter_t *ctx)
14816{
14817    struct unicode_format_arg_t arg;
14818    PyObject *str;
14819    int ret;
14820
14821    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14822    arg.flags = 0;
14823    arg.width = -1;
14824    arg.prec = -1;
14825    arg.sign = 0;
14826    str = NULL;
14827
14828    ret = unicode_format_arg_parse(ctx, &arg);
14829    if (ret == -1)
14830        return -1;
14831
14832    ret = unicode_format_arg_format(ctx, &arg, &str);
14833    if (ret == -1)
14834        return -1;
14835
14836    if (ret != 1) {
14837        ret = unicode_format_arg_output(ctx, &arg, str);
14838        Py_DECREF(str);
14839        if (ret == -1)
14840            return -1;
14841    }
14842
14843    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14844        PyErr_SetString(PyExc_TypeError,
14845                        "not all arguments converted during string formatting");
14846        return -1;
14847    }
14848    return 0;
14849}
14850
14851PyObject *
14852PyUnicode_Format(PyObject *format, PyObject *args)
14853{
14854    struct unicode_formatter_t ctx;
14855
14856    if (format == NULL || args == NULL) {
14857        PyErr_BadInternalCall();
14858        return NULL;
14859    }
14860
14861    if (ensure_unicode(format) < 0)
14862        return NULL;
14863
14864    ctx.fmtstr = format;
14865    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14866    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14867    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14868    ctx.fmtpos = 0;
14869
14870    _PyUnicodeWriter_Init(&ctx.writer);
14871    ctx.writer.min_length = ctx.fmtcnt + 100;
14872    ctx.writer.overallocate = 1;
14873
14874    if (PyTuple_Check(args)) {
14875        ctx.arglen = PyTuple_Size(args);
14876        ctx.argidx = 0;
14877    }
14878    else {
14879        ctx.arglen = -1;
14880        ctx.argidx = -2;
14881    }
14882    ctx.args_owned = 0;
14883    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14884        ctx.dict = args;
14885    else
14886        ctx.dict = NULL;
14887    ctx.args = args;
14888
14889    while (--ctx.fmtcnt >= 0) {
14890        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14891            Py_ssize_t nonfmtpos;
14892
14893            nonfmtpos = ctx.fmtpos++;
14894            while (ctx.fmtcnt >= 0 &&
14895                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14896                ctx.fmtpos++;
14897                ctx.fmtcnt--;
14898            }
14899            if (ctx.fmtcnt < 0) {
14900                ctx.fmtpos--;
14901                ctx.writer.overallocate = 0;
14902            }
14903
14904            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14905                                                nonfmtpos, ctx.fmtpos) < 0)
14906                goto onError;
14907        }
14908        else {
14909            ctx.fmtpos++;
14910            if (unicode_format_arg(&ctx) == -1)
14911                goto onError;
14912        }
14913    }
14914
14915    if (ctx.argidx < ctx.arglen && !ctx.dict) {
14916        PyErr_SetString(PyExc_TypeError,
14917                        "not all arguments converted during string formatting");
14918        goto onError;
14919    }
14920
14921    if (ctx.args_owned) {
14922        Py_DECREF(ctx.args);
14923    }
14924    return _PyUnicodeWriter_Finish(&ctx.writer);
14925
14926  onError:
14927    _PyUnicodeWriter_Dealloc(&ctx.writer);
14928    if (ctx.args_owned) {
14929        Py_DECREF(ctx.args);
14930    }
14931    return NULL;
14932}
14933
14934static PyObject *
14935unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14936
14937static PyObject *
14938unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14939{
14940    PyObject *x = NULL;
14941    static char *kwlist[] = {"object", "encoding", "errors", 0};
14942    char *encoding = NULL;
14943    char *errors = NULL;
14944
14945    if (type != &PyUnicode_Type)
14946        return unicode_subtype_new(type, args, kwds);
14947    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14948                                     kwlist, &x, &encoding, &errors))
14949        return NULL;
14950    if (x == NULL)
14951        _Py_RETURN_UNICODE_EMPTY();
14952    if (encoding == NULL && errors == NULL)
14953        return PyObject_Str(x);
14954    else
14955        return PyUnicode_FromEncodedObject(x, encoding, errors);
14956}
14957
14958static PyObject *
14959unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14960{
14961    PyObject *unicode, *self;
14962    Py_ssize_t length, char_size;
14963    int share_wstr, share_utf8;
14964    unsigned int kind;
14965    void *data;
14966
14967    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14968
14969    unicode = unicode_new(&PyUnicode_Type, args, kwds);
14970    if (unicode == NULL)
14971        return NULL;
14972    assert(_PyUnicode_CHECK(unicode));
14973    if (PyUnicode_READY(unicode) == -1) {
14974        Py_DECREF(unicode);
14975        return NULL;
14976    }
14977
14978    self = type->tp_alloc(type, 0);
14979    if (self == NULL) {
14980        Py_DECREF(unicode);
14981        return NULL;
14982    }
14983    kind = PyUnicode_KIND(unicode);
14984    length = PyUnicode_GET_LENGTH(unicode);
14985
14986    _PyUnicode_LENGTH(self) = length;
14987#ifdef Py_DEBUG
14988    _PyUnicode_HASH(self) = -1;
14989#else
14990    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14991#endif
14992    _PyUnicode_STATE(self).interned = 0;
14993    _PyUnicode_STATE(self).kind = kind;
14994    _PyUnicode_STATE(self).compact = 0;
14995    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14996    _PyUnicode_STATE(self).ready = 1;
14997    _PyUnicode_WSTR(self) = NULL;
14998    _PyUnicode_UTF8_LENGTH(self) = 0;
14999    _PyUnicode_UTF8(self) = NULL;
15000    _PyUnicode_WSTR_LENGTH(self) = 0;
15001    _PyUnicode_DATA_ANY(self) = NULL;
15002
15003    share_utf8 = 0;
15004    share_wstr = 0;
15005    if (kind == PyUnicode_1BYTE_KIND) {
15006        char_size = 1;
15007        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15008            share_utf8 = 1;
15009    }
15010    else if (kind == PyUnicode_2BYTE_KIND) {
15011        char_size = 2;
15012        if (sizeof(wchar_t) == 2)
15013            share_wstr = 1;
15014    }
15015    else {
15016        assert(kind == PyUnicode_4BYTE_KIND);
15017        char_size = 4;
15018        if (sizeof(wchar_t) == 4)
15019            share_wstr = 1;
15020    }
15021
15022    /* Ensure we won't overflow the length. */
15023    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15024        PyErr_NoMemory();
15025        goto onError;
15026    }
15027    data = PyObject_MALLOC((length + 1) * char_size);
15028    if (data == NULL) {
15029        PyErr_NoMemory();
15030        goto onError;
15031    }
15032
15033    _PyUnicode_DATA_ANY(self) = data;
15034    if (share_utf8) {
15035        _PyUnicode_UTF8_LENGTH(self) = length;
15036        _PyUnicode_UTF8(self) = data;
15037    }
15038    if (share_wstr) {
15039        _PyUnicode_WSTR_LENGTH(self) = length;
15040        _PyUnicode_WSTR(self) = (wchar_t *)data;
15041    }
15042
15043    memcpy(data, PyUnicode_DATA(unicode),
15044              kind * (length + 1));
15045    assert(_PyUnicode_CheckConsistency(self, 1));
15046#ifdef Py_DEBUG
15047    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15048#endif
15049    Py_DECREF(unicode);
15050    return self;
15051
15052onError:
15053    Py_DECREF(unicode);
15054    Py_DECREF(self);
15055    return NULL;
15056}
15057
/* Docstring text for the str type: documents both constructor forms
   (from an arbitrary object, or by decoding a bytes-like object). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15069
/* Forward declaration: unicode_iter() is defined further down (next to the
   iterator type) but is needed here for the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of the built-in "str" type.  Slots are listed positionally;
   the comment on each line names the slot being filled. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
15115
15116/* Initialize the Unicode implementation */
15117
15118int _PyUnicode_Init(void)
15119{
15120    /* XXX - move this array to unicodectype.c ? */
15121    Py_UCS2 linebreak[] = {
15122        0x000A, /* LINE FEED */
15123        0x000D, /* CARRIAGE RETURN */
15124        0x001C, /* FILE SEPARATOR */
15125        0x001D, /* GROUP SEPARATOR */
15126        0x001E, /* RECORD SEPARATOR */
15127        0x0085, /* NEXT LINE */
15128        0x2028, /* LINE SEPARATOR */
15129        0x2029, /* PARAGRAPH SEPARATOR */
15130    };
15131
15132    /* Init the implementation */
15133    _Py_INCREF_UNICODE_EMPTY();
15134    if (!unicode_empty)
15135        Py_FatalError("Can't create empty string");
15136    Py_DECREF(unicode_empty);
15137
15138    if (PyType_Ready(&PyUnicode_Type) < 0)
15139        Py_FatalError("Can't initialize 'unicode'");
15140
15141    /* initialize the linebreak bloom filter */
15142    bloom_linebreak = make_bloom_mask(
15143        PyUnicode_2BYTE_KIND, linebreak,
15144        Py_ARRAY_LENGTH(linebreak));
15145
15146    if (PyType_Ready(&EncodingMapType) < 0)
15147         Py_FatalError("Can't initialize encoding map type");
15148
15149    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15150        Py_FatalError("Can't initialize field name iterator type");
15151
15152    if (PyType_Ready(&PyFormatterIter_Type) < 0)
15153        Py_FatalError("Can't initialize formatter iter type");
15154
15155    return 0;
15156}
15157
15158/* Finalize the Unicode implementation */
15159
/* Release cached objects kept by the Unicode implementation.

   This implementation releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed_count = 0;

    return freed_count;
}
15165
15166void
15167_PyUnicode_Fini(void)
15168{
15169    int i;
15170
15171    Py_CLEAR(unicode_empty);
15172
15173    for (i = 0; i < 256; i++)
15174        Py_CLEAR(unicode_latin1[i]);
15175    _PyUnicode_ClearStaticStrings();
15176    (void)PyUnicode_ClearFreeList();
15177}
15178
15179void
15180PyUnicode_InternInPlace(PyObject **p)
15181{
15182    PyObject *s = *p;
15183    PyObject *t;
15184#ifdef Py_DEBUG
15185    assert(s != NULL);
15186    assert(_PyUnicode_CHECK(s));
15187#else
15188    if (s == NULL || !PyUnicode_Check(s))
15189        return;
15190#endif
15191    /* If it's a subclass, we don't really know what putting
15192       it in the interned dict might do. */
15193    if (!PyUnicode_CheckExact(s))
15194        return;
15195    if (PyUnicode_CHECK_INTERNED(s))
15196        return;
15197    if (interned == NULL) {
15198        interned = PyDict_New();
15199        if (interned == NULL) {
15200            PyErr_Clear(); /* Don't leave an exception */
15201            return;
15202        }
15203    }
15204    Py_ALLOW_RECURSION
15205    t = PyDict_SetDefault(interned, s, s);
15206    Py_END_ALLOW_RECURSION
15207    if (t == NULL) {
15208        PyErr_Clear();
15209        return;
15210    }
15211    if (t != s) {
15212        Py_INCREF(t);
15213        Py_SETREF(*p, t);
15214        return;
15215    }
15216    /* The two references in interned are not counted by refcnt.
15217       The deallocator will take care of this */
15218    Py_REFCNT(s) -= 2;
15219    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15220}
15221
15222void
15223PyUnicode_InternImmortal(PyObject **p)
15224{
15225    PyUnicode_InternInPlace(p);
15226    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15227        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15228        Py_INCREF(*p);
15229    }
15230}
15231
15232PyObject *
15233PyUnicode_InternFromString(const char *cp)
15234{
15235    PyObject *s = PyUnicode_FromString(cp);
15236    if (s == NULL)
15237        return NULL;
15238    PyUnicode_InternInPlace(&s);
15239    return s;
15240}
15241
15242void
15243_Py_ReleaseInternedUnicodeStrings(void)
15244{
15245    PyObject *keys;
15246    PyObject *s;
15247    Py_ssize_t i, n;
15248    Py_ssize_t immortal_size = 0, mortal_size = 0;
15249
15250    if (interned == NULL || !PyDict_Check(interned))
15251        return;
15252    keys = PyDict_Keys(interned);
15253    if (keys == NULL || !PyList_Check(keys)) {
15254        PyErr_Clear();
15255        return;
15256    }
15257
15258    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15259       detector, interned unicode strings are not forcibly deallocated;
15260       rather, we give them their stolen references back, and then clear
15261       and DECREF the interned dict. */
15262
15263    n = PyList_GET_SIZE(keys);
15264    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15265            n);
15266    for (i = 0; i < n; i++) {
15267        s = PyList_GET_ITEM(keys, i);
15268        if (PyUnicode_READY(s) == -1) {
15269            assert(0 && "could not ready string");
15270            fprintf(stderr, "could not ready string\n");
15271        }
15272        switch (PyUnicode_CHECK_INTERNED(s)) {
15273        case SSTATE_NOT_INTERNED:
15274            /* XXX Shouldn't happen */
15275            break;
15276        case SSTATE_INTERNED_IMMORTAL:
15277            Py_REFCNT(s) += 1;
15278            immortal_size += PyUnicode_GET_LENGTH(s);
15279            break;
15280        case SSTATE_INTERNED_MORTAL:
15281            Py_REFCNT(s) += 2;
15282            mortal_size += PyUnicode_GET_LENGTH(s);
15283            break;
15284        default:
15285            Py_FatalError("Inconsistent interned string state.");
15286        }
15287        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15288    }
15289    fprintf(stderr, "total size of all interned strings: "
15290            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15291            "mortal/immortal\n", mortal_size, immortal_size);
15292    Py_DECREF(keys);
15293    PyDict_Clear(interned);
15294    Py_CLEAR(interned);
15295}
15296
15297
15298/********************* Unicode Iterator **************************/
15299
/* State of a str iterator: the string being walked and the position of the
   next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15305
15306static void
15307unicodeiter_dealloc(unicodeiterobject *it)
15308{
15309    _PyObject_GC_UNTRACK(it);
15310    Py_XDECREF(it->it_seq);
15311    PyObject_GC_Del(it);
15312}
15313
/* GC traversal for the str iterator.  The iterated string is the only
   object reference stored in the iterator, so it is the only one visited.
   Py_VISIT uses the `visit` and `arg` parameters by name. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15320
15321static PyObject *
15322unicodeiter_next(unicodeiterobject *it)
15323{
15324    PyObject *seq, *item;
15325
15326    assert(it != NULL);
15327    seq = it->it_seq;
15328    if (seq == NULL)
15329        return NULL;
15330    assert(_PyUnicode_CHECK(seq));
15331
15332    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15333        int kind = PyUnicode_KIND(seq);
15334        void *data = PyUnicode_DATA(seq);
15335        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15336        item = PyUnicode_FromOrdinal(chr);
15337        if (item != NULL)
15338            ++it->it_index;
15339        return item;
15340    }
15341
15342    it->it_seq = NULL;
15343    Py_DECREF(seq);
15344    return NULL;
15345}
15346
15347static PyObject *
15348unicodeiter_len(unicodeiterobject *it)
15349{
15350    Py_ssize_t len = 0;
15351    if (it->it_seq)
15352        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15353    return PyLong_FromSsize_t(len);
15354}
15355
15356PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15357
15358static PyObject *
15359unicodeiter_reduce(unicodeiterobject *it)
15360{
15361    if (it->it_seq != NULL) {
15362        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15363                             it->it_seq, it->it_index);
15364    } else {
15365        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15366        if (u == NULL)
15367            return NULL;
15368        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15369    }
15370}
15371
15372PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15373
15374static PyObject *
15375unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15376{
15377    Py_ssize_t index = PyLong_AsSsize_t(state);
15378    if (index == -1 && PyErr_Occurred())
15379        return NULL;
15380    if (it->it_seq != NULL) {
15381        if (index < 0)
15382            index = 0;
15383        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15384            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15385        it->it_index = index;
15386    }
15387    Py_RETURN_NONE;
15388}
15389
15390PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15391
/* Methods of the str iterator: the length hint plus the pickling support
   pair (__reduce__ / __setstate__). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15401
/* Type object of the iterator returned by unicode_iter() ("str_iterator").
   Slots are listed positionally; the comment on each line names the slot.
   The type takes part in garbage collection (Py_TPFLAGS_HAVE_GC) and is its
   own iterator (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
15434
15435static PyObject *
15436unicode_iter(PyObject *seq)
15437{
15438    unicodeiterobject *it;
15439
15440    if (!PyUnicode_Check(seq)) {
15441        PyErr_BadInternalCall();
15442        return NULL;
15443    }
15444    if (PyUnicode_READY(seq) == -1)
15445        return NULL;
15446    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15447    if (it == NULL)
15448        return NULL;
15449    it->it_index = 0;
15450    Py_INCREF(seq);
15451    it->it_seq = seq;
15452    _PyObject_GC_TRACK(it);
15453    return (PyObject *)it;
15454}
15455
15456
15457size_t
15458Py_UNICODE_strlen(const Py_UNICODE *u)
15459{
15460    int res = 0;
15461    while(*u++)
15462        res++;
15463    return res;
15464}
15465
15466Py_UNICODE*
15467Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15468{
15469    Py_UNICODE *u = s1;
15470    while ((*u++ = *s2++));
15471    return s1;
15472}
15473
15474Py_UNICODE*
15475Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15476{
15477    Py_UNICODE *u = s1;
15478    while ((*u++ = *s2++))
15479        if (n-- == 0)
15480            break;
15481    return s1;
15482}
15483
15484Py_UNICODE*
15485Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15486{
15487    Py_UNICODE *u1 = s1;
15488    u1 += Py_UNICODE_strlen(u1);
15489    Py_UNICODE_strcpy(u1, s2);
15490    return s1;
15491}
15492
15493int
15494Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15495{
15496    while (*s1 && *s2 && *s1 == *s2)
15497        s1++, s2++;
15498    if (*s1 && *s2)
15499        return (*s1 < *s2) ? -1 : +1;
15500    if (*s1)
15501        return 1;
15502    if (*s2)
15503        return -1;
15504    return 0;
15505}
15506
15507int
15508Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15509{
15510    Py_UNICODE u1, u2;
15511    for (; n != 0; n--) {
15512        u1 = *s1;
15513        u2 = *s2;
15514        if (u1 != u2)
15515            return (u1 < u2) ? -1 : +1;
15516        if (u1 == '\0')
15517            return 0;
15518        s1++;
15519        s2++;
15520    }
15521    return 0;
15522}
15523
15524Py_UNICODE*
15525Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15526{
15527    const Py_UNICODE *p;
15528    for (p = s; *p; p++)
15529        if (*p == c)
15530            return (Py_UNICODE*)p;
15531    return NULL;
15532}
15533
15534Py_UNICODE*
15535Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15536{
15537    const Py_UNICODE *p;
15538    p = s + Py_UNICODE_strlen(s);
15539    while (p != s) {
15540        p--;
15541        if (*p == c)
15542            return (Py_UNICODE*)p;
15543    }
15544    return NULL;
15545}
15546
15547Py_UNICODE*
15548PyUnicode_AsUnicodeCopy(PyObject *unicode)
15549{
15550    Py_UNICODE *u, *copy;
15551    Py_ssize_t len, size;
15552
15553    if (!PyUnicode_Check(unicode)) {
15554        PyErr_BadArgument();
15555        return NULL;
15556    }
15557    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15558    if (u == NULL)
15559        return NULL;
15560    /* Ensure we won't overflow the size. */
15561    if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15562        PyErr_NoMemory();
15563        return NULL;
15564    }
15565    size = len + 1; /* copy the null character */
15566    size *= sizeof(Py_UNICODE);
15567    copy = PyMem_Malloc(size);
15568    if (copy == NULL) {
15569        PyErr_NoMemory();
15570        return NULL;
15571    }
15572    memcpy(copy, u, size);
15573    return copy;
15574}
15575
15576/* A _string module, to export formatter_parser and formatter_field_name_split
15577   to the string.Formatter class implemented in Python. */
15578
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15586
/* Module definition for _string.  The trailing NULL entries leave the
   remaining optional PyModuleDef fields unset. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* per-module state size */
    _string_methods,                    /* module-level functions */
    NULL,
    NULL,
    NULL,
    NULL
};
15598
15599PyMODINIT_FUNC
15600PyInit__string(void)
15601{
15602    return PyModule_Create(&_string_module);
15603}
15604
15605
15606#ifdef __cplusplus
15607}
15608#endif
15609