unicodeobject.c revision 0d92c4f667518c7a24abda885e10c0c8e72cae57
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/* --- Globals ------------------------------------------------------------
51
52   The globals are initialized by the _PyUnicode_Init() API and should
53   not be used before calling that API.
54
55*/
56
57
58#ifdef __cplusplus
59extern "C" {
60#endif
61
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check applied to an object before its fields are accessed.
   Debug builds run the structural consistency check with check_content
   set to 0 (the character data is not scanned); other builds only
   perform the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
70
/* Field accessors.  The macros prefixed with an underscore and written
   without assert() are raw lvalue accessors: they cast to the structure
   that declares the field and perform no checks.  The other macros
   validate the object first. */

/* Raw access to the utf8 pointer stored in PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer of a ready string.  For compact ASCII strings the
   result is the address right after the PyASCIIObject header; otherwise
   it is the stored utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field stored in PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string.  For compact ASCII strings this
   is the length field of PyASCIIObject; otherwise the stored
   utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr pointer (declared in PyASCIIObject). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw access to wstr_length (declared in PyCompactUnicodeObject). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw access to the length field. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw access to the state bit-field structure. */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw access to the cached hash. */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after validating the object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length, after validating the object. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
105
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The result is the bitwise OR of both arguments, so it is greater than
   or equal to the true maximum rather than exactly equal to it.
   NOTE(review): presumably this is safe because the caller only needs
   the value to select a storage width -- confirm against
   PyUnicode_New(). */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
110
/* File-local replacement for PyUnicode_READY(): after validating the
   object, evaluates to 0 when the string is already ready and otherwise
   to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
117
/* True if the utf8 pointer of the object is the same address as its
   character data, i.e. no separate UTF-8 buffer exists.  Must not be used
   on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the object is the same address as its
   character data.  The macro only refers to its own parameter, so it
   works whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
125
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its
   utf8 pointer is non-NULL, and that pointer differs from the character
   data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the object
   is not ready yet or wstr differs from the character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once.  The "to" argument is
   parenthesized before it is cast, so an expression such as "buf + off"
   is evaluated first and only the resulting pointer is converted to
   "to_type *".  The main loop copies four characters per iteration; the
   trailing loop handles the remaining 0..3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
165
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by the character value
   (0..255); an entry is NULL until a string for that character has been
   cached. */
static PyObject *unicode_latin1[256];
185
/* Fast detection of the most frequent whitespace characters.
   One entry per ASCII code point (0x00..0x7F), sixteen entries per row;
   an entry is 1 when the code point is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: CHARACTER TABULATION (0x09), LINE FEED (0x0A),
       LINE TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
216
/* Forward declarations of file-local (static) helpers. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors taking a buffer of fixed-width characters and its size.
   NOTE(review): presumably each returns a new string object -- confirm
   against the definitions. */
static PyObject *
_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error-handling helpers; their semantics are defined with
   their implementations. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
242
/* Fast detection of the ASCII line-break characters.
   One entry per ASCII code point (0x00..0x7F), sixteen entries per row;
   an entry is 1 when the code point is a line break and 0 otherwise.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: LINE FEED (0x0A), LINE TABULATION (0x0B),
       FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
270
271/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
272   This function is kept for backward compatibility with the old API. */
273Py_UNICODE
274PyUnicode_GetMax(void)
275{
276#ifdef Py_UNICODE_WIDE
277    return 0x10FFFF;
278#else
279    /* This is actually an illegal character, so it should
280       not be passed to unichr. */
281    return 0xFFFF;
282#endif
283}
284
#ifdef Py_DEBUG
/* Debug-build validation of the internal invariants of a unicode object.

   op must be a unicode object (asserted).  The state flags, the kind and
   the data / utf8 / wstr pointers and lengths are checked against each
   other for each of the layouts handled below: compact ASCII, compact
   non-ASCII, legacy not-ready (wchar kind) and legacy ready.

   When check_content is non-zero and the string is not of wchar kind, the
   characters are scanned as well to verify that the narrowest suitable
   kind is used and that the character at index `length` is 0.

   Every violation trips an assert(); the function itself always returns 1
   so that it can be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1 byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: the characters start right after the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready yet: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* legacy ASCII: utf8 aliases the character data */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the character data exactly when the kind has
               the same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest character actually stored */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character following the last one must be 0 */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
400
401static PyObject*
402unicode_result_wchar(PyObject *unicode)
403{
404#ifndef Py_DEBUG
405    Py_ssize_t len;
406
407    len = _PyUnicode_WSTR_LENGTH(unicode);
408    if (len == 0) {
409        Py_INCREF(unicode_empty);
410        Py_DECREF(unicode);
411        return unicode_empty;
412    }
413
414    if (len == 1) {
415        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
416        if (ch < 256) {
417            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
418            Py_DECREF(unicode);
419            return latin1_char;
420        }
421    }
422
423    if (_PyUnicode_Ready(unicode) < 0) {
424        Py_DECREF(unicode);
425        return NULL;
426    }
427#else
428    assert(Py_REFCNT(unicode) == 1);
429
430    /* don't make the result ready in debug mode to ensure that the caller
431       makes the string ready before using it */
432    assert(_PyUnicode_CheckConsistency(unicode, 1));
433#endif
434    return unicode;
435}
436
437static PyObject*
438unicode_result_ready(PyObject *unicode)
439{
440    Py_ssize_t length;
441
442    length = PyUnicode_GET_LENGTH(unicode);
443    if (length == 0) {
444        if (unicode != unicode_empty) {
445            Py_INCREF(unicode_empty);
446            Py_DECREF(unicode);
447        }
448        return unicode_empty;
449    }
450
451    if (length == 1) {
452        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
453        if (ch < 256) {
454            PyObject *latin1_char = unicode_latin1[ch];
455            if (latin1_char != NULL) {
456                if (unicode != latin1_char) {
457                    Py_INCREF(latin1_char);
458                    Py_DECREF(unicode);
459                }
460                return latin1_char;
461            }
462            else {
463                assert(_PyUnicode_CheckConsistency(unicode, 1));
464                Py_INCREF(unicode);
465                unicode_latin1[ch] = unicode;
466                return unicode;
467            }
468        }
469    }
470
471    assert(_PyUnicode_CheckConsistency(unicode, 1));
472    return unicode;
473}
474
475static PyObject*
476unicode_result(PyObject *unicode)
477{
478    assert(_PyUnicode_CHECK(unicode));
479    if (PyUnicode_IS_READY(unicode))
480        return unicode_result_ready(unicode);
481    else
482        return unicode_result_wchar(unicode);
483}
484
485static PyObject*
486unicode_result_unchanged(PyObject *unicode)
487{
488    if (PyUnicode_CheckExact(unicode)) {
489        if (PyUnicode_READY(unicode) == -1)
490            return NULL;
491        Py_INCREF(unicode);
492        return unicode;
493    }
494    else
495        /* Subtype -- return genuine unicode string with the same value. */
496        return _PyUnicode_Copy(unicode);
497}
498
#ifdef HAVE_MBCS
/* Windows version information; only compiled when HAVE_MBCS is defined.
   NOTE(review): presumably filled in during module initialization --
   confirm where it is written. */
static OSVERSIONINFOEX winver;
#endif
502
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the largest of 128, 64 and 32 that does not exceed
   LONG_BIT; narrower longs are rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit selected by ch in mask (mask is modified);
   BLOOM tests that bit and is non-zero when it is set. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line-break test: characters below 128 are looked up in
   ascii_linebreak[]; for the others the bloom mask is tested first and
   Py_UNICODE_ISLINEBREAK() is only evaluated when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
531
532Py_LOCAL_INLINE(BLOOM_MASK)
533make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
534{
535    /* calculate simple bloom-style bitmask for a given unicode string */
536
537    BLOOM_MASK mask;
538    Py_ssize_t i;
539
540    mask = 0;
541    for (i = 0; i < len; i++)
542        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
543
544    return mask;
545}
546
/* Membership test: true when the bloom bit for chr is set in mask and
   PyUnicode_FindChar() then locates chr in str.  The search is only
   evaluated when the mask test succeeds. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
550
551/* Compilation of templated routines */
552
553#include "stringlib/asciilib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs1lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs2lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
583#include "stringlib/ucs4lib.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/partition.h"
586#include "stringlib/split.h"
587#include "stringlib/count.h"
588#include "stringlib/find.h"
589#include "stringlib/find_max_char.h"
590#include "stringlib/localeutil.h"
591#include "stringlib/undef.h"
592
593#include "stringlib/unicodedefs.h"
594#include "stringlib/fastsearch.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/undef.h"
598
599/* --- Unicode Object ----------------------------------------------------- */
600
/* Forward declaration.  fixfct is a callback that receives a string
   object and returns a Py_UCS4. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
603
604Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
605                                     Py_ssize_t size, Py_UCS4 ch,
606                                     int direction)
607{
608    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
609
610    switch (kind) {
611    case PyUnicode_1BYTE_KIND:
612        {
613            Py_UCS1 ch1 = (Py_UCS1) ch;
614            if (ch1 == ch)
615                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
616            else
617                return -1;
618        }
619    case PyUnicode_2BYTE_KIND:
620        {
621            Py_UCS2 ch2 = (Py_UCS2) ch;
622            if (ch2 == ch)
623                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
624            else
625                return -1;
626        }
627    case PyUnicode_4BYTE_KIND:
628        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
629    default:
630        assert(0);
631        return -1;
632    }
633}
634
#ifdef Py_DEBUG
/* Overwrite the characters added by a resize, i.e. those at indexes
   old_length up to the current length, with 0xff bytes so that code
   reading them before they are written is caught earlier.  Nothing is
   done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the character width in bytes */
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_width, 0xff,
               (new_length - old_length) * char_width);
    }
}
#endif
653
654static PyObject*
655resize_compact(PyObject *unicode, Py_ssize_t length)
656{
657    Py_ssize_t char_size;
658    Py_ssize_t struct_size;
659    Py_ssize_t new_size;
660    int share_wstr;
661    PyObject *new_unicode;
662#ifdef Py_DEBUG
663    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
664#endif
665
666    assert(unicode_modifiable(unicode));
667    assert(PyUnicode_IS_READY(unicode));
668    assert(PyUnicode_IS_COMPACT(unicode));
669
670    char_size = PyUnicode_KIND(unicode);
671    if (PyUnicode_IS_ASCII(unicode))
672        struct_size = sizeof(PyASCIIObject);
673    else
674        struct_size = sizeof(PyCompactUnicodeObject);
675    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
676
677    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
678        PyErr_NoMemory();
679        return NULL;
680    }
681    new_size = (struct_size + (length + 1) * char_size);
682
683    _Py_DEC_REFTOTAL;
684    _Py_ForgetReference(unicode);
685
686    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
687    if (new_unicode == NULL) {
688        _Py_NewReference(unicode);
689        PyErr_NoMemory();
690        return NULL;
691    }
692    unicode = new_unicode;
693    _Py_NewReference(unicode);
694
695    _PyUnicode_LENGTH(unicode) = length;
696    if (share_wstr) {
697        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
698        if (!PyUnicode_IS_ASCII(unicode))
699            _PyUnicode_WSTR_LENGTH(unicode) = length;
700    }
701#ifdef Py_DEBUG
702    unicode_fill_invalid(unicode, old_length);
703#endif
704    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705                    length, 0);
706    assert(_PyUnicode_CheckConsistency(unicode, 0));
707    return unicode;
708}
709
710static int
711resize_inplace(PyObject *unicode, Py_ssize_t length)
712{
713    wchar_t *wstr;
714    Py_ssize_t new_size;
715    assert(!PyUnicode_IS_COMPACT(unicode));
716    assert(Py_REFCNT(unicode) == 1);
717
718    if (PyUnicode_IS_READY(unicode)) {
719        Py_ssize_t char_size;
720        int share_wstr, share_utf8;
721        void *data;
722#ifdef Py_DEBUG
723        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
724#endif
725
726        data = _PyUnicode_DATA_ANY(unicode);
727        char_size = PyUnicode_KIND(unicode);
728        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
729        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
730
731        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
732            PyErr_NoMemory();
733            return -1;
734        }
735        new_size = (length + 1) * char_size;
736
737        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
738        {
739            PyObject_DEL(_PyUnicode_UTF8(unicode));
740            _PyUnicode_UTF8(unicode) = NULL;
741            _PyUnicode_UTF8_LENGTH(unicode) = 0;
742        }
743
744        data = (PyObject *)PyObject_REALLOC(data, new_size);
745        if (data == NULL) {
746            PyErr_NoMemory();
747            return -1;
748        }
749        _PyUnicode_DATA_ANY(unicode) = data;
750        if (share_wstr) {
751            _PyUnicode_WSTR(unicode) = data;
752            _PyUnicode_WSTR_LENGTH(unicode) = length;
753        }
754        if (share_utf8) {
755            _PyUnicode_UTF8(unicode) = data;
756            _PyUnicode_UTF8_LENGTH(unicode) = length;
757        }
758        _PyUnicode_LENGTH(unicode) = length;
759        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
760#ifdef Py_DEBUG
761        unicode_fill_invalid(unicode, old_length);
762#endif
763        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
764            assert(_PyUnicode_CheckConsistency(unicode, 0));
765            return 0;
766        }
767    }
768    assert(_PyUnicode_WSTR(unicode) != NULL);
769
770    /* check for integer overflow */
771    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
772        PyErr_NoMemory();
773        return -1;
774    }
775    new_size = sizeof(wchar_t) * (length + 1);
776    wstr =  _PyUnicode_WSTR(unicode);
777    wstr = PyObject_REALLOC(wstr, new_size);
778    if (!wstr) {
779        PyErr_NoMemory();
780        return -1;
781    }
782    _PyUnicode_WSTR(unicode) = wstr;
783    _PyUnicode_WSTR(unicode)[length] = 0;
784    _PyUnicode_WSTR_LENGTH(unicode) = length;
785    assert(_PyUnicode_CheckConsistency(unicode, 0));
786    return 0;
787}
788
789static PyObject*
790resize_copy(PyObject *unicode, Py_ssize_t length)
791{
792    Py_ssize_t copy_length;
793    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
794        PyObject *copy;
795
796        if (PyUnicode_READY(unicode) == -1)
797            return NULL;
798
799        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
800        if (copy == NULL)
801            return NULL;
802
803        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
804        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
805        return copy;
806    }
807    else {
808        PyObject *w;
809
810        w = (PyObject*)_PyUnicode_New(length);
811        if (w == NULL)
812            return NULL;
813        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
814        copy_length = Py_MIN(copy_length, length);
815        Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
816                  copy_length * sizeof(wchar_t));
817        return w;
818    }
819}
820
821/* We allocate one more byte to make sure the string is
822   Ux0000 terminated; some code (e.g. new_identifier)
823   relies on that.
824
825   XXX This allocator could further be enhanced by assuring that the
826   free list never reduces its size below 1.
827
828*/
829
830static PyUnicodeObject *
831_PyUnicode_New(Py_ssize_t length)
832{
833    register PyUnicodeObject *unicode;
834    size_t new_size;
835
836    /* Optimization for empty strings */
837    if (length == 0 && unicode_empty != NULL) {
838        Py_INCREF(unicode_empty);
839        return (PyUnicodeObject*)unicode_empty;
840    }
841
842    /* Ensure we won't overflow the size. */
843    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
844        return (PyUnicodeObject *)PyErr_NoMemory();
845    }
846    if (length < 0) {
847        PyErr_SetString(PyExc_SystemError,
848                        "Negative size passed to _PyUnicode_New");
849        return NULL;
850    }
851
852    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
853    if (unicode == NULL)
854        return NULL;
855    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
856    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
857    if (!_PyUnicode_WSTR(unicode)) {
858        Py_DECREF(unicode);
859        PyErr_NoMemory();
860        return NULL;
861    }
862
863    /* Initialize the first element to guard against cases where
864     * the caller fails before initializing str -- unicode_resize()
865     * reads str[0], and the Keep-Alive optimization can keep memory
866     * allocated for str alive across a call to unicode_dealloc(unicode).
867     * We don't want unicode_resize to read uninitialized memory in
868     * that case.
869     */
870    _PyUnicode_WSTR(unicode)[0] = 0;
871    _PyUnicode_WSTR(unicode)[length] = 0;
872    _PyUnicode_WSTR_LENGTH(unicode) = length;
873    _PyUnicode_HASH(unicode) = -1;
874    _PyUnicode_STATE(unicode).interned = 0;
875    _PyUnicode_STATE(unicode).kind = 0;
876    _PyUnicode_STATE(unicode).compact = 0;
877    _PyUnicode_STATE(unicode).ready = 0;
878    _PyUnicode_STATE(unicode).ascii = 0;
879    _PyUnicode_DATA_ANY(unicode) = NULL;
880    _PyUnicode_LENGTH(unicode) = 0;
881    _PyUnicode_UTF8(unicode) = NULL;
882    _PyUnicode_UTF8_LENGTH(unicode) = 0;
883    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
884    return unicode;
885}
886
887static const char*
888unicode_kind_name(PyObject *unicode)
889{
890    /* don't check consistency: unicode_kind_name() is called from
891       _PyUnicode_Dump() */
892    if (!PyUnicode_IS_COMPACT(unicode))
893    {
894        if (!PyUnicode_IS_READY(unicode))
895            return "wstr";
896        switch (PyUnicode_KIND(unicode))
897        {
898        case PyUnicode_1BYTE_KIND:
899            if (PyUnicode_IS_ASCII(unicode))
900                return "legacy ascii";
901            else
902                return "legacy latin1";
903        case PyUnicode_2BYTE_KIND:
904            return "legacy UCS2";
905        case PyUnicode_4BYTE_KIND:
906            return "legacy UCS4";
907        default:
908            return "<legacy invalid kind>";
909        }
910    }
911    assert(PyUnicode_IS_READY(unicode));
912    switch (PyUnicode_KIND(unicode)) {
913    case PyUnicode_1BYTE_KIND:
914        if (PyUnicode_IS_ASCII(unicode))
915            return "ascii";
916        else
917            return "latin1";
918    case PyUnicode_2BYTE_KIND:
919        return "UCS2";
920    case PyUnicode_4BYTE_KIND:
921        return "UCS4";
922    default:
923        return "<invalid compact kind>";
924    }
925}
926
927#ifdef Py_DEBUG
/* Function wrappers around macros, so that they can be called from a
   debugger. */

/* Return the value of the PyUnicode_UTF8() macro applied to `unicode`. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
932
/* Return the value of the _PyUnicode_COMPACT_DATA() macro applied to
   `unicode` (debugger helper). */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
936void *_PyUnicode_data(void *unicode){
937    printf("obj %p\n", unicode);
938    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
939    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
940    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
941    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
942    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
943    return PyUnicode_DATA(unicode);
944}
945
946void
947_PyUnicode_Dump(PyObject *op)
948{
949    PyASCIIObject *ascii = (PyASCIIObject *)op;
950    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
951    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
952    void *data;
953
954    if (ascii->state.compact)
955    {
956        if (ascii->state.ascii)
957            data = (ascii + 1);
958        else
959            data = (compact + 1);
960    }
961    else
962        data = unicode->data.any;
963    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
964
965    if (ascii->wstr == data)
966        printf("shared ");
967    printf("wstr=%p", ascii->wstr);
968
969    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
970        printf(" (%zu), ", compact->wstr_length);
971        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
972            printf("shared ");
973        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
974    }
975    printf(", data=%p\n", data);
976}
977#endif
978
979PyObject *
980PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
981{
982    PyObject *obj;
983    PyCompactUnicodeObject *unicode;
984    void *data;
985    enum PyUnicode_Kind kind;
986    int is_sharing, is_ascii;
987    Py_ssize_t char_size;
988    Py_ssize_t struct_size;
989
990    /* Optimization for empty strings */
991    if (size == 0 && unicode_empty != NULL) {
992        Py_INCREF(unicode_empty);
993        return unicode_empty;
994    }
995
996    is_ascii = 0;
997    is_sharing = 0;
998    struct_size = sizeof(PyCompactUnicodeObject);
999    if (maxchar < 128) {
1000        kind = PyUnicode_1BYTE_KIND;
1001        char_size = 1;
1002        is_ascii = 1;
1003        struct_size = sizeof(PyASCIIObject);
1004    }
1005    else if (maxchar < 256) {
1006        kind = PyUnicode_1BYTE_KIND;
1007        char_size = 1;
1008    }
1009    else if (maxchar < 65536) {
1010        kind = PyUnicode_2BYTE_KIND;
1011        char_size = 2;
1012        if (sizeof(wchar_t) == 2)
1013            is_sharing = 1;
1014    }
1015    else {
1016        if (maxchar > MAX_UNICODE) {
1017            PyErr_SetString(PyExc_SystemError,
1018                            "invalid maximum character passed to PyUnicode_New");
1019            return NULL;
1020        }
1021        kind = PyUnicode_4BYTE_KIND;
1022        char_size = 4;
1023        if (sizeof(wchar_t) == 4)
1024            is_sharing = 1;
1025    }
1026
1027    /* Ensure we won't overflow the size. */
1028    if (size < 0) {
1029        PyErr_SetString(PyExc_SystemError,
1030                        "Negative size passed to PyUnicode_New");
1031        return NULL;
1032    }
1033    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1034        return PyErr_NoMemory();
1035
1036    /* Duplicated allocation code from _PyObject_New() instead of a call to
1037     * PyObject_New() so we are able to allocate space for the object and
1038     * it's data buffer.
1039     */
1040    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1041    if (obj == NULL)
1042        return PyErr_NoMemory();
1043    obj = PyObject_INIT(obj, &PyUnicode_Type);
1044    if (obj == NULL)
1045        return NULL;
1046
1047    unicode = (PyCompactUnicodeObject *)obj;
1048    if (is_ascii)
1049        data = ((PyASCIIObject*)obj) + 1;
1050    else
1051        data = unicode + 1;
1052    _PyUnicode_LENGTH(unicode) = size;
1053    _PyUnicode_HASH(unicode) = -1;
1054    _PyUnicode_STATE(unicode).interned = 0;
1055    _PyUnicode_STATE(unicode).kind = kind;
1056    _PyUnicode_STATE(unicode).compact = 1;
1057    _PyUnicode_STATE(unicode).ready = 1;
1058    _PyUnicode_STATE(unicode).ascii = is_ascii;
1059    if (is_ascii) {
1060        ((char*)data)[size] = 0;
1061        _PyUnicode_WSTR(unicode) = NULL;
1062    }
1063    else if (kind == PyUnicode_1BYTE_KIND) {
1064        ((char*)data)[size] = 0;
1065        _PyUnicode_WSTR(unicode) = NULL;
1066        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1067        unicode->utf8 = NULL;
1068        unicode->utf8_length = 0;
1069    }
1070    else {
1071        unicode->utf8 = NULL;
1072        unicode->utf8_length = 0;
1073        if (kind == PyUnicode_2BYTE_KIND)
1074            ((Py_UCS2*)data)[size] = 0;
1075        else /* kind == PyUnicode_4BYTE_KIND */
1076            ((Py_UCS4*)data)[size] = 0;
1077        if (is_sharing) {
1078            _PyUnicode_WSTR_LENGTH(unicode) = size;
1079            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1080        }
1081        else {
1082            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1083            _PyUnicode_WSTR(unicode) = NULL;
1084        }
1085    }
1086#ifdef Py_DEBUG
1087    unicode_fill_invalid((PyObject*)unicode, 0);
1088#endif
1089    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1090    return obj;
1091}
1092
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing into the
   4-byte data buffer of `unicode`.  A high surrogate immediately followed by
   a low surrogate is joined into one code point; any other code unit is
   copied unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Surrogate pair: two code units produce one code point. */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1132
1133static int
1134unicode_check_modifiable(PyObject *unicode)
1135{
1136    if (!unicode_modifiable(unicode)) {
1137        PyErr_SetString(PyExc_SystemError,
1138                        "Cannot modify a string currently used");
1139        return -1;
1140    }
1141    return 0;
1142}
1143
1144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146                 PyObject *from, Py_ssize_t from_start,
1147                 Py_ssize_t how_many, int check_maxchar)
1148{
1149    unsigned int from_kind, to_kind;
1150    void *from_data, *to_data;
1151
1152    assert(0 <= how_many);
1153    assert(0 <= from_start);
1154    assert(0 <= to_start);
1155    assert(PyUnicode_Check(from));
1156    assert(PyUnicode_IS_READY(from));
1157    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1158
1159    assert(PyUnicode_Check(to));
1160    assert(PyUnicode_IS_READY(to));
1161    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1162
1163    if (how_many == 0)
1164        return 0;
1165
1166    from_kind = PyUnicode_KIND(from);
1167    from_data = PyUnicode_DATA(from);
1168    to_kind = PyUnicode_KIND(to);
1169    to_data = PyUnicode_DATA(to);
1170
1171#ifdef Py_DEBUG
1172    if (!check_maxchar
1173        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1174    {
1175        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1176        Py_UCS4 ch;
1177        Py_ssize_t i;
1178        for (i=0; i < how_many; i++) {
1179            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1180            assert(ch <= to_maxchar);
1181        }
1182    }
1183#endif
1184
1185    if (from_kind == to_kind) {
1186        if (check_maxchar
1187            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1188        {
1189            /* Writing Latin-1 characters into an ASCII string requires to
1190               check that all written characters are pure ASCII */
1191            Py_UCS4 max_char;
1192            max_char = ucs1lib_find_max_char(from_data,
1193                                             (Py_UCS1*)from_data + how_many);
1194            if (max_char >= 128)
1195                return -1;
1196        }
1197        Py_MEMCPY((char*)to_data + to_kind * to_start,
1198                  (char*)from_data + from_kind * from_start,
1199                  to_kind * how_many);
1200    }
1201    else if (from_kind == PyUnicode_1BYTE_KIND
1202             && to_kind == PyUnicode_2BYTE_KIND)
1203    {
1204        _PyUnicode_CONVERT_BYTES(
1205            Py_UCS1, Py_UCS2,
1206            PyUnicode_1BYTE_DATA(from) + from_start,
1207            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1208            PyUnicode_2BYTE_DATA(to) + to_start
1209            );
1210    }
1211    else if (from_kind == PyUnicode_1BYTE_KIND
1212             && to_kind == PyUnicode_4BYTE_KIND)
1213    {
1214        _PyUnicode_CONVERT_BYTES(
1215            Py_UCS1, Py_UCS4,
1216            PyUnicode_1BYTE_DATA(from) + from_start,
1217            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1218            PyUnicode_4BYTE_DATA(to) + to_start
1219            );
1220    }
1221    else if (from_kind == PyUnicode_2BYTE_KIND
1222             && to_kind == PyUnicode_4BYTE_KIND)
1223    {
1224        _PyUnicode_CONVERT_BYTES(
1225            Py_UCS2, Py_UCS4,
1226            PyUnicode_2BYTE_DATA(from) + from_start,
1227            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1228            PyUnicode_4BYTE_DATA(to) + to_start
1229            );
1230    }
1231    else {
1232        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1233
1234        if (!check_maxchar) {
1235            if (from_kind == PyUnicode_2BYTE_KIND
1236                && to_kind == PyUnicode_1BYTE_KIND)
1237            {
1238                _PyUnicode_CONVERT_BYTES(
1239                    Py_UCS2, Py_UCS1,
1240                    PyUnicode_2BYTE_DATA(from) + from_start,
1241                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1242                    PyUnicode_1BYTE_DATA(to) + to_start
1243                    );
1244            }
1245            else if (from_kind == PyUnicode_4BYTE_KIND
1246                     && to_kind == PyUnicode_1BYTE_KIND)
1247            {
1248                _PyUnicode_CONVERT_BYTES(
1249                    Py_UCS4, Py_UCS1,
1250                    PyUnicode_4BYTE_DATA(from) + from_start,
1251                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1252                    PyUnicode_1BYTE_DATA(to) + to_start
1253                    );
1254            }
1255            else if (from_kind == PyUnicode_4BYTE_KIND
1256                     && to_kind == PyUnicode_2BYTE_KIND)
1257            {
1258                _PyUnicode_CONVERT_BYTES(
1259                    Py_UCS4, Py_UCS2,
1260                    PyUnicode_4BYTE_DATA(from) + from_start,
1261                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1262                    PyUnicode_2BYTE_DATA(to) + to_start
1263                    );
1264            }
1265            else {
1266                assert(0);
1267                return -1;
1268            }
1269        }
1270        else {
1271            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1272            Py_UCS4 ch;
1273            Py_ssize_t i;
1274
1275            for (i=0; i < how_many; i++) {
1276                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1277                if (ch > to_maxchar)
1278                    return -1;
1279                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1280            }
1281        }
1282    }
1283    return 0;
1284}
1285
1286void
1287_PyUnicode_FastCopyCharacters(
1288    PyObject *to, Py_ssize_t to_start,
1289    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1290{
1291    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1292}
1293
1294Py_ssize_t
1295PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1296                         PyObject *from, Py_ssize_t from_start,
1297                         Py_ssize_t how_many)
1298{
1299    int err;
1300
1301    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1302        PyErr_BadInternalCall();
1303        return -1;
1304    }
1305
1306    if (PyUnicode_READY(from) == -1)
1307        return -1;
1308    if (PyUnicode_READY(to) == -1)
1309        return -1;
1310
1311    if (from_start < 0) {
1312        PyErr_SetString(PyExc_IndexError, "string index out of range");
1313        return -1;
1314    }
1315    if (to_start < 0) {
1316        PyErr_SetString(PyExc_IndexError, "string index out of range");
1317        return -1;
1318    }
1319    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1320    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1321        PyErr_Format(PyExc_SystemError,
1322                     "Cannot write %zi characters at %zi "
1323                     "in a string of %zi characters",
1324                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1325        return -1;
1326    }
1327
1328    if (how_many == 0)
1329        return 0;
1330
1331    if (unicode_check_modifiable(to))
1332        return -1;
1333
1334    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1335    if (err) {
1336        PyErr_Format(PyExc_SystemError,
1337                     "Cannot copy %s characters "
1338                     "into a string of %s characters",
1339                     unicode_kind_name(from),
1340                     unicode_kind_name(to));
1341        return -1;
1342    }
1343    return how_many;
1344}
1345
1346/* Find the maximum code point and count the number of surrogate pairs so a
1347   correct string length can be computed before converting a string to UCS4.
1348   This function counts single surrogates as a character and not as a pair.
1349
1350   Return 0 on success, or -1 on error. */
1351static int
1352find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1353                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1354{
1355    const wchar_t *iter;
1356    Py_UCS4 ch;
1357
1358    assert(num_surrogates != NULL && maxchar != NULL);
1359    *num_surrogates = 0;
1360    *maxchar = 0;
1361
1362    for (iter = begin; iter < end; ) {
1363#if SIZEOF_WCHAR_T == 2
1364        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1365            && (iter+1) < end
1366            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1367        {
1368            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1369            ++(*num_surrogates);
1370            iter += 2;
1371        }
1372        else
1373#endif
1374        {
1375            ch = *iter;
1376            iter++;
1377        }
1378        if (ch > *maxchar) {
1379            *maxchar = ch;
1380            if (*maxchar > MAX_UNICODE) {
1381                PyErr_Format(PyExc_ValueError,
1382                             "character U+%x is not in range [U+0000; U+10ffff]",
1383                             ch);
1384                return -1;
1385            }
1386        }
1387    }
1388    return 0;
1389}
1390
1391int
1392_PyUnicode_Ready(PyObject *unicode)
1393{
1394    wchar_t *end;
1395    Py_UCS4 maxchar = 0;
1396    Py_ssize_t num_surrogates;
1397#if SIZEOF_WCHAR_T == 2
1398    Py_ssize_t length_wo_surrogates;
1399#endif
1400
1401    /* _PyUnicode_Ready() is only intended for old-style API usage where
1402       strings were created using _PyObject_New() and where no canonical
1403       representation (the str field) has been set yet aka strings
1404       which are not yet ready. */
1405    assert(_PyUnicode_CHECK(unicode));
1406    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1407    assert(_PyUnicode_WSTR(unicode) != NULL);
1408    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1409    assert(_PyUnicode_UTF8(unicode) == NULL);
1410    /* Actually, it should neither be interned nor be anything else: */
1411    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1412
1413    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1414    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1415                                &maxchar, &num_surrogates) == -1)
1416        return -1;
1417
1418    if (maxchar < 256) {
1419        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1420        if (!_PyUnicode_DATA_ANY(unicode)) {
1421            PyErr_NoMemory();
1422            return -1;
1423        }
1424        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1425                                _PyUnicode_WSTR(unicode), end,
1426                                PyUnicode_1BYTE_DATA(unicode));
1427        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1428        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1429        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1430        if (maxchar < 128) {
1431            _PyUnicode_STATE(unicode).ascii = 1;
1432            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1433            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1434        }
1435        else {
1436            _PyUnicode_STATE(unicode).ascii = 0;
1437            _PyUnicode_UTF8(unicode) = NULL;
1438            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1439        }
1440        PyObject_FREE(_PyUnicode_WSTR(unicode));
1441        _PyUnicode_WSTR(unicode) = NULL;
1442        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1443    }
1444    /* In this case we might have to convert down from 4-byte native
1445       wchar_t to 2-byte unicode. */
1446    else if (maxchar < 65536) {
1447        assert(num_surrogates == 0 &&
1448               "FindMaxCharAndNumSurrogatePairs() messed up");
1449
1450#if SIZEOF_WCHAR_T == 2
1451        /* We can share representations and are done. */
1452        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1453        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1456        _PyUnicode_UTF8(unicode) = NULL;
1457        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1458#else
1459        /* sizeof(wchar_t) == 4 */
1460        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1461            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1462        if (!_PyUnicode_DATA_ANY(unicode)) {
1463            PyErr_NoMemory();
1464            return -1;
1465        }
1466        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1467                                _PyUnicode_WSTR(unicode), end,
1468                                PyUnicode_2BYTE_DATA(unicode));
1469        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1470        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1471        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1472        _PyUnicode_UTF8(unicode) = NULL;
1473        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1474        PyObject_FREE(_PyUnicode_WSTR(unicode));
1475        _PyUnicode_WSTR(unicode) = NULL;
1476        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1477#endif
1478    }
1479    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1480    else {
1481#if SIZEOF_WCHAR_T == 2
1482        /* in case the native representation is 2-bytes, we need to allocate a
1483           new normalized 4-byte version. */
1484        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1485        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1486        if (!_PyUnicode_DATA_ANY(unicode)) {
1487            PyErr_NoMemory();
1488            return -1;
1489        }
1490        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1491        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1492        _PyUnicode_UTF8(unicode) = NULL;
1493        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1494        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1495        _PyUnicode_STATE(unicode).ready = 1;
1496        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1497        PyObject_FREE(_PyUnicode_WSTR(unicode));
1498        _PyUnicode_WSTR(unicode) = NULL;
1499        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500#else
1501        assert(num_surrogates == 0);
1502
1503        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1504        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1505        _PyUnicode_UTF8(unicode) = NULL;
1506        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1507        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1508#endif
1509        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1510    }
1511    _PyUnicode_STATE(unicode).ready = 1;
1512    assert(_PyUnicode_CheckConsistency(unicode, 1));
1513    return 0;
1514}
1515
1516static void
1517unicode_dealloc(register PyObject *unicode)
1518{
1519    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1520    case SSTATE_NOT_INTERNED:
1521        break;
1522
1523    case SSTATE_INTERNED_MORTAL:
1524        /* revive dead object temporarily for DelItem */
1525        Py_REFCNT(unicode) = 3;
1526        if (PyDict_DelItem(interned, unicode) != 0)
1527            Py_FatalError(
1528                "deletion of interned string failed");
1529        break;
1530
1531    case SSTATE_INTERNED_IMMORTAL:
1532        Py_FatalError("Immortal interned string died.");
1533
1534    default:
1535        Py_FatalError("Inconsistent interned string state.");
1536    }
1537
1538    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1539        PyObject_DEL(_PyUnicode_WSTR(unicode));
1540    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1541        PyObject_DEL(_PyUnicode_UTF8(unicode));
1542    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1543        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1544
1545    Py_TYPE(unicode)->tp_free(unicode);
1546}
1547
1548#ifdef Py_DEBUG
1549static int
1550unicode_is_singleton(PyObject *unicode)
1551{
1552    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1553    if (unicode == unicode_empty)
1554        return 1;
1555    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1556    {
1557        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1558        if (ch < 256 && unicode_latin1[ch] == unicode)
1559            return 1;
1560    }
1561    return 0;
1562}
1563#endif
1564
1565static int
1566unicode_modifiable(PyObject *unicode)
1567{
1568    assert(_PyUnicode_CHECK(unicode));
1569    if (Py_REFCNT(unicode) != 1)
1570        return 0;
1571    if (_PyUnicode_HASH(unicode) != -1)
1572        return 0;
1573    if (PyUnicode_CHECK_INTERNED(unicode))
1574        return 0;
1575    if (!PyUnicode_CheckExact(unicode))
1576        return 0;
1577#ifdef Py_DEBUG
1578    /* singleton refcount is greater than 1 */
1579    assert(!unicode_is_singleton(unicode));
1580#endif
1581    return 1;
1582}
1583
1584static int
1585unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1586{
1587    PyObject *unicode;
1588    Py_ssize_t old_length;
1589
1590    assert(p_unicode != NULL);
1591    unicode = *p_unicode;
1592
1593    assert(unicode != NULL);
1594    assert(PyUnicode_Check(unicode));
1595    assert(0 <= length);
1596
1597    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1598        old_length = PyUnicode_WSTR_LENGTH(unicode);
1599    else
1600        old_length = PyUnicode_GET_LENGTH(unicode);
1601    if (old_length == length)
1602        return 0;
1603
1604    if (length == 0) {
1605        Py_DECREF(*p_unicode);
1606        *p_unicode = unicode_empty;
1607        Py_INCREF(*p_unicode);
1608        return 0;
1609    }
1610
1611    if (!unicode_modifiable(unicode)) {
1612        PyObject *copy = resize_copy(unicode, length);
1613        if (copy == NULL)
1614            return -1;
1615        Py_DECREF(*p_unicode);
1616        *p_unicode = copy;
1617        return 0;
1618    }
1619
1620    if (PyUnicode_IS_COMPACT(unicode)) {
1621        PyObject *new_unicode = resize_compact(unicode, length);
1622        if (new_unicode == NULL)
1623            return -1;
1624        *p_unicode = new_unicode;
1625        return 0;
1626    }
1627    return resize_inplace(unicode, length);
1628}
1629
1630int
1631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1632{
1633    PyObject *unicode;
1634    if (p_unicode == NULL) {
1635        PyErr_BadInternalCall();
1636        return -1;
1637    }
1638    unicode = *p_unicode;
1639    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1640    {
1641        PyErr_BadInternalCall();
1642        return -1;
1643    }
1644    return unicode_resize(p_unicode, length);
1645}
1646
1647/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1648
1649   WARNING: The function doesn't copy the terminating null character and
1650   doesn't check the maximum character (may write a latin1 character in an
1651   ASCII string). */
1652static void
1653unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1654                   const char *str, Py_ssize_t len)
1655{
1656    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1657    void *data = PyUnicode_DATA(unicode);
1658    const char *end = str + len;
1659
1660    switch (kind) {
1661    case PyUnicode_1BYTE_KIND: {
1662        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1663#ifdef Py_DEBUG
1664        if (PyUnicode_IS_ASCII(unicode)) {
1665            Py_UCS4 maxchar = ucs1lib_find_max_char(
1666                (const Py_UCS1*)str,
1667                (const Py_UCS1*)str + len);
1668            assert(maxchar < 128);
1669        }
1670#endif
1671        memcpy((char *) data + index, str, len);
1672        break;
1673    }
1674    case PyUnicode_2BYTE_KIND: {
1675        Py_UCS2 *start = (Py_UCS2 *)data + index;
1676        Py_UCS2 *ucs2 = start;
1677        assert(index <= PyUnicode_GET_LENGTH(unicode));
1678
1679        for (; str < end; ++ucs2, ++str)
1680            *ucs2 = (Py_UCS2)*str;
1681
1682        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1683        break;
1684    }
1685    default: {
1686        Py_UCS4 *start = (Py_UCS4 *)data + index;
1687        Py_UCS4 *ucs4 = start;
1688        assert(kind == PyUnicode_4BYTE_KIND);
1689        assert(index <= PyUnicode_GET_LENGTH(unicode));
1690
1691        for (; str < end; ++ucs4, ++str)
1692            *ucs4 = (Py_UCS4)*str;
1693
1694        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1695    }
1696    }
1697}
1698
1699
1700static PyObject*
1701get_latin1_char(unsigned char ch)
1702{
1703    PyObject *unicode = unicode_latin1[ch];
1704    if (!unicode) {
1705        unicode = PyUnicode_New(1, ch);
1706        if (!unicode)
1707            return NULL;
1708        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1709        assert(_PyUnicode_CheckConsistency(unicode, 1));
1710        unicode_latin1[ch] = unicode;
1711    }
1712    Py_INCREF(unicode);
1713    return unicode;
1714}
1715
1716PyObject *
1717PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1718{
1719    PyObject *unicode;
1720    Py_UCS4 maxchar = 0;
1721    Py_ssize_t num_surrogates;
1722
1723    if (u == NULL)
1724        return (PyObject*)_PyUnicode_New(size);
1725
1726    /* If the Unicode data is known at construction time, we can apply
1727       some optimizations which share commonly used objects. */
1728
1729    /* Optimization for empty strings */
1730    if (size == 0 && unicode_empty != NULL) {
1731        Py_INCREF(unicode_empty);
1732        return unicode_empty;
1733    }
1734
1735    /* Single character Unicode objects in the Latin-1 range are
1736       shared when using this constructor */
1737    if (size == 1 && *u < 256)
1738        return get_latin1_char((unsigned char)*u);
1739
1740    /* If not empty and not single character, copy the Unicode data
1741       into the new object */
1742    if (find_maxchar_surrogates(u, u + size,
1743                                &maxchar, &num_surrogates) == -1)
1744        return NULL;
1745
1746    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1747    if (!unicode)
1748        return NULL;
1749
1750    switch (PyUnicode_KIND(unicode)) {
1751    case PyUnicode_1BYTE_KIND:
1752        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1753                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1754        break;
1755    case PyUnicode_2BYTE_KIND:
1756#if Py_UNICODE_SIZE == 2
1757        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1758#else
1759        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1760                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1761#endif
1762        break;
1763    case PyUnicode_4BYTE_KIND:
1764#if SIZEOF_WCHAR_T == 2
1765        /* This is the only case which has to process surrogates, thus
1766           a simple copy loop is not enough and we need a function. */
1767        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1768#else
1769        assert(num_surrogates == 0);
1770        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1771#endif
1772        break;
1773    default:
1774        assert(0 && "Impossible state");
1775    }
1776
1777    return unicode_result(unicode);
1778}
1779
1780PyObject *
1781PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1782{
1783    if (size < 0) {
1784        PyErr_SetString(PyExc_SystemError,
1785                        "Negative size passed to PyUnicode_FromStringAndSize");
1786        return NULL;
1787    }
1788    if (u != NULL)
1789        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1790    else
1791        return (PyObject *)_PyUnicode_New(size);
1792}
1793
1794PyObject *
1795PyUnicode_FromString(const char *u)
1796{
1797    size_t size = strlen(u);
1798    if (size > PY_SSIZE_T_MAX) {
1799        PyErr_SetString(PyExc_OverflowError, "input too long");
1800        return NULL;
1801    }
1802    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1803}
1804
1805PyObject *
1806_PyUnicode_FromId(_Py_Identifier *id)
1807{
1808    if (!id->object) {
1809        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1810                                                  strlen(id->string),
1811                                                  NULL, NULL);
1812        if (!id->object)
1813            return NULL;
1814        PyUnicode_InternInPlace(&id->object);
1815        assert(!id->next);
1816        id->next = static_strings;
1817        static_strings = id;
1818    }
1819    return id->object;
1820}
1821
1822void
1823_PyUnicode_ClearStaticStrings()
1824{
1825    _Py_Identifier *i;
1826    for (i = static_strings; i; i = i->next) {
1827        Py_DECREF(i->object);
1828        i->object = NULL;
1829        i->next = NULL;
1830    }
1831}
1832
1833/* Internal function, doesn't check maximum character */
1834
1835PyObject*
1836_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1837{
1838    const unsigned char *s = (const unsigned char *)buffer;
1839    PyObject *unicode;
1840    if (size == 1) {
1841#ifdef Py_DEBUG
1842        assert(s[0] < 128);
1843#endif
1844        return get_latin1_char(s[0]);
1845    }
1846    unicode = PyUnicode_New(size, 127);
1847    if (!unicode)
1848        return NULL;
1849    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1850    assert(_PyUnicode_CheckConsistency(unicode, 1));
1851    return unicode;
1852}
1853
1854static Py_UCS4
1855kind_maxchar_limit(unsigned int kind)
1856{
1857    switch (kind) {
1858    case PyUnicode_1BYTE_KIND:
1859        return 0x80;
1860    case PyUnicode_2BYTE_KIND:
1861        return 0x100;
1862    case PyUnicode_4BYTE_KIND:
1863        return 0x10000;
1864    default:
1865        assert(0 && "invalid kind");
1866        return MAX_UNICODE;
1867    }
1868}
1869
1870Py_LOCAL_INLINE(Py_UCS4)
1871align_maxchar(Py_UCS4 maxchar)
1872{
1873    if (maxchar <= 127)
1874        return 127;
1875    else if (maxchar <= 255)
1876        return 255;
1877    else if (maxchar <= 65535)
1878        return 65535;
1879    else
1880        return MAX_UNICODE;
1881}
1882
1883static PyObject*
1884_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1885{
1886    PyObject *res;
1887    unsigned char max_char;
1888
1889    if (size == 0) {
1890        Py_INCREF(unicode_empty);
1891        return unicode_empty;
1892    }
1893    assert(size > 0);
1894    if (size == 1)
1895        return get_latin1_char(u[0]);
1896
1897    max_char = ucs1lib_find_max_char(u, u + size);
1898    res = PyUnicode_New(size, max_char);
1899    if (!res)
1900        return NULL;
1901    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1902    assert(_PyUnicode_CheckConsistency(res, 1));
1903    return res;
1904}
1905
1906static PyObject*
1907_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1908{
1909    PyObject *res;
1910    Py_UCS2 max_char;
1911
1912    if (size == 0) {
1913        Py_INCREF(unicode_empty);
1914        return unicode_empty;
1915    }
1916    assert(size > 0);
1917    if (size == 1) {
1918        Py_UCS4 ch = u[0];
1919        if (ch < 256)
1920            return get_latin1_char((unsigned char)ch);
1921
1922        res = PyUnicode_New(1, ch);
1923        if (res == NULL)
1924            return NULL;
1925        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1926        assert(_PyUnicode_CheckConsistency(res, 1));
1927        return res;
1928    }
1929
1930    max_char = ucs2lib_find_max_char(u, u + size);
1931    res = PyUnicode_New(size, max_char);
1932    if (!res)
1933        return NULL;
1934    if (max_char >= 256)
1935        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1936    else {
1937        _PyUnicode_CONVERT_BYTES(
1938            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1939    }
1940    assert(_PyUnicode_CheckConsistency(res, 1));
1941    return res;
1942}
1943
1944static PyObject*
1945_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1946{
1947    PyObject *res;
1948    Py_UCS4 max_char;
1949
1950    if (size == 0) {
1951        Py_INCREF(unicode_empty);
1952        return unicode_empty;
1953    }
1954    assert(size > 0);
1955    if (size == 1) {
1956        Py_UCS4 ch = u[0];
1957        if (ch < 256)
1958            return get_latin1_char((unsigned char)ch);
1959
1960        res = PyUnicode_New(1, ch);
1961        if (res == NULL)
1962            return NULL;
1963        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1964        assert(_PyUnicode_CheckConsistency(res, 1));
1965        return res;
1966    }
1967
1968    max_char = ucs4lib_find_max_char(u, u + size);
1969    res = PyUnicode_New(size, max_char);
1970    if (!res)
1971        return NULL;
1972    if (max_char < 256)
1973        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1974                                 PyUnicode_1BYTE_DATA(res));
1975    else if (max_char < 0x10000)
1976        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1977                                 PyUnicode_2BYTE_DATA(res));
1978    else
1979        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1980    assert(_PyUnicode_CheckConsistency(res, 1));
1981    return res;
1982}
1983
1984PyObject*
1985PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1986{
1987    if (size < 0) {
1988        PyErr_SetString(PyExc_ValueError, "size must be positive");
1989        return NULL;
1990    }
1991    switch (kind) {
1992    case PyUnicode_1BYTE_KIND:
1993        return _PyUnicode_FromUCS1(buffer, size);
1994    case PyUnicode_2BYTE_KIND:
1995        return _PyUnicode_FromUCS2(buffer, size);
1996    case PyUnicode_4BYTE_KIND:
1997        return _PyUnicode_FromUCS4(buffer, size);
1998    default:
1999        PyErr_SetString(PyExc_SystemError, "invalid kind");
2000        return NULL;
2001    }
2002}
2003
2004Py_UCS4
2005_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2006{
2007    enum PyUnicode_Kind kind;
2008    void *startptr, *endptr;
2009
2010    assert(PyUnicode_IS_READY(unicode));
2011    assert(0 <= start);
2012    assert(end <= PyUnicode_GET_LENGTH(unicode));
2013    assert(start <= end);
2014
2015    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2016        return PyUnicode_MAX_CHAR_VALUE(unicode);
2017
2018    if (start == end)
2019        return 127;
2020
2021    if (PyUnicode_IS_ASCII(unicode))
2022        return 127;
2023
2024    kind = PyUnicode_KIND(unicode);
2025    startptr = PyUnicode_DATA(unicode);
2026    endptr = (char *)startptr + end * kind;
2027    startptr = (char *)startptr + start * kind;
2028    switch(kind) {
2029    case PyUnicode_1BYTE_KIND:
2030        return ucs1lib_find_max_char(startptr, endptr);
2031    case PyUnicode_2BYTE_KIND:
2032        return ucs2lib_find_max_char(startptr, endptr);
2033    case PyUnicode_4BYTE_KIND:
2034        return ucs4lib_find_max_char(startptr, endptr);
2035    default:
2036        assert(0);
2037        return 0;
2038    }
2039}
2040
2041/* Ensure that a string uses the most efficient storage, if it is not the
2042   case: create a new string with of the right kind. Write NULL into *p_unicode
2043   on error. */
2044static void
2045unicode_adjust_maxchar(PyObject **p_unicode)
2046{
2047    PyObject *unicode, *copy;
2048    Py_UCS4 max_char;
2049    Py_ssize_t len;
2050    unsigned int kind;
2051
2052    assert(p_unicode != NULL);
2053    unicode = *p_unicode;
2054    assert(PyUnicode_IS_READY(unicode));
2055    if (PyUnicode_IS_ASCII(unicode))
2056        return;
2057
2058    len = PyUnicode_GET_LENGTH(unicode);
2059    kind = PyUnicode_KIND(unicode);
2060    if (kind == PyUnicode_1BYTE_KIND) {
2061        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2062        max_char = ucs1lib_find_max_char(u, u + len);
2063        if (max_char >= 128)
2064            return;
2065    }
2066    else if (kind == PyUnicode_2BYTE_KIND) {
2067        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2068        max_char = ucs2lib_find_max_char(u, u + len);
2069        if (max_char >= 256)
2070            return;
2071    }
2072    else {
2073        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2074        assert(kind == PyUnicode_4BYTE_KIND);
2075        max_char = ucs4lib_find_max_char(u, u + len);
2076        if (max_char >= 0x10000)
2077            return;
2078    }
2079    copy = PyUnicode_New(len, max_char);
2080    if (copy != NULL)
2081        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2082    Py_DECREF(unicode);
2083    *p_unicode = copy;
2084}
2085
2086PyObject*
2087_PyUnicode_Copy(PyObject *unicode)
2088{
2089    Py_ssize_t length;
2090    PyObject *copy;
2091
2092    if (!PyUnicode_Check(unicode)) {
2093        PyErr_BadInternalCall();
2094        return NULL;
2095    }
2096    if (PyUnicode_READY(unicode) == -1)
2097        return NULL;
2098
2099    length = PyUnicode_GET_LENGTH(unicode);
2100    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2101    if (!copy)
2102        return NULL;
2103    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2104
2105    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2106              length * PyUnicode_KIND(unicode));
2107    assert(_PyUnicode_CheckConsistency(copy, 1));
2108    return copy;
2109}
2110
2111
2112/* Widen Unicode objects to larger buffers. Don't write terminating null
2113   character. Return NULL on error. */
2114
2115void*
2116_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2117{
2118    Py_ssize_t len;
2119    void *result;
2120    unsigned int skind;
2121
2122    if (PyUnicode_READY(s) == -1)
2123        return NULL;
2124
2125    len = PyUnicode_GET_LENGTH(s);
2126    skind = PyUnicode_KIND(s);
2127    if (skind >= kind) {
2128        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2129        return NULL;
2130    }
2131    switch (kind) {
2132    case PyUnicode_2BYTE_KIND:
2133        result = PyMem_Malloc(len * sizeof(Py_UCS2));
2134        if (!result)
2135            return PyErr_NoMemory();
2136        assert(skind == PyUnicode_1BYTE_KIND);
2137        _PyUnicode_CONVERT_BYTES(
2138            Py_UCS1, Py_UCS2,
2139            PyUnicode_1BYTE_DATA(s),
2140            PyUnicode_1BYTE_DATA(s) + len,
2141            result);
2142        return result;
2143    case PyUnicode_4BYTE_KIND:
2144        result = PyMem_Malloc(len * sizeof(Py_UCS4));
2145        if (!result)
2146            return PyErr_NoMemory();
2147        if (skind == PyUnicode_2BYTE_KIND) {
2148            _PyUnicode_CONVERT_BYTES(
2149                Py_UCS2, Py_UCS4,
2150                PyUnicode_2BYTE_DATA(s),
2151                PyUnicode_2BYTE_DATA(s) + len,
2152                result);
2153        }
2154        else {
2155            assert(skind == PyUnicode_1BYTE_KIND);
2156            _PyUnicode_CONVERT_BYTES(
2157                Py_UCS1, Py_UCS4,
2158                PyUnicode_1BYTE_DATA(s),
2159                PyUnicode_1BYTE_DATA(s) + len,
2160                result);
2161        }
2162        return result;
2163    default:
2164        break;
2165    }
2166    PyErr_SetString(PyExc_SystemError, "invalid kind");
2167    return NULL;
2168}
2169
2170static Py_UCS4*
2171as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2172        int copy_null)
2173{
2174    int kind;
2175    void *data;
2176    Py_ssize_t len, targetlen;
2177    if (PyUnicode_READY(string) == -1)
2178        return NULL;
2179    kind = PyUnicode_KIND(string);
2180    data = PyUnicode_DATA(string);
2181    len = PyUnicode_GET_LENGTH(string);
2182    targetlen = len;
2183    if (copy_null)
2184        targetlen++;
2185    if (!target) {
2186        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2187            PyErr_NoMemory();
2188            return NULL;
2189        }
2190        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2191        if (!target) {
2192            PyErr_NoMemory();
2193            return NULL;
2194        }
2195    }
2196    else {
2197        if (targetsize < targetlen) {
2198            PyErr_Format(PyExc_SystemError,
2199                         "string is longer than the buffer");
2200            if (copy_null && 0 < targetsize)
2201                target[0] = 0;
2202            return NULL;
2203        }
2204    }
2205    if (kind == PyUnicode_1BYTE_KIND) {
2206        Py_UCS1 *start = (Py_UCS1 *) data;
2207        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2208    }
2209    else if (kind == PyUnicode_2BYTE_KIND) {
2210        Py_UCS2 *start = (Py_UCS2 *) data;
2211        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2212    }
2213    else {
2214        assert(kind == PyUnicode_4BYTE_KIND);
2215        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2216    }
2217    if (copy_null)
2218        target[len] = 0;
2219    return target;
2220}
2221
2222Py_UCS4*
2223PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2224                 int copy_null)
2225{
2226    if (target == NULL || targetsize < 0) {
2227        PyErr_BadInternalCall();
2228        return NULL;
2229    }
2230    return as_ucs4(string, target, targetsize, copy_null);
2231}
2232
2233Py_UCS4*
2234PyUnicode_AsUCS4Copy(PyObject *string)
2235{
2236    return as_ucs4(string, NULL, 0, 1);
2237}
2238
2239#ifdef HAVE_WCHAR_H
2240
2241PyObject *
2242PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2243{
2244    if (w == NULL) {
2245        if (size == 0) {
2246            Py_INCREF(unicode_empty);
2247            return unicode_empty;
2248        }
2249        PyErr_BadInternalCall();
2250        return NULL;
2251    }
2252
2253    if (size == -1) {
2254        size = wcslen(w);
2255    }
2256
2257    return PyUnicode_FromUnicode(w, size);
2258}
2259
2260#endif /* HAVE_WCHAR_H */
2261
2262static void
2263makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2264        char c)
2265{
2266    *fmt++ = '%';
2267    if (longflag)
2268        *fmt++ = 'l';
2269    else if (longlongflag) {
2270        /* longlongflag should only ever be nonzero on machines with
2271           HAVE_LONG_LONG defined */
2272#ifdef HAVE_LONG_LONG
2273        char *f = PY_FORMAT_LONG_LONG;
2274        while (*f)
2275            *fmt++ = *f++;
2276#else
2277        /* we shouldn't ever get here */
2278        assert(0);
2279        *fmt++ = 'l';
2280#endif
2281    }
2282    else if (size_tflag) {
2283        char *f = PY_FORMAT_SIZE_T;
2284        while (*f)
2285            *fmt++ = *f++;
2286    }
2287    *fmt++ = c;
2288    *fmt = '\0';
2289}
2290
2291/* maximum number of characters required for output of %lld or %p.
2292   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2293   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2294#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2295
2296static const char*
2297unicode_fromformat_arg(_PyUnicodeWriter *writer,
2298                       const char *f, va_list *vargs)
2299{
2300    const char *p;
2301    Py_ssize_t len;
2302    int zeropad;
2303    int width;
2304    int precision;
2305    int longflag;
2306    int longlongflag;
2307    int size_tflag;
2308    int fill;
2309
2310    p = f;
2311    f++;
2312    zeropad = 0;
2313    if (*f == '0') {
2314        zeropad = 1;
2315        f++;
2316    }
2317
2318    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2319    width = 0;
2320    while (Py_ISDIGIT((unsigned)*f)) {
2321        if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2322            PyErr_SetString(PyExc_ValueError,
2323                            "width too big");
2324            return NULL;
2325        }
2326        width = (width*10) + (*f - '0');
2327        f++;
2328    }
2329    precision = 0;
2330    if (*f == '.') {
2331        f++;
2332        while (Py_ISDIGIT((unsigned)*f)) {
2333            if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2334                PyErr_SetString(PyExc_ValueError,
2335                                "precision too big");
2336                return NULL;
2337            }
2338            precision = (precision*10) + (*f - '0');
2339            f++;
2340        }
2341        if (*f == '%') {
2342            /* "%.3%s" => f points to "3" */
2343            f--;
2344        }
2345    }
2346    if (*f == '\0') {
2347        /* bogus format "%.123" => go backward, f points to "3" */
2348        f--;
2349    }
2350
2351    /* Handle %ld, %lu, %lld and %llu. */
2352    longflag = 0;
2353    longlongflag = 0;
2354    size_tflag = 0;
2355    if (*f == 'l') {
2356        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2357            longflag = 1;
2358            ++f;
2359        }
2360#ifdef HAVE_LONG_LONG
2361        else if (f[1] == 'l' &&
2362                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2363            longlongflag = 1;
2364            f += 2;
2365        }
2366#endif
2367    }
2368    /* handle the size_t flag. */
2369    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2370        size_tflag = 1;
2371        ++f;
2372    }
2373
2374    if (f[1] == '\0')
2375        writer->overallocate = 0;
2376
2377    switch (*f) {
2378    case 'c':
2379    {
2380        int ordinal = va_arg(*vargs, int);
2381        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2382            PyErr_SetString(PyExc_ValueError,
2383                            "character argument not in range(0x110000)");
2384            return NULL;
2385        }
2386        if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2387            return NULL;
2388        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2389        writer->pos++;
2390        break;
2391    }
2392
2393    case 'i':
2394    case 'd':
2395    case 'u':
2396    case 'x':
2397    {
2398        /* used by sprintf */
2399        char fmt[10]; /* should be enough for "%0lld\0" */
2400        char buffer[MAX_LONG_LONG_CHARS];
2401
2402        if (*f == 'u') {
2403            makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2404
2405            if (longflag)
2406                len = sprintf(buffer, fmt,
2407                        va_arg(*vargs, unsigned long));
2408#ifdef HAVE_LONG_LONG
2409            else if (longlongflag)
2410                len = sprintf(buffer, fmt,
2411                        va_arg(*vargs, unsigned PY_LONG_LONG));
2412#endif
2413            else if (size_tflag)
2414                len = sprintf(buffer, fmt,
2415                        va_arg(*vargs, size_t));
2416            else
2417                len = sprintf(buffer, fmt,
2418                        va_arg(*vargs, unsigned int));
2419        }
2420        else if (*f == 'x') {
2421            makefmt(fmt, 0, 0, 0, 'x');
2422            len = sprintf(buffer, fmt, va_arg(*vargs, int));
2423        }
2424        else {
2425            makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2426
2427            if (longflag)
2428                len = sprintf(buffer, fmt,
2429                        va_arg(*vargs, long));
2430#ifdef HAVE_LONG_LONG
2431            else if (longlongflag)
2432                len = sprintf(buffer, fmt,
2433                        va_arg(*vargs, PY_LONG_LONG));
2434#endif
2435            else if (size_tflag)
2436                len = sprintf(buffer, fmt,
2437                        va_arg(*vargs, Py_ssize_t));
2438            else
2439                len = sprintf(buffer, fmt,
2440                        va_arg(*vargs, int));
2441        }
2442        assert(len >= 0);
2443
2444        if (precision < len)
2445            precision = len;
2446        if (width > precision) {
2447            Py_UCS4 fillchar;
2448            fill = width - precision;
2449            fillchar = zeropad?'0':' ';
2450            if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2451                return NULL;
2452            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2453                return NULL;
2454            writer->pos += fill;
2455        }
2456        if (precision > len) {
2457            fill = precision - len;
2458            if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2459                return NULL;
2460            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2461                return NULL;
2462            writer->pos += fill;
2463        }
2464        if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
2465            return NULL;
2466        break;
2467    }
2468
2469    case 'p':
2470    {
2471        char number[MAX_LONG_LONG_CHARS];
2472
2473        len = sprintf(number, "%p", va_arg(*vargs, void*));
2474        assert(len >= 0);
2475
2476        /* %p is ill-defined:  ensure leading 0x. */
2477        if (number[1] == 'X')
2478            number[1] = 'x';
2479        else if (number[1] != 'x') {
2480            memmove(number + 2, number,
2481                    strlen(number) + 1);
2482            number[0] = '0';
2483            number[1] = 'x';
2484            len += 2;
2485        }
2486
2487        if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2488            return NULL;
2489        break;
2490    }
2491
2492    case 's':
2493    {
2494        /* UTF-8 */
2495        const char *s = va_arg(*vargs, const char*);
2496        PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2497        if (!str)
2498            return NULL;
2499        if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2500            Py_DECREF(str);
2501            return NULL;
2502        }
2503        Py_DECREF(str);
2504        break;
2505    }
2506
2507    case 'U':
2508    {
2509        PyObject *obj = va_arg(*vargs, PyObject *);
2510        assert(obj && _PyUnicode_CHECK(obj));
2511
2512        if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2513            return NULL;
2514        break;
2515    }
2516
2517    case 'V':
2518    {
2519        PyObject *obj = va_arg(*vargs, PyObject *);
2520        const char *str = va_arg(*vargs, const char *);
2521        PyObject *str_obj;
2522        assert(obj || str);
2523        if (obj) {
2524            assert(_PyUnicode_CHECK(obj));
2525            if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2526                return NULL;
2527        }
2528        else {
2529            str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2530            if (!str_obj)
2531                return NULL;
2532            if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2533                Py_DECREF(str_obj);
2534                return NULL;
2535            }
2536            Py_DECREF(str_obj);
2537        }
2538        break;
2539    }
2540
2541    case 'S':
2542    {
2543        PyObject *obj = va_arg(*vargs, PyObject *);
2544        PyObject *str;
2545        assert(obj);
2546        str = PyObject_Str(obj);
2547        if (!str)
2548            return NULL;
2549        if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2550            Py_DECREF(str);
2551            return NULL;
2552        }
2553        Py_DECREF(str);
2554        break;
2555    }
2556
2557    case 'R':
2558    {
2559        PyObject *obj = va_arg(*vargs, PyObject *);
2560        PyObject *repr;
2561        assert(obj);
2562        repr = PyObject_Repr(obj);
2563        if (!repr)
2564            return NULL;
2565        if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2566            Py_DECREF(repr);
2567            return NULL;
2568        }
2569        Py_DECREF(repr);
2570        break;
2571    }
2572
2573    case 'A':
2574    {
2575        PyObject *obj = va_arg(*vargs, PyObject *);
2576        PyObject *ascii;
2577        assert(obj);
2578        ascii = PyObject_ASCII(obj);
2579        if (!ascii)
2580            return NULL;
2581        if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2582            Py_DECREF(ascii);
2583            return NULL;
2584        }
2585        Py_DECREF(ascii);
2586        break;
2587    }
2588
2589    case '%':
2590        if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2591            return NULL;
2592        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2593        writer->pos++;
2594        break;
2595
2596    default:
2597        /* if we stumble upon an unknown formatting code, copy the rest
2598           of the format string to the output string. (we cannot just
2599           skip the code, since there's no way to know what's in the
2600           argument list) */
2601        len = strlen(p);
2602        if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2603            return NULL;
2604        f = p+len;
2605        return f;
2606    }
2607
2608    f++;
2609    return f;
2610}
2611
2612PyObject *
2613PyUnicode_FromFormatV(const char *format, va_list vargs)
2614{
2615    va_list vargs2;
2616    const char *f;
2617    _PyUnicodeWriter writer;
2618
2619    _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2620
2621    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2622       Copy it to be able to pass a reference to a subfunction. */
2623    Py_VA_COPY(vargs2, vargs);
2624
2625    for (f = format; *f; ) {
2626        if (*f == '%') {
2627            f = unicode_fromformat_arg(&writer, f, &vargs2);
2628            if (f == NULL)
2629                goto fail;
2630        }
2631        else {
2632            const char *p;
2633            Py_ssize_t len;
2634
2635            p = f;
2636            do
2637            {
2638                if ((unsigned char)*p > 127) {
2639                    PyErr_Format(PyExc_ValueError,
2640                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2641                        "string, got a non-ASCII byte: 0x%02x",
2642                        (unsigned char)*p);
2643                    return NULL;
2644                }
2645                p++;
2646            }
2647            while (*p != '\0' && *p != '%');
2648            len = p - f;
2649
2650            if (*p == '\0')
2651                writer.overallocate = 0;
2652            if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2653                goto fail;
2654            unicode_write_cstr(writer.buffer, writer.pos, f, len);
2655            writer.pos += len;
2656
2657            f = p;
2658        }
2659    }
2660    return _PyUnicodeWriter_Finish(&writer);
2661
2662  fail:
2663    _PyUnicodeWriter_Dealloc(&writer);
2664    return NULL;
2665}
2666
2667PyObject *
2668PyUnicode_FromFormat(const char *format, ...)
2669{
2670    PyObject* ret;
2671    va_list vargs;
2672
2673#ifdef HAVE_STDARG_PROTOTYPES
2674    va_start(vargs, format);
2675#else
2676    va_start(vargs);
2677#endif
2678    ret = PyUnicode_FromFormatV(format, vargs);
2679    va_end(vargs);
2680    return ret;
2681}
2682
2683#ifdef HAVE_WCHAR_H
2684
2685/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2686   convert a Unicode object to a wide character string.
2687
2688   - If w is NULL: return the number of wide characters (including the null
2689     character) required to convert the unicode object. Ignore size argument.
2690
2691   - Otherwise: return the number of wide characters (excluding the null
2692     character) written into w. Write at most size wide characters (including
2693     the null character). */
2694static Py_ssize_t
2695unicode_aswidechar(PyObject *unicode,
2696                   wchar_t *w,
2697                   Py_ssize_t size)
2698{
2699    Py_ssize_t res;
2700    const wchar_t *wstr;
2701
2702    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2703    if (wstr == NULL)
2704        return -1;
2705
2706    if (w != NULL) {
2707        if (size > res)
2708            size = res + 1;
2709        else
2710            res = size;
2711        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2712        return res;
2713    }
2714    else
2715        return res + 1;
2716}
2717
2718Py_ssize_t
2719PyUnicode_AsWideChar(PyObject *unicode,
2720                     wchar_t *w,
2721                     Py_ssize_t size)
2722{
2723    if (unicode == NULL) {
2724        PyErr_BadInternalCall();
2725        return -1;
2726    }
2727    return unicode_aswidechar(unicode, w, size);
2728}
2729
2730wchar_t*
2731PyUnicode_AsWideCharString(PyObject *unicode,
2732                           Py_ssize_t *size)
2733{
2734    wchar_t* buffer;
2735    Py_ssize_t buflen;
2736
2737    if (unicode == NULL) {
2738        PyErr_BadInternalCall();
2739        return NULL;
2740    }
2741
2742    buflen = unicode_aswidechar(unicode, NULL, 0);
2743    if (buflen == -1)
2744        return NULL;
2745    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2746        PyErr_NoMemory();
2747        return NULL;
2748    }
2749
2750    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2751    if (buffer == NULL) {
2752        PyErr_NoMemory();
2753        return NULL;
2754    }
2755    buflen = unicode_aswidechar(unicode, buffer, buflen);
2756    if (buflen == -1) {
2757        PyMem_FREE(buffer);
2758        return NULL;
2759    }
2760    if (size != NULL)
2761        *size = buflen;
2762    return buffer;
2763}
2764
2765#endif /* HAVE_WCHAR_H */
2766
2767PyObject *
2768PyUnicode_FromOrdinal(int ordinal)
2769{
2770    PyObject *v;
2771    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2772        PyErr_SetString(PyExc_ValueError,
2773                        "chr() arg not in range(0x110000)");
2774        return NULL;
2775    }
2776
2777    if (ordinal < 256)
2778        return get_latin1_char(ordinal);
2779
2780    v = PyUnicode_New(1, ordinal);
2781    if (v == NULL)
2782        return NULL;
2783    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2784    assert(_PyUnicode_CheckConsistency(v, 1));
2785    return v;
2786}
2787
2788PyObject *
2789PyUnicode_FromObject(register PyObject *obj)
2790{
2791    /* XXX Perhaps we should make this API an alias of
2792       PyObject_Str() instead ?! */
2793    if (PyUnicode_CheckExact(obj)) {
2794        if (PyUnicode_READY(obj) == -1)
2795            return NULL;
2796        Py_INCREF(obj);
2797        return obj;
2798    }
2799    if (PyUnicode_Check(obj)) {
2800        /* For a Unicode subtype that's not a Unicode object,
2801           return a true Unicode object with the same data. */
2802        return _PyUnicode_Copy(obj);
2803    }
2804    PyErr_Format(PyExc_TypeError,
2805                 "Can't convert '%.100s' object to str implicitly",
2806                 Py_TYPE(obj)->tp_name);
2807    return NULL;
2808}
2809
2810PyObject *
2811PyUnicode_FromEncodedObject(register PyObject *obj,
2812                            const char *encoding,
2813                            const char *errors)
2814{
2815    Py_buffer buffer;
2816    PyObject *v;
2817
2818    if (obj == NULL) {
2819        PyErr_BadInternalCall();
2820        return NULL;
2821    }
2822
2823    /* Decoding bytes objects is the most common case and should be fast */
2824    if (PyBytes_Check(obj)) {
2825        if (PyBytes_GET_SIZE(obj) == 0) {
2826            Py_INCREF(unicode_empty);
2827            v = unicode_empty;
2828        }
2829        else {
2830            v = PyUnicode_Decode(
2831                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2832                    encoding, errors);
2833        }
2834        return v;
2835    }
2836
2837    if (PyUnicode_Check(obj)) {
2838        PyErr_SetString(PyExc_TypeError,
2839                        "decoding str is not supported");
2840        return NULL;
2841    }
2842
2843    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2844    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2845        PyErr_Format(PyExc_TypeError,
2846                     "coercing to str: need bytes, bytearray "
2847                     "or buffer-like object, %.80s found",
2848                     Py_TYPE(obj)->tp_name);
2849        return NULL;
2850    }
2851
2852    if (buffer.len == 0) {
2853        Py_INCREF(unicode_empty);
2854        v = unicode_empty;
2855    }
2856    else
2857        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2858
2859    PyBuffer_Release(&buffer);
2860    return v;
2861}
2862
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result is written NUL-terminated into lower, which has room for
   lower_len bytes.  Return 1 on success, 0 on error (the normalized name
   plus its terminating NUL does not fit into lower_len bytes). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* sizeof() counts the terminating NUL as well */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    /* Without this guard lower_len - 1 would wrap around below and the
       terminator would be written outside of the buffer */
    if (lower_len == 0)
        return 0;
    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];  /* last slot, reserved for the NUL */
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2899
2900PyObject *
2901PyUnicode_Decode(const char *s,
2902                 Py_ssize_t size,
2903                 const char *encoding,
2904                 const char *errors)
2905{
2906    PyObject *buffer = NULL, *unicode;
2907    Py_buffer info;
2908    char lower[11];  /* Enough for any encoding shortcut */
2909
2910    /* Shortcuts for common default encodings */
2911    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2912        if ((strcmp(lower, "utf-8") == 0) ||
2913            (strcmp(lower, "utf8") == 0))
2914            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2915        else if ((strcmp(lower, "latin-1") == 0) ||
2916                 (strcmp(lower, "latin1") == 0) ||
2917                 (strcmp(lower, "iso-8859-1") == 0))
2918            return PyUnicode_DecodeLatin1(s, size, errors);
2919#ifdef HAVE_MBCS
2920        else if (strcmp(lower, "mbcs") == 0)
2921            return PyUnicode_DecodeMBCS(s, size, errors);
2922#endif
2923        else if (strcmp(lower, "ascii") == 0)
2924            return PyUnicode_DecodeASCII(s, size, errors);
2925        else if (strcmp(lower, "utf-16") == 0)
2926            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2927        else if (strcmp(lower, "utf-32") == 0)
2928            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2929    }
2930
2931    /* Decode via the codec registry */
2932    buffer = NULL;
2933    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2934        goto onError;
2935    buffer = PyMemoryView_FromBuffer(&info);
2936    if (buffer == NULL)
2937        goto onError;
2938    unicode = PyCodec_Decode(buffer, encoding, errors);
2939    if (unicode == NULL)
2940        goto onError;
2941    if (!PyUnicode_Check(unicode)) {
2942        PyErr_Format(PyExc_TypeError,
2943                     "decoder did not return a str object (type=%.400s)",
2944                     Py_TYPE(unicode)->tp_name);
2945        Py_DECREF(unicode);
2946        goto onError;
2947    }
2948    Py_DECREF(buffer);
2949    return unicode_result(unicode);
2950
2951  onError:
2952    Py_XDECREF(buffer);
2953    return NULL;
2954}
2955
2956PyObject *
2957PyUnicode_AsDecodedObject(PyObject *unicode,
2958                          const char *encoding,
2959                          const char *errors)
2960{
2961    PyObject *v;
2962
2963    if (!PyUnicode_Check(unicode)) {
2964        PyErr_BadArgument();
2965        goto onError;
2966    }
2967
2968    if (encoding == NULL)
2969        encoding = PyUnicode_GetDefaultEncoding();
2970
2971    /* Decode via the codec registry */
2972    v = PyCodec_Decode(unicode, encoding, errors);
2973    if (v == NULL)
2974        goto onError;
2975    return unicode_result(v);
2976
2977  onError:
2978    return NULL;
2979}
2980
2981PyObject *
2982PyUnicode_AsDecodedUnicode(PyObject *unicode,
2983                           const char *encoding,
2984                           const char *errors)
2985{
2986    PyObject *v;
2987
2988    if (!PyUnicode_Check(unicode)) {
2989        PyErr_BadArgument();
2990        goto onError;
2991    }
2992
2993    if (encoding == NULL)
2994        encoding = PyUnicode_GetDefaultEncoding();
2995
2996    /* Decode via the codec registry */
2997    v = PyCodec_Decode(unicode, encoding, errors);
2998    if (v == NULL)
2999        goto onError;
3000    if (!PyUnicode_Check(v)) {
3001        PyErr_Format(PyExc_TypeError,
3002                     "decoder did not return a str object (type=%.400s)",
3003                     Py_TYPE(v)->tp_name);
3004        Py_DECREF(v);
3005        goto onError;
3006    }
3007    return unicode_result(v);
3008
3009  onError:
3010    return NULL;
3011}
3012
3013PyObject *
3014PyUnicode_Encode(const Py_UNICODE *s,
3015                 Py_ssize_t size,
3016                 const char *encoding,
3017                 const char *errors)
3018{
3019    PyObject *v, *unicode;
3020
3021    unicode = PyUnicode_FromUnicode(s, size);
3022    if (unicode == NULL)
3023        return NULL;
3024    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3025    Py_DECREF(unicode);
3026    return v;
3027}
3028
3029PyObject *
3030PyUnicode_AsEncodedObject(PyObject *unicode,
3031                          const char *encoding,
3032                          const char *errors)
3033{
3034    PyObject *v;
3035
3036    if (!PyUnicode_Check(unicode)) {
3037        PyErr_BadArgument();
3038        goto onError;
3039    }
3040
3041    if (encoding == NULL)
3042        encoding = PyUnicode_GetDefaultEncoding();
3043
3044    /* Encode via the codec registry */
3045    v = PyCodec_Encode(unicode, encoding, errors);
3046    if (v == NULL)
3047        goto onError;
3048    return v;
3049
3050  onError:
3051    return NULL;
3052}
3053
/* Locate the first wide character of wstr that wcstombs() fails to
   convert.

   Each character (or surrogate pair when wchar_t is 16 bits wide) is
   converted on its own; the index of the first one for which wcstombs()
   reports an error is returned.  If every character converts, 0 is
   returned. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];    /* up to one surrogate pair plus the terminator */
#else
    wchar_t unit[2];    /* one character plus the terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *current;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif
    for (cursor = wstr; *cursor != L'\0'; ) {
        /* Remember where this character starts before advancing */
        current = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3100
3101static int
3102locale_error_handler(const char *errors, int *surrogateescape)
3103{
3104    if (errors == NULL) {
3105        *surrogateescape = 0;
3106        return 0;
3107    }
3108
3109    if (strcmp(errors, "strict") == 0) {
3110        *surrogateescape = 0;
3111        return 0;
3112    }
3113    if (strcmp(errors, "surrogateescape") == 0) {
3114        *surrogateescape = 1;
3115        return 0;
3116    }
3117    PyErr_Format(PyExc_ValueError,
3118                 "only 'strict' and 'surrogateescape' error handlers "
3119                 "are supported, not '%s'",
3120                 errors);
3121    return -1;
3122}
3123
3124PyObject *
3125PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3126{
3127    Py_ssize_t wlen, wlen2;
3128    wchar_t *wstr;
3129    PyObject *bytes = NULL;
3130    char *errmsg;
3131    PyObject *reason;
3132    PyObject *exc;
3133    size_t error_pos;
3134    int surrogateescape;
3135
3136    if (locale_error_handler(errors, &surrogateescape) < 0)
3137        return NULL;
3138
3139    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3140    if (wstr == NULL)
3141        return NULL;
3142
3143    wlen2 = wcslen(wstr);
3144    if (wlen2 != wlen) {
3145        PyMem_Free(wstr);
3146        PyErr_SetString(PyExc_TypeError, "embedded null character");
3147        return NULL;
3148    }
3149
3150    if (surrogateescape) {
3151        /* locale encoding with surrogateescape */
3152        char *str;
3153
3154        str = _Py_wchar2char(wstr, &error_pos);
3155        if (str == NULL) {
3156            if (error_pos == (size_t)-1) {
3157                PyErr_NoMemory();
3158                PyMem_Free(wstr);
3159                return NULL;
3160            }
3161            else {
3162                goto encode_error;
3163            }
3164        }
3165        PyMem_Free(wstr);
3166
3167        bytes = PyBytes_FromString(str);
3168        PyMem_Free(str);
3169    }
3170    else {
3171        size_t len, len2;
3172
3173        len = wcstombs(NULL, wstr, 0);
3174        if (len == (size_t)-1) {
3175            error_pos = (size_t)-1;
3176            goto encode_error;
3177        }
3178
3179        bytes = PyBytes_FromStringAndSize(NULL, len);
3180        if (bytes == NULL) {
3181            PyMem_Free(wstr);
3182            return NULL;
3183        }
3184
3185        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3186        if (len2 == (size_t)-1 || len2 > len) {
3187            error_pos = (size_t)-1;
3188            goto encode_error;
3189        }
3190        PyMem_Free(wstr);
3191    }
3192    return bytes;
3193
3194encode_error:
3195    errmsg = strerror(errno);
3196    assert(errmsg != NULL);
3197
3198    if (error_pos == (size_t)-1)
3199        error_pos = wcstombs_errorpos(wstr);
3200
3201    PyMem_Free(wstr);
3202    Py_XDECREF(bytes);
3203
3204    if (errmsg != NULL) {
3205        size_t errlen;
3206        wstr = _Py_char2wchar(errmsg, &errlen);
3207        if (wstr != NULL) {
3208            reason = PyUnicode_FromWideChar(wstr, errlen);
3209            PyMem_Free(wstr);
3210        } else
3211            errmsg = NULL;
3212    }
3213    if (errmsg == NULL)
3214        reason = PyUnicode_FromString(
3215            "wcstombs() encountered an unencodable "
3216            "wide character");
3217    if (reason == NULL)
3218        return NULL;
3219
3220    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3221                                "locale", unicode,
3222                                (Py_ssize_t)error_pos,
3223                                (Py_ssize_t)(error_pos+1),
3224                                reason);
3225    Py_DECREF(reason);
3226    if (exc != NULL) {
3227        PyCodec_StrictErrors(exc);
3228        Py_XDECREF(exc);
3229    }
3230    return NULL;
3231}
3232
3233PyObject *
3234PyUnicode_EncodeFSDefault(PyObject *unicode)
3235{
3236#ifdef HAVE_MBCS
3237    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3238#elif defined(__APPLE__)
3239    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3240#else
3241    PyInterpreterState *interp = PyThreadState_GET()->interp;
3242    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3243       cannot use it to encode and decode filenames before it is loaded. Load
3244       the Python codec requires to encode at least its own filename. Use the C
3245       version of the locale codec until the codec registry is initialized and
3246       the Python codec is loaded.
3247
3248       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3249       cannot only rely on it: check also interp->fscodec_initialized for
3250       subinterpreters. */
3251    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3252        return PyUnicode_AsEncodedString(unicode,
3253                                         Py_FileSystemDefaultEncoding,
3254                                         "surrogateescape");
3255    }
3256    else {
3257        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3258    }
3259#endif
3260}
3261
3262PyObject *
3263PyUnicode_AsEncodedString(PyObject *unicode,
3264                          const char *encoding,
3265                          const char *errors)
3266{
3267    PyObject *v;
3268    char lower[11];  /* Enough for any encoding shortcut */
3269
3270    if (!PyUnicode_Check(unicode)) {
3271        PyErr_BadArgument();
3272        return NULL;
3273    }
3274
3275    /* Shortcuts for common default encodings */
3276    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3277        if ((strcmp(lower, "utf-8") == 0) ||
3278            (strcmp(lower, "utf8") == 0))
3279        {
3280            if (errors == NULL || strcmp(errors, "strict") == 0)
3281                return _PyUnicode_AsUTF8String(unicode, NULL);
3282            else
3283                return _PyUnicode_AsUTF8String(unicode, errors);
3284        }
3285        else if ((strcmp(lower, "latin-1") == 0) ||
3286                 (strcmp(lower, "latin1") == 0) ||
3287                 (strcmp(lower, "iso-8859-1") == 0))
3288            return _PyUnicode_AsLatin1String(unicode, errors);
3289#ifdef HAVE_MBCS
3290        else if (strcmp(lower, "mbcs") == 0)
3291            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3292#endif
3293        else if (strcmp(lower, "ascii") == 0)
3294            return _PyUnicode_AsASCIIString(unicode, errors);
3295    }
3296
3297    /* Encode via the codec registry */
3298    v = PyCodec_Encode(unicode, encoding, errors);
3299    if (v == NULL)
3300        return NULL;
3301
3302    /* The normal path */
3303    if (PyBytes_Check(v))
3304        return v;
3305
3306    /* If the codec returns a buffer, raise a warning and convert to bytes */
3307    if (PyByteArray_Check(v)) {
3308        int error;
3309        PyObject *b;
3310
3311        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3312            "encoder %s returned bytearray instead of bytes",
3313            encoding);
3314        if (error) {
3315            Py_DECREF(v);
3316            return NULL;
3317        }
3318
3319        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3320        Py_DECREF(v);
3321        return b;
3322    }
3323
3324    PyErr_Format(PyExc_TypeError,
3325                 "encoder did not return a bytes object (type=%.400s)",
3326                 Py_TYPE(v)->tp_name);
3327    Py_DECREF(v);
3328    return NULL;
3329}
3330
3331PyObject *
3332PyUnicode_AsEncodedUnicode(PyObject *unicode,
3333                           const char *encoding,
3334                           const char *errors)
3335{
3336    PyObject *v;
3337
3338    if (!PyUnicode_Check(unicode)) {
3339        PyErr_BadArgument();
3340        goto onError;
3341    }
3342
3343    if (encoding == NULL)
3344        encoding = PyUnicode_GetDefaultEncoding();
3345
3346    /* Encode via the codec registry */
3347    v = PyCodec_Encode(unicode, encoding, errors);
3348    if (v == NULL)
3349        goto onError;
3350    if (!PyUnicode_Check(v)) {
3351        PyErr_Format(PyExc_TypeError,
3352                     "encoder did not return an str object (type=%.400s)",
3353                     Py_TYPE(v)->tp_name);
3354        Py_DECREF(v);
3355        goto onError;
3356    }
3357    return v;
3358
3359  onError:
3360    return NULL;
3361}
3362
/* Locate the first byte sequence in str (len bytes) that mbrtowc()
   rejects as invalid or incomplete and return its byte offset.

   Returns 0 when no such sequence is found, and always when mbrtowc() is
   not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        consumed = mbrtowc(&wc, (char*)cursor, remaining, &state);
        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            /* Conversion error or incomplete character */
            return cursor - str;
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3393
3394PyObject*
3395PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3396                              const char *errors)
3397{
3398    wchar_t smallbuf[256];
3399    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3400    wchar_t *wstr;
3401    size_t wlen, wlen2;
3402    PyObject *unicode;
3403    int surrogateescape;
3404    size_t error_pos;
3405    char *errmsg;
3406    PyObject *reason, *exc;
3407
3408    if (locale_error_handler(errors, &surrogateescape) < 0)
3409        return NULL;
3410
3411    if (str[len] != '\0' || len != strlen(str)) {
3412        PyErr_SetString(PyExc_TypeError, "embedded null character");
3413        return NULL;
3414    }
3415
3416    if (surrogateescape)
3417    {
3418        wstr = _Py_char2wchar(str, &wlen);
3419        if (wstr == NULL) {
3420            if (wlen == (size_t)-1)
3421                PyErr_NoMemory();
3422            else
3423                PyErr_SetFromErrno(PyExc_OSError);
3424            return NULL;
3425        }
3426
3427        unicode = PyUnicode_FromWideChar(wstr, wlen);
3428        PyMem_Free(wstr);
3429    }
3430    else {
3431#ifndef HAVE_BROKEN_MBSTOWCS
3432        wlen = mbstowcs(NULL, str, 0);
3433#else
3434        wlen = len;
3435#endif
3436        if (wlen == (size_t)-1)
3437            goto decode_error;
3438        if (wlen+1 <= smallbuf_len) {
3439            wstr = smallbuf;
3440        }
3441        else {
3442            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3443                return PyErr_NoMemory();
3444
3445            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3446            if (!wstr)
3447                return PyErr_NoMemory();
3448        }
3449
3450        /* This shouldn't fail now */
3451        wlen2 = mbstowcs(wstr, str, wlen+1);
3452        if (wlen2 == (size_t)-1) {
3453            if (wstr != smallbuf)
3454                PyMem_Free(wstr);
3455            goto decode_error;
3456        }
3457#ifdef HAVE_BROKEN_MBSTOWCS
3458        assert(wlen2 == wlen);
3459#endif
3460        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3461        if (wstr != smallbuf)
3462            PyMem_Free(wstr);
3463    }
3464    return unicode;
3465
3466decode_error:
3467    errmsg = strerror(errno);
3468    assert(errmsg != NULL);
3469
3470    error_pos = mbstowcs_errorpos(str, len);
3471    if (errmsg != NULL) {
3472        size_t errlen;
3473        wstr = _Py_char2wchar(errmsg, &errlen);
3474        if (wstr != NULL) {
3475            reason = PyUnicode_FromWideChar(wstr, errlen);
3476            PyMem_Free(wstr);
3477        } else
3478            errmsg = NULL;
3479    }
3480    if (errmsg == NULL)
3481        reason = PyUnicode_FromString(
3482            "mbstowcs() encountered an invalid multibyte sequence");
3483    if (reason == NULL)
3484        return NULL;
3485
3486    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3487                                "locale", str, len,
3488                                (Py_ssize_t)error_pos,
3489                                (Py_ssize_t)(error_pos+1),
3490                                reason);
3491    Py_DECREF(reason);
3492    if (exc != NULL) {
3493        PyCodec_StrictErrors(exc);
3494        Py_XDECREF(exc);
3495    }
3496    return NULL;
3497}
3498
3499PyObject*
3500PyUnicode_DecodeLocale(const char *str, const char *errors)
3501{
3502    Py_ssize_t size = (Py_ssize_t)strlen(str);
3503    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3504}
3505
3506
3507PyObject*
3508PyUnicode_DecodeFSDefault(const char *s) {
3509    Py_ssize_t size = (Py_ssize_t)strlen(s);
3510    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3511}
3512
3513PyObject*
3514PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3515{
3516#ifdef HAVE_MBCS
3517    return PyUnicode_DecodeMBCS(s, size, NULL);
3518#elif defined(__APPLE__)
3519    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3520#else
3521    PyInterpreterState *interp = PyThreadState_GET()->interp;
3522    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3523       cannot use it to encode and decode filenames before it is loaded. Load
3524       the Python codec requires to encode at least its own filename. Use the C
3525       version of the locale codec until the codec registry is initialized and
3526       the Python codec is loaded.
3527
3528       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3529       cannot only rely on it: check also interp->fscodec_initialized for
3530       subinterpreters. */
3531    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3532        return PyUnicode_Decode(s, size,
3533                                Py_FileSystemDefaultEncoding,
3534                                "surrogateescape");
3535    }
3536    else {
3537        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3538    }
3539#endif
3540}
3541
3542
3543int
3544_PyUnicode_HasNULChars(PyObject* str)
3545{
3546    Py_ssize_t pos;
3547
3548    if (PyUnicode_READY(str) == -1)
3549        return -1;
3550    pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3551                   PyUnicode_GET_LENGTH(str), '\0', 1);
3552    if (pos == -1)
3553        return 0;
3554    else
3555        return 1;
3556}
3557
3558int
3559PyUnicode_FSConverter(PyObject* arg, void* addr)
3560{
3561    PyObject *output = NULL;
3562    Py_ssize_t size;
3563    void *data;
3564    if (arg == NULL) {
3565        Py_DECREF(*(PyObject**)addr);
3566        return 1;
3567    }
3568    if (PyBytes_Check(arg)) {
3569        output = arg;
3570        Py_INCREF(output);
3571    }
3572    else {
3573        arg = PyUnicode_FromObject(arg);
3574        if (!arg)
3575            return 0;
3576        output = PyUnicode_EncodeFSDefault(arg);
3577        Py_DECREF(arg);
3578        if (!output)
3579            return 0;
3580        if (!PyBytes_Check(output)) {
3581            Py_DECREF(output);
3582            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3583            return 0;
3584        }
3585    }
3586    size = PyBytes_GET_SIZE(output);
3587    data = PyBytes_AS_STRING(output);
3588    if (size != strlen(data)) {
3589        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3590        Py_DECREF(output);
3591        return 0;
3592    }
3593    *(PyObject**)addr = output;
3594    return Py_CLEANUP_SUPPORTED;
3595}
3596
3597
3598int
3599PyUnicode_FSDecoder(PyObject* arg, void* addr)
3600{
3601    PyObject *output = NULL;
3602    if (arg == NULL) {
3603        Py_DECREF(*(PyObject**)addr);
3604        return 1;
3605    }
3606    if (PyUnicode_Check(arg)) {
3607        if (PyUnicode_READY(arg) == -1)
3608            return 0;
3609        output = arg;
3610        Py_INCREF(output);
3611    }
3612    else {
3613        arg = PyBytes_FromObject(arg);
3614        if (!arg)
3615            return 0;
3616        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3617                                                  PyBytes_GET_SIZE(arg));
3618        Py_DECREF(arg);
3619        if (!output)
3620            return 0;
3621        if (!PyUnicode_Check(output)) {
3622            Py_DECREF(output);
3623            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3624            return 0;
3625        }
3626    }
3627    if (PyUnicode_READY(output) == -1) {
3628        Py_DECREF(output);
3629        return 0;
3630    }
3631    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3632                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3633        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3634        Py_DECREF(output);
3635        return 0;
3636    }
3637    *(PyObject**)addr = output;
3638    return Py_CLEANUP_SUPPORTED;
3639}
3640
3641
3642char*
3643PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3644{
3645    PyObject *bytes;
3646
3647    if (!PyUnicode_Check(unicode)) {
3648        PyErr_BadArgument();
3649        return NULL;
3650    }
3651    if (PyUnicode_READY(unicode) == -1)
3652        return NULL;
3653
3654    if (PyUnicode_UTF8(unicode) == NULL) {
3655        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3656        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3657        if (bytes == NULL)
3658            return NULL;
3659        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3660        if (_PyUnicode_UTF8(unicode) == NULL) {
3661            Py_DECREF(bytes);
3662            return NULL;
3663        }
3664        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3665        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3666                  PyBytes_AS_STRING(bytes),
3667                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3668        Py_DECREF(bytes);
3669    }
3670
3671    if (psize)
3672        *psize = PyUnicode_UTF8_LENGTH(unicode);
3673    return PyUnicode_UTF8(unicode);
3674}
3675
3676char*
3677PyUnicode_AsUTF8(PyObject *unicode)
3678{
3679    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3680}
3681
3682Py_UNICODE *
3683PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3684{
3685    const unsigned char *one_byte;
3686#if SIZEOF_WCHAR_T == 4
3687    const Py_UCS2 *two_bytes;
3688#else
3689    const Py_UCS4 *four_bytes;
3690    const Py_UCS4 *ucs4_end;
3691    Py_ssize_t num_surrogates;
3692#endif
3693    wchar_t *w;
3694    wchar_t *wchar_end;
3695
3696    if (!PyUnicode_Check(unicode)) {
3697        PyErr_BadArgument();
3698        return NULL;
3699    }
3700    if (_PyUnicode_WSTR(unicode) == NULL) {
3701        /* Non-ASCII compact unicode object */
3702        assert(_PyUnicode_KIND(unicode) != 0);
3703        assert(PyUnicode_IS_READY(unicode));
3704
3705        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3706#if SIZEOF_WCHAR_T == 2
3707            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3708            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3709            num_surrogates = 0;
3710
3711            for (; four_bytes < ucs4_end; ++four_bytes) {
3712                if (*four_bytes > 0xFFFF)
3713                    ++num_surrogates;
3714            }
3715
3716            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3717                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3718            if (!_PyUnicode_WSTR(unicode)) {
3719                PyErr_NoMemory();
3720                return NULL;
3721            }
3722            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3723
3724            w = _PyUnicode_WSTR(unicode);
3725            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3726            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3727            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3728                if (*four_bytes > 0xFFFF) {
3729                    assert(*four_bytes <= MAX_UNICODE);
3730                    /* encode surrogate pair in this case */
3731                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3732                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3733                }
3734                else
3735                    *w = *four_bytes;
3736
3737                if (w > wchar_end) {
3738                    assert(0 && "Miscalculated string end");
3739                }
3740            }
3741            *w = 0;
3742#else
3743            /* sizeof(wchar_t) == 4 */
3744            Py_FatalError("Impossible unicode object state, wstr and str "
3745                          "should share memory already.");
3746            return NULL;
3747#endif
3748        }
3749        else {
3750            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3751                                                  (_PyUnicode_LENGTH(unicode) + 1));
3752            if (!_PyUnicode_WSTR(unicode)) {
3753                PyErr_NoMemory();
3754                return NULL;
3755            }
3756            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3757                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3758            w = _PyUnicode_WSTR(unicode);
3759            wchar_end = w + _PyUnicode_LENGTH(unicode);
3760
3761            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3762                one_byte = PyUnicode_1BYTE_DATA(unicode);
3763                for (; w < wchar_end; ++one_byte, ++w)
3764                    *w = *one_byte;
3765                /* null-terminate the wstr */
3766                *w = 0;
3767            }
3768            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3769#if SIZEOF_WCHAR_T == 4
3770                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3771                for (; w < wchar_end; ++two_bytes, ++w)
3772                    *w = *two_bytes;
3773                /* null-terminate the wstr */
3774                *w = 0;
3775#else
3776                /* sizeof(wchar_t) == 2 */
3777                PyObject_FREE(_PyUnicode_WSTR(unicode));
3778                _PyUnicode_WSTR(unicode) = NULL;
3779                Py_FatalError("Impossible unicode object state, wstr "
3780                              "and str should share memory already.");
3781                return NULL;
3782#endif
3783            }
3784            else {
3785                assert(0 && "This should never happen.");
3786            }
3787        }
3788    }
3789    if (size != NULL)
3790        *size = PyUnicode_WSTR_LENGTH(unicode);
3791    return _PyUnicode_WSTR(unicode);
3792}
3793
3794Py_UNICODE *
3795PyUnicode_AsUnicode(PyObject *unicode)
3796{
3797    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3798}
3799
3800
3801Py_ssize_t
3802PyUnicode_GetSize(PyObject *unicode)
3803{
3804    if (!PyUnicode_Check(unicode)) {
3805        PyErr_BadArgument();
3806        goto onError;
3807    }
3808    return PyUnicode_GET_SIZE(unicode);
3809
3810  onError:
3811    return -1;
3812}
3813
3814Py_ssize_t
3815PyUnicode_GetLength(PyObject *unicode)
3816{
3817    if (!PyUnicode_Check(unicode)) {
3818        PyErr_BadArgument();
3819        return -1;
3820    }
3821    if (PyUnicode_READY(unicode) == -1)
3822        return -1;
3823    return PyUnicode_GET_LENGTH(unicode);
3824}
3825
3826Py_UCS4
3827PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3828{
3829    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3830        PyErr_BadArgument();
3831        return (Py_UCS4)-1;
3832    }
3833    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3834        PyErr_SetString(PyExc_IndexError, "string index out of range");
3835        return (Py_UCS4)-1;
3836    }
3837    return PyUnicode_READ_CHAR(unicode, index);
3838}
3839
3840int
3841PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3842{
3843    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3844        PyErr_BadArgument();
3845        return -1;
3846    }
3847    assert(PyUnicode_IS_READY(unicode));
3848    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3849        PyErr_SetString(PyExc_IndexError, "string index out of range");
3850        return -1;
3851    }
3852    if (unicode_check_modifiable(unicode))
3853        return -1;
3854    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3855        PyErr_SetString(PyExc_ValueError, "character out of range");
3856        return -1;
3857    }
3858    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3859                    index, ch);
3860    return 0;
3861}
3862
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3868
3869/* create or adjust a UnicodeDecodeError */
3870static void
3871make_decode_exception(PyObject **exceptionObject,
3872                      const char *encoding,
3873                      const char *input, Py_ssize_t length,
3874                      Py_ssize_t startpos, Py_ssize_t endpos,
3875                      const char *reason)
3876{
3877    if (*exceptionObject == NULL) {
3878        *exceptionObject = PyUnicodeDecodeError_Create(
3879            encoding, input, length, startpos, endpos, reason);
3880    }
3881    else {
3882        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3883            goto onError;
3884        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3885            goto onError;
3886        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3887            goto onError;
3888    }
3889    return;
3890
3891onError:
3892    Py_DECREF(*exceptionObject);
3893    *exceptionObject = NULL;
3894}
3895
#ifdef HAVE_MBCS
/* error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors/errorHandler: name of the error handler and a cache for the
       looked-up callable (filled in on first use).
   encoding/reason: used to create or update *exceptionObject.
   input/inend: in/out; re-pointed at the bytes object stored on the
       exception, which the callback may have replaced.
   startinpos/endinpos: offsets of the offending input range; *endinpos
       is updated to the position returned by the callback.
   inptr: out; set to the input position where decoding should resume.
   output/outpos: wchar-kind output object (resized if needed) and the
       current write offset, advanced past the replacement.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not read a non-bytes object through the PyBytes_* macros
           below: release our reference and report the error. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4002
4003static int
4004unicode_decode_call_errorhandler_writer(
4005    const char *errors, PyObject **errorHandler,
4006    const char *encoding, const char *reason,
4007    const char **input, const char **inend, Py_ssize_t *startinpos,
4008    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4009    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4010{
4011    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4012
4013    PyObject *restuple = NULL;
4014    PyObject *repunicode = NULL;
4015    Py_ssize_t insize;
4016    Py_ssize_t newpos;
4017    PyObject *inputobj = NULL;
4018
4019    if (*errorHandler == NULL) {
4020        *errorHandler = PyCodec_LookupError(errors);
4021        if (*errorHandler == NULL)
4022            goto onError;
4023    }
4024
4025    make_decode_exception(exceptionObject,
4026        encoding,
4027        *input, *inend - *input,
4028        *startinpos, *endinpos,
4029        reason);
4030    if (*exceptionObject == NULL)
4031        goto onError;
4032
4033    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4034    if (restuple == NULL)
4035        goto onError;
4036    if (!PyTuple_Check(restuple)) {
4037        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4038        goto onError;
4039    }
4040    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4041        goto onError;
4042
4043    /* Copy back the bytes variables, which might have been modified by the
4044       callback */
4045    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4046    if (!inputobj)
4047        goto onError;
4048    if (!PyBytes_Check(inputobj)) {
4049        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4050    }
4051    *input = PyBytes_AS_STRING(inputobj);
4052    insize = PyBytes_GET_SIZE(inputobj);
4053    *inend = *input + insize;
4054    /* we can DECREF safely, as the exception has another reference,
4055       so the object won't go away. */
4056    Py_DECREF(inputobj);
4057
4058    if (newpos<0)
4059        newpos = insize+newpos;
4060    if (newpos<0 || newpos>insize) {
4061        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4062        goto onError;
4063    }
4064
4065    writer->overallocate = 1;
4066    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4067        return
4068
4069    *endinpos = newpos;
4070    *inptr = *input + newpos;
4071
4072    /* we made it! */
4073    Py_XDECREF(restuple);
4074    return 0;
4075
4076  onError:
4077    Py_XDECREF(restuple);
4078    return -1;
4079}
4080
4081/* --- UTF-7 Codec -------------------------------------------------------- */
4082
4083/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4084
/* Three simple macros defining base-64.
 *
 * Each macro mentions its argument several times in the expansion, so
 * arguments must be free of side effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62.  Anything
 * else yields 63, so the result is only meaningful when IS_BASE64(c)
 * holds (in which case the remaining character is '/'). */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
 * Higher bits of n are masked off before indexing the alphabet. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Bytes above 127 never decode directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4115
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table has one entry per ASCII code (0..127) and is indexed by the
 * character value; the comment above each row names the 16 characters
 * that the row covers.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the character to test; only values 1..127 can be direct
 *            (the range test also keeps the table index in bounds).
 * directO  : non-zero to emit Set O characters (category 1) as-is.
 * directWS : non-zero to emit whitespace (category 2) as-is.
 * Set D characters (category 0) are always direct.  The argument c is
 * mentioned several times in the expansion, so it must have no side
 * effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4161
4162PyObject *
4163PyUnicode_DecodeUTF7(const char *s,
4164                     Py_ssize_t size,
4165                     const char *errors)
4166{
4167    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4168}
4169
4170/* The decoder.  The only state we preserve is our read position,
4171 * i.e. how many characters we have consumed.  So if we end in the
4172 * middle of a shift sequence we have to back off the read position
4173 * and the output to the beginning of the sequence, otherwise we lose
4174 * all the shift state (seen bits, number of bits seen, high
4175 * surrogate). */
4176
4177PyObject *
4178PyUnicode_DecodeUTF7Stateful(const char *s,
4179                             Py_ssize_t size,
4180                             const char *errors,
4181                             Py_ssize_t *consumed)
4182{
4183    const char *starts = s;
4184    Py_ssize_t startinpos;
4185    Py_ssize_t endinpos;
4186    const char *e;
4187    _PyUnicodeWriter writer;
4188    const char *errmsg = "";
4189    int inShift = 0;
4190    Py_ssize_t shiftOutStart;
4191    unsigned int base64bits = 0;
4192    unsigned long base64buffer = 0;
4193    Py_UCS4 surrogate = 0;
4194    PyObject *errorHandler = NULL;
4195    PyObject *exc = NULL;
4196
4197    if (size == 0) {
4198        if (consumed)
4199            *consumed = 0;
4200        Py_INCREF(unicode_empty);
4201        return unicode_empty;
4202    }
4203
4204    /* Start off assuming it's all ASCII. Widen later as necessary. */
4205    _PyUnicodeWriter_Init(&writer, 0);
4206    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4207        goto onError;
4208
4209    shiftOutStart = 0;
4210    e = s + size;
4211
4212    while (s < e) {
4213        Py_UCS4 ch;
4214      restart:
4215        ch = (unsigned char) *s;
4216
4217        if (inShift) { /* in a base-64 section */
4218            if (IS_BASE64(ch)) { /* consume a base-64 character */
4219                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4220                base64bits += 6;
4221                s++;
4222                if (base64bits >= 16) {
4223                    /* we have enough bits for a UTF-16 value */
4224                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4225                    base64bits -= 16;
4226                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4227                    if (surrogate) {
4228                        /* expecting a second surrogate */
4229                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4230                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4231                            if (_PyUnicodeWriter_Prepare(&writer, 1, ch2) == -1)
4232                                goto onError;
4233                            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch2);
4234                            writer.pos++;
4235                            surrogate = 0;
4236                            continue;
4237                        }
4238                        else {
4239                            if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
4240                                goto onError;
4241                            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4242                            writer.pos++;
4243                            surrogate = 0;
4244                        }
4245                    }
4246                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4247                        /* first surrogate */
4248                        surrogate = outCh;
4249                    }
4250                    else {
4251                        if (_PyUnicodeWriter_Prepare(&writer, 1, outCh) == -1)
4252                            goto onError;
4253                        PyUnicode_WRITE(writer.kind, writer.data, writer.pos, outCh);
4254                        writer.pos++;
4255                    }
4256                }
4257            }
4258            else { /* now leaving a base-64 section */
4259                inShift = 0;
4260                s++;
4261                if (surrogate) {
4262                    if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
4263                        goto onError;
4264                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4265                    writer.pos++;
4266                    surrogate = 0;
4267                }
4268                if (base64bits > 0) { /* left-over bits */
4269                    if (base64bits >= 6) {
4270                        /* We've seen at least one base-64 character */
4271                        errmsg = "partial character in shift sequence";
4272                        goto utf7Error;
4273                    }
4274                    else {
4275                        /* Some bits remain; they should be zero */
4276                        if (base64buffer != 0) {
4277                            errmsg = "non-zero padding bits in shift sequence";
4278                            goto utf7Error;
4279                        }
4280                    }
4281                }
4282                if (ch != '-') {
4283                    /* '-' is absorbed; other terminating
4284                       characters are preserved */
4285                    if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4286                        goto onError;
4287                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4288                    writer.pos++;
4289                }
4290            }
4291        }
4292        else if ( ch == '+' ) {
4293            startinpos = s-starts;
4294            s++; /* consume '+' */
4295            if (s < e && *s == '-') { /* '+-' encodes '+' */
4296                s++;
4297                if (_PyUnicodeWriter_Prepare(&writer, 1, '+') == -1)
4298                    goto onError;
4299                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '+');
4300                writer.pos++;
4301            }
4302            else { /* begin base64-encoded section */
4303                inShift = 1;
4304                shiftOutStart = writer.pos;
4305                base64bits = 0;
4306            }
4307        }
4308        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4309            s++;
4310            if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4311                goto onError;
4312            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4313            writer.pos++;
4314        }
4315        else {
4316            startinpos = s-starts;
4317            s++;
4318            errmsg = "unexpected special character";
4319            goto utf7Error;
4320        }
4321        continue;
4322utf7Error:
4323        endinpos = s-starts;
4324        if (unicode_decode_call_errorhandler_writer(
4325                errors, &errorHandler,
4326                "utf7", errmsg,
4327                &starts, &e, &startinpos, &endinpos, &exc, &s,
4328                &writer))
4329            goto onError;
4330    }
4331
4332    /* end of string */
4333
4334    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4335        /* if we're in an inconsistent state, that's an error */
4336        if (surrogate ||
4337                (base64bits >= 6) ||
4338                (base64bits > 0 && base64buffer != 0)) {
4339            endinpos = size;
4340            if (unicode_decode_call_errorhandler_writer(
4341                    errors, &errorHandler,
4342                    "utf7", "unterminated shift sequence",
4343                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4344                    &writer))
4345                goto onError;
4346            if (s < e)
4347                goto restart;
4348        }
4349    }
4350
4351    /* return state */
4352    if (consumed) {
4353        if (inShift) {
4354            writer.pos = shiftOutStart; /* back off output */
4355            *consumed = startinpos;
4356        }
4357        else {
4358            *consumed = s-starts;
4359        }
4360    }
4361
4362    Py_XDECREF(errorHandler);
4363    Py_XDECREF(exc);
4364    return _PyUnicodeWriter_Finish(&writer);
4365
4366  onError:
4367    Py_XDECREF(errorHandler);
4368    Py_XDECREF(exc);
4369    _PyUnicodeWriter_Dealloc(&writer);
4370    return NULL;
4371}
4372
4373
4374PyObject *
4375_PyUnicode_EncodeUTF7(PyObject *str,
4376                      int base64SetO,
4377                      int base64WhiteSpace,
4378                      const char *errors)
4379{
4380    int kind;
4381    void *data;
4382    Py_ssize_t len;
4383    PyObject *v;
4384    int inShift = 0;
4385    Py_ssize_t i;
4386    unsigned int base64bits = 0;
4387    unsigned long base64buffer = 0;
4388    char * out;
4389    char * start;
4390
4391    if (PyUnicode_READY(str) == -1)
4392        return NULL;
4393    kind = PyUnicode_KIND(str);
4394    data = PyUnicode_DATA(str);
4395    len = PyUnicode_GET_LENGTH(str);
4396
4397    if (len == 0)
4398        return PyBytes_FromStringAndSize(NULL, 0);
4399
4400    /* It might be possible to tighten this worst case */
4401    if (len > PY_SSIZE_T_MAX / 8)
4402        return PyErr_NoMemory();
4403    v = PyBytes_FromStringAndSize(NULL, len * 8);
4404    if (v == NULL)
4405        return NULL;
4406
4407    start = out = PyBytes_AS_STRING(v);
4408    for (i = 0; i < len; ++i) {
4409        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4410
4411        if (inShift) {
4412            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4413                /* shifting out */
4414                if (base64bits) { /* output remaining bits */
4415                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4416                    base64buffer = 0;
4417                    base64bits = 0;
4418                }
4419                inShift = 0;
4420                /* Characters not in the BASE64 set implicitly unshift the sequence
4421                   so no '-' is required, except if the character is itself a '-' */
4422                if (IS_BASE64(ch) || ch == '-') {
4423                    *out++ = '-';
4424                }
4425                *out++ = (char) ch;
4426            }
4427            else {
4428                goto encode_char;
4429            }
4430        }
4431        else { /* not in a shift sequence */
4432            if (ch == '+') {
4433                *out++ = '+';
4434                        *out++ = '-';
4435            }
4436            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4437                *out++ = (char) ch;
4438            }
4439            else {
4440                *out++ = '+';
4441                inShift = 1;
4442                goto encode_char;
4443            }
4444        }
4445        continue;
4446encode_char:
4447        if (ch >= 0x10000) {
4448            assert(ch <= MAX_UNICODE);
4449
4450            /* code first surrogate */
4451            base64bits += 16;
4452            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4453            while (base64bits >= 6) {
4454                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4455                base64bits -= 6;
4456            }
4457            /* prepare second surrogate */
4458            ch = Py_UNICODE_LOW_SURROGATE(ch);
4459        }
4460        base64bits += 16;
4461        base64buffer = (base64buffer << 16) | ch;
4462        while (base64bits >= 6) {
4463            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4464            base64bits -= 6;
4465        }
4466    }
4467    if (base64bits)
4468        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4469    if (inShift)
4470        *out++ = '-';
4471    if (_PyBytes_Resize(&v, out - start) < 0)
4472        return NULL;
4473    return v;
4474}
4475PyObject *
4476PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4477                     Py_ssize_t size,
4478                     int base64SetO,
4479                     int base64WhiteSpace,
4480                     const char *errors)
4481{
4482    PyObject *result;
4483    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4484    if (tmp == NULL)
4485        return NULL;
4486    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4487                                   base64WhiteSpace, errors);
4488    Py_DECREF(tmp);
4489    return result;
4490}
4491
4492#undef IS_BASE64
4493#undef FROM_BASE64
4494#undef TO_BASE64
4495#undef DECODE_DIRECT
4496#undef ENCODE_DIRECT
4497
4498/* --- UTF-8 Codec -------------------------------------------------------- */
4499
4500PyObject *
4501PyUnicode_DecodeUTF8(const char *s,
4502                     Py_ssize_t size,
4503                     const char *errors)
4504{
4505    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4506}
4507
4508#include "stringlib/asciilib.h"
4509#include "stringlib/codecs.h"
4510#include "stringlib/undef.h"
4511
4512#include "stringlib/ucs1lib.h"
4513#include "stringlib/codecs.h"
4514#include "stringlib/undef.h"
4515
4516#include "stringlib/ucs2lib.h"
4517#include "stringlib/codecs.h"
4518#include "stringlib/undef.h"
4519
4520#include "stringlib/ucs4lib.h"
4521#include "stringlib/codecs.h"
4522#include "stringlib/undef.h"
4523
4524/* Mask to quickly check whether a C 'long' contains a
4525   non-ASCII, UTF8-encoded char. */
4526#if (SIZEOF_LONG == 8)
4527# define ASCII_CHAR_MASK 0x8080808080808080UL
4528#elif (SIZEOF_LONG == 4)
4529# define ASCII_CHAR_MASK 0x80808080UL
4530#else
4531# error C 'long' size should be either 4 or 8!
4532#endif
4533
/* Copy the leading run of ASCII bytes (high bit clear) from [start, end)
   into dest and return the number of bytes copied.  Scanning stops at the
   first byte with bit 0x80 set, or at end.

   The input is examined one C 'long' at a time wherever the read pointer
   is aligned, using ASCII_CHAR_MASK to test all bytes of the word at
   once. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* last long-aligned address not past end; word reads stop before it */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

#if SIZEOF_LONG <= SIZEOF_VOID_P
    /* dest is required to be long-aligned; if the source is aligned too,
       whole words can be copied straight from source to destination. */
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help register allocation */
        register const char *_p = p;
        register Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte: the unaligned tail, or the bytes of the
           word that contained a non-ASCII byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
    /* General path: only scan here (word-wise whenever p is aligned),
       then copy the whole ASCII prefix with a single memcpy below. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help register allocation */
            register const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* reached the end exactly: do not dereference p below */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
4588
4589PyObject *
4590PyUnicode_DecodeUTF8Stateful(const char *s,
4591                             Py_ssize_t size,
4592                             const char *errors,
4593                             Py_ssize_t *consumed)
4594{
4595    _PyUnicodeWriter writer;
4596    const char *starts = s;
4597    const char *end = s + size;
4598
4599    Py_ssize_t startinpos;
4600    Py_ssize_t endinpos;
4601    const char *errmsg = "";
4602    PyObject *errorHandler = NULL;
4603    PyObject *exc = NULL;
4604
4605    if (size == 0) {
4606        if (consumed)
4607            *consumed = 0;
4608        Py_INCREF(unicode_empty);
4609        return unicode_empty;
4610    }
4611
4612    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4613    if (size == 1 && (unsigned char)s[0] < 128) {
4614        if (consumed)
4615            *consumed = 1;
4616        return get_latin1_char((unsigned char)s[0]);
4617    }
4618
4619    _PyUnicodeWriter_Init(&writer, 0);
4620    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4621        goto onError;
4622
4623    writer.pos = ascii_decode(s, end, writer.data);
4624    s += writer.pos;
4625    while (s < end) {
4626        Py_UCS4 ch;
4627        int kind = writer.kind;
4628        if (kind == PyUnicode_1BYTE_KIND) {
4629            if (PyUnicode_IS_ASCII(writer.buffer))
4630                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4631            else
4632                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4633        } else if (kind == PyUnicode_2BYTE_KIND) {
4634            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4635        } else {
4636            assert(kind == PyUnicode_4BYTE_KIND);
4637            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4638        }
4639
4640        switch (ch) {
4641        case 0:
4642            if (s == end || consumed)
4643                goto End;
4644            errmsg = "unexpected end of data";
4645            startinpos = s - starts;
4646            endinpos = end - starts;
4647            break;
4648        case 1:
4649            errmsg = "invalid start byte";
4650            startinpos = s - starts;
4651            endinpos = startinpos + 1;
4652            break;
4653        case 2:
4654        case 3:
4655        case 4:
4656            errmsg = "invalid continuation byte";
4657            startinpos = s - starts;
4658            endinpos = startinpos + ch - 1;
4659            break;
4660        default:
4661            if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4662                goto onError;
4663            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4664            writer.pos++;
4665            continue;
4666        }
4667
4668        if (unicode_decode_call_errorhandler_writer(
4669                errors, &errorHandler,
4670                "utf-8", errmsg,
4671                &starts, &end, &startinpos, &endinpos, &exc, &s,
4672                &writer))
4673            goto onError;
4674    }
4675
4676End:
4677    if (consumed)
4678        *consumed = s - starts;
4679
4680    Py_XDECREF(errorHandler);
4681    Py_XDECREF(exc);
4682    return _PyUnicodeWriter_Finish(&writer);
4683
4684onError:
4685    Py_XDECREF(errorHandler);
4686    Py_XDECREF(exc);
4687    _PyUnicodeWriter_Dealloc(&writer);
4688    return NULL;
4689}
4690
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s, size: the UTF-8 encoded input bytes.

   Undecodable bytes are mapped to 0xDC00 + byte value.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* The decoder hands back a character it could not store in a
               16-bit unit, i.e. a code point beyond the BMP (not a
               surrogate); split it into a surrogate pair. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /*  compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch is a decoder status code: 0 at the end of input means
               everything was decoded */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
4746
4747/* Primary internal function which creates utf8 encoded bytes objects.
4748
4749   Allocation strategy:  if the string is short, convert into a stack buffer
4750   and allocate exactly as much space needed at the end.  Else allocate the
4751   maximum possible needed (4 result bytes per Unicode character), and return
4752   the excess memory at the end.
4753*/
4754PyObject *
4755_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4756{
4757    enum PyUnicode_Kind kind;
4758    void *data;
4759    Py_ssize_t size;
4760
4761    if (!PyUnicode_Check(unicode)) {
4762        PyErr_BadArgument();
4763        return NULL;
4764    }
4765
4766    if (PyUnicode_READY(unicode) == -1)
4767        return NULL;
4768
4769    if (PyUnicode_UTF8(unicode))
4770        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4771                                         PyUnicode_UTF8_LENGTH(unicode));
4772
4773    kind = PyUnicode_KIND(unicode);
4774    data = PyUnicode_DATA(unicode);
4775    size = PyUnicode_GET_LENGTH(unicode);
4776
4777    switch (kind) {
4778    default:
4779        assert(0);
4780    case PyUnicode_1BYTE_KIND:
4781        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4782        assert(!PyUnicode_IS_ASCII(unicode));
4783        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4784    case PyUnicode_2BYTE_KIND:
4785        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4786    case PyUnicode_4BYTE_KIND:
4787        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4788    }
4789}
4790
4791PyObject *
4792PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4793                     Py_ssize_t size,
4794                     const char *errors)
4795{
4796    PyObject *v, *unicode;
4797
4798    unicode = PyUnicode_FromUnicode(s, size);
4799    if (unicode == NULL)
4800        return NULL;
4801    v = _PyUnicode_AsUTF8String(unicode, errors);
4802    Py_DECREF(unicode);
4803    return v;
4804}
4805
4806PyObject *
4807PyUnicode_AsUTF8String(PyObject *unicode)
4808{
4809    return _PyUnicode_AsUTF8String(unicode, NULL);
4810}
4811
4812/* --- UTF-32 Codec ------------------------------------------------------- */
4813
4814PyObject *
4815PyUnicode_DecodeUTF32(const char *s,
4816                      Py_ssize_t size,
4817                      const char *errors,
4818                      int *byteorder)
4819{
4820    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4821}
4822
4823PyObject *
4824PyUnicode_DecodeUTF32Stateful(const char *s,
4825                              Py_ssize_t size,
4826                              const char *errors,
4827                              int *byteorder,
4828                              Py_ssize_t *consumed)
4829{
4830    const char *starts = s;
4831    Py_ssize_t startinpos;
4832    Py_ssize_t endinpos;
4833    _PyUnicodeWriter writer;
4834    const unsigned char *q, *e;
4835    int le, bo = 0;       /* assume native ordering by default */
4836    const char *errmsg = "";
4837    PyObject *errorHandler = NULL;
4838    PyObject *exc = NULL;
4839
4840    q = (unsigned char *)s;
4841    e = q + size;
4842
4843    if (byteorder)
4844        bo = *byteorder;
4845
4846    /* Check for BOM marks (U+FEFF) in the input and adjust current
4847       byte order setting accordingly. In native mode, the leading BOM
4848       mark is skipped, in all other modes, it is copied to the output
4849       stream as-is (giving a ZWNBSP character). */
4850    if (bo == 0 && size >= 4) {
4851        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4852        if (bom == 0x0000FEFF) {
4853            bo = -1;
4854            q += 4;
4855        }
4856        else if (bom == 0xFFFE0000) {
4857            bo = 1;
4858            q += 4;
4859        }
4860        if (byteorder)
4861            *byteorder = bo;
4862    }
4863
4864    if (q == e) {
4865        if (consumed)
4866            *consumed = size;
4867        Py_INCREF(unicode_empty);
4868        return unicode_empty;
4869    }
4870
4871#ifdef WORDS_BIGENDIAN
4872    le = bo < 0;
4873#else
4874    le = bo <= 0;
4875#endif
4876
4877    _PyUnicodeWriter_Init(&writer, 0);
4878    if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4879        goto onError;
4880
4881    while (1) {
4882        Py_UCS4 ch = 0;
4883        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
4884
4885        if (e - q >= 4) {
4886            enum PyUnicode_Kind kind = writer.kind;
4887            void *data = writer.data;
4888            const unsigned char *last = e - 4;
4889            Py_ssize_t pos = writer.pos;
4890            if (le) {
4891                do {
4892                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4893                    if (ch > maxch)
4894                        break;
4895                    PyUnicode_WRITE(kind, data, pos++, ch);
4896                    q += 4;
4897                } while (q <= last);
4898            }
4899            else {
4900                do {
4901                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4902                    if (ch > maxch)
4903                        break;
4904                    PyUnicode_WRITE(kind, data, pos++, ch);
4905                    q += 4;
4906                } while (q <= last);
4907            }
4908            writer.pos = pos;
4909        }
4910
4911        if (ch <= maxch) {
4912            if (q == e || consumed)
4913                break;
4914            /* remaining bytes at the end? (size should be divisible by 4) */
4915            errmsg = "truncated data";
4916            startinpos = ((const char *)q) - starts;
4917            endinpos = ((const char *)e) - starts;
4918        }
4919        else {
4920            if (ch < 0x110000) {
4921                if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4922                    goto onError;
4923                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4924                writer.pos++;
4925                q += 4;
4926                continue;
4927            }
4928            errmsg = "codepoint not in range(0x110000)";
4929            startinpos = ((const char *)q) - starts;
4930            endinpos = startinpos + 4;
4931        }
4932
4933        /* The remaining input chars are ignored if the callback
4934           chooses to skip the input */
4935        if (unicode_decode_call_errorhandler_writer(
4936                errors, &errorHandler,
4937                "utf32", errmsg,
4938                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4939                &writer))
4940            goto onError;
4941    }
4942
4943    if (consumed)
4944        *consumed = (const char *)q-starts;
4945
4946    Py_XDECREF(errorHandler);
4947    Py_XDECREF(exc);
4948    return _PyUnicodeWriter_Finish(&writer);
4949
4950  onError:
4951    _PyUnicodeWriter_Dealloc(&writer);
4952    Py_XDECREF(errorHandler);
4953    Py_XDECREF(exc);
4954    return NULL;
4955}
4956
4957PyObject *
4958_PyUnicode_EncodeUTF32(PyObject *str,
4959                       const char *errors,
4960                       int byteorder)
4961{
4962    int kind;
4963    void *data;
4964    Py_ssize_t len;
4965    PyObject *v;
4966    unsigned char *p;
4967    Py_ssize_t nsize, i;
4968    /* Offsets from p for storing byte pairs in the right order. */
4969#if PY_LITTLE_ENDIAN
4970    int iorder[] = {0, 1, 2, 3};
4971#else
4972    int iorder[] = {3, 2, 1, 0};
4973#endif
4974
4975#define STORECHAR(CH)                           \
4976    do {                                        \
4977        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
4978        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
4979        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
4980        p[iorder[0]] = (CH) & 0xff;             \
4981        p += 4;                                 \
4982    } while(0)
4983
4984    if (!PyUnicode_Check(str)) {
4985        PyErr_BadArgument();
4986        return NULL;
4987    }
4988    if (PyUnicode_READY(str) == -1)
4989        return NULL;
4990    kind = PyUnicode_KIND(str);
4991    data = PyUnicode_DATA(str);
4992    len = PyUnicode_GET_LENGTH(str);
4993
4994    nsize = len + (byteorder == 0);
4995    if (nsize > PY_SSIZE_T_MAX / 4)
4996        return PyErr_NoMemory();
4997    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
4998    if (v == NULL)
4999        return NULL;
5000
5001    p = (unsigned char *)PyBytes_AS_STRING(v);
5002    if (byteorder == 0)
5003        STORECHAR(0xFEFF);
5004    if (len == 0)
5005        goto done;
5006
5007    if (byteorder == -1) {
5008        /* force LE */
5009        iorder[0] = 0;
5010        iorder[1] = 1;
5011        iorder[2] = 2;
5012        iorder[3] = 3;
5013    }
5014    else if (byteorder == 1) {
5015        /* force BE */
5016        iorder[0] = 3;
5017        iorder[1] = 2;
5018        iorder[2] = 1;
5019        iorder[3] = 0;
5020    }
5021
5022    for (i = 0; i < len; i++)
5023        STORECHAR(PyUnicode_READ(kind, data, i));
5024
5025  done:
5026    return v;
5027#undef STORECHAR
5028}
5029
5030PyObject *
5031PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5032                      Py_ssize_t size,
5033                      const char *errors,
5034                      int byteorder)
5035{
5036    PyObject *result;
5037    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5038    if (tmp == NULL)
5039        return NULL;
5040    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5041    Py_DECREF(tmp);
5042    return result;
5043}
5044
5045PyObject *
5046PyUnicode_AsUTF32String(PyObject *unicode)
5047{
5048    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5049}
5050
5051/* --- UTF-16 Codec ------------------------------------------------------- */
5052
5053PyObject *
5054PyUnicode_DecodeUTF16(const char *s,
5055                      Py_ssize_t size,
5056                      const char *errors,
5057                      int *byteorder)
5058{
5059    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5060}
5061
5062PyObject *
5063PyUnicode_DecodeUTF16Stateful(const char *s,
5064                              Py_ssize_t size,
5065                              const char *errors,
5066                              int *byteorder,
5067                              Py_ssize_t *consumed)
5068{
5069    const char *starts = s;
5070    Py_ssize_t startinpos;
5071    Py_ssize_t endinpos;
5072    _PyUnicodeWriter writer;
5073    const unsigned char *q, *e;
5074    int bo = 0;       /* assume native ordering by default */
5075    int native_ordering;
5076    const char *errmsg = "";
5077    PyObject *errorHandler = NULL;
5078    PyObject *exc = NULL;
5079
5080    q = (unsigned char *)s;
5081    e = q + size;
5082
5083    if (byteorder)
5084        bo = *byteorder;
5085
5086    /* Check for BOM marks (U+FEFF) in the input and adjust current
5087       byte order setting accordingly. In native mode, the leading BOM
5088       mark is skipped, in all other modes, it is copied to the output
5089       stream as-is (giving a ZWNBSP character). */
5090    if (bo == 0 && size >= 2) {
5091        const Py_UCS4 bom = (q[1] << 8) | q[0];
5092        if (bom == 0xFEFF) {
5093            q += 2;
5094            bo = -1;
5095        }
5096        else if (bom == 0xFFFE) {
5097            q += 2;
5098            bo = 1;
5099        }
5100        if (byteorder)
5101            *byteorder = bo;
5102    }
5103
5104    if (q == e) {
5105        if (consumed)
5106            *consumed = size;
5107        Py_INCREF(unicode_empty);
5108        return unicode_empty;
5109    }
5110
5111#if PY_LITTLE_ENDIAN
5112    native_ordering = bo <= 0;
5113#else
5114    native_ordering = bo >= 0;
5115#endif
5116
5117    /* Note: size will always be longer than the resulting Unicode
5118       character count */
5119    _PyUnicodeWriter_Init(&writer, 0);
5120    if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5121        goto onError;
5122
5123    while (1) {
5124        Py_UCS4 ch = 0;
5125        if (e - q >= 2) {
5126            int kind = writer.kind;
5127            if (kind == PyUnicode_1BYTE_KIND) {
5128                if (PyUnicode_IS_ASCII(writer.buffer))
5129                    ch = asciilib_utf16_decode(&q, e,
5130                            (Py_UCS1*)writer.data, &writer.pos,
5131                            native_ordering);
5132                else
5133                    ch = ucs1lib_utf16_decode(&q, e,
5134                            (Py_UCS1*)writer.data, &writer.pos,
5135                            native_ordering);
5136            } else if (kind == PyUnicode_2BYTE_KIND) {
5137                ch = ucs2lib_utf16_decode(&q, e,
5138                        (Py_UCS2*)writer.data, &writer.pos,
5139                        native_ordering);
5140            } else {
5141                assert(kind == PyUnicode_4BYTE_KIND);
5142                ch = ucs4lib_utf16_decode(&q, e,
5143                        (Py_UCS4*)writer.data, &writer.pos,
5144                        native_ordering);
5145            }
5146        }
5147
5148        switch (ch)
5149        {
5150        case 0:
5151            /* remaining byte at the end? (size should be even) */
5152            if (q == e || consumed)
5153                goto End;
5154            errmsg = "truncated data";
5155            startinpos = ((const char *)q) - starts;
5156            endinpos = ((const char *)e) - starts;
5157            break;
5158            /* The remaining input chars are ignored if the callback
5159               chooses to skip the input */
5160        case 1:
5161            errmsg = "unexpected end of data";
5162            startinpos = ((const char *)q) - 2 - starts;
5163            endinpos = ((const char *)e) - starts;
5164            break;
5165        case 2:
5166            errmsg = "illegal encoding";
5167            startinpos = ((const char *)q) - 2 - starts;
5168            endinpos = startinpos + 2;
5169            break;
5170        case 3:
5171            errmsg = "illegal UTF-16 surrogate";
5172            startinpos = ((const char *)q) - 4 - starts;
5173            endinpos = startinpos + 2;
5174            break;
5175        default:
5176            if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
5177                goto onError;
5178            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
5179            writer.pos++;
5180            continue;
5181        }
5182
5183        if (unicode_decode_call_errorhandler_writer(
5184                errors,
5185                &errorHandler,
5186                "utf16", errmsg,
5187                &starts,
5188                (const char **)&e,
5189                &startinpos,
5190                &endinpos,
5191                &exc,
5192                (const char **)&q,
5193                &writer))
5194            goto onError;
5195    }
5196
5197End:
5198    if (consumed)
5199        *consumed = (const char *)q-starts;
5200
5201    Py_XDECREF(errorHandler);
5202    Py_XDECREF(exc);
5203    return _PyUnicodeWriter_Finish(&writer);
5204
5205  onError:
5206    _PyUnicodeWriter_Dealloc(&writer);
5207    Py_XDECREF(errorHandler);
5208    Py_XDECREF(exc);
5209    return NULL;
5210}
5211
5212PyObject *
5213_PyUnicode_EncodeUTF16(PyObject *str,
5214                       const char *errors,
5215                       int byteorder)
5216{
5217    enum PyUnicode_Kind kind;
5218    const void *data;
5219    Py_ssize_t len;
5220    PyObject *v;
5221    unsigned short *out;
5222    Py_ssize_t bytesize;
5223    Py_ssize_t pairs;
5224#if PY_BIG_ENDIAN
5225    int native_ordering = byteorder >= 0;
5226#else
5227    int native_ordering = byteorder <= 0;
5228#endif
5229
5230    if (!PyUnicode_Check(str)) {
5231        PyErr_BadArgument();
5232        return NULL;
5233    }
5234    if (PyUnicode_READY(str) == -1)
5235        return NULL;
5236    kind = PyUnicode_KIND(str);
5237    data = PyUnicode_DATA(str);
5238    len = PyUnicode_GET_LENGTH(str);
5239
5240    pairs = 0;
5241    if (kind == PyUnicode_4BYTE_KIND) {
5242        const Py_UCS4 *in = (const Py_UCS4 *)data;
5243        const Py_UCS4 *end = in + len;
5244        while (in < end)
5245            if (*in++ >= 0x10000)
5246                pairs++;
5247    }
5248    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5249        return PyErr_NoMemory();
5250    bytesize = (len + pairs + (byteorder == 0)) * 2;
5251    v = PyBytes_FromStringAndSize(NULL, bytesize);
5252    if (v == NULL)
5253        return NULL;
5254
5255    /* output buffer is 2-bytes aligned */
5256    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5257    out = (unsigned short *)PyBytes_AS_STRING(v);
5258    if (byteorder == 0)
5259        *out++ = 0xFEFF;
5260    if (len == 0)
5261        goto done;
5262
5263    switch (kind) {
5264    case PyUnicode_1BYTE_KIND: {
5265        ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5266        break;
5267    }
5268    case PyUnicode_2BYTE_KIND: {
5269        ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5270        break;
5271    }
5272    case PyUnicode_4BYTE_KIND: {
5273        ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5274        break;
5275    }
5276    default:
5277        assert(0);
5278    }
5279
5280  done:
5281    return v;
5282}
5283
5284PyObject *
5285PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5286                      Py_ssize_t size,
5287                      const char *errors,
5288                      int byteorder)
5289{
5290    PyObject *result;
5291    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5292    if (tmp == NULL)
5293        return NULL;
5294    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5295    Py_DECREF(tmp);
5296    return result;
5297}
5298
5299PyObject *
5300PyUnicode_AsUTF16String(PyObject *unicode)
5301{
5302    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5303}
5304
5305/* --- Unicode Escape Codec ----------------------------------------------- */
5306
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s     points to the escaped input.
   size  is the number of input bytes.

   Returns -1 if size is negative, if a non-ASCII byte is present, if a
   backslash is the last byte, or if any escapes were found which can cause
   the string to pop out of ASCII range (octal, \x, \u, \U, \N).
   Otherwise returns the length of the required buffer to hold the string:
   1 per plain character or simple escape, 0 for backslash-newline, and 2
   for an unrecognized escape (the backslash and the character are kept).
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming s + size: pointer arithmetic with a
       negative offset would leave the input buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5361
5362static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5363
5364PyObject *
5365PyUnicode_DecodeUnicodeEscape(const char *s,
5366                              Py_ssize_t size,
5367                              const char *errors)
5368{
5369    const char *starts = s;
5370    Py_ssize_t startinpos;
5371    Py_ssize_t endinpos;
5372    int j;
5373    _PyUnicodeWriter writer;
5374    const char *end;
5375    char* message;
5376    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5377    PyObject *errorHandler = NULL;
5378    PyObject *exc = NULL;
5379    Py_ssize_t len;
5380
5381    len = length_of_escaped_ascii_string(s, size);
5382    if (len == 0) {
5383        Py_INCREF(unicode_empty);
5384        return unicode_empty;
5385    }
5386
5387    /* After length_of_escaped_ascii_string() there are two alternatives,
5388       either the string is pure ASCII with named escapes like \n, etc.
5389       and we determined it's exact size (common case)
5390       or it contains \x, \u, ... escape sequences.  then we create a
5391       legacy wchar string and resize it at the end of this function. */
5392    _PyUnicodeWriter_Init(&writer, 0);
5393    if (len > 0) {
5394        if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
5395            goto onError;
5396        assert(writer.kind == PyUnicode_1BYTE_KIND);
5397    }
5398    else {
5399        /* Escaped strings will always be longer than the resulting
5400           Unicode string, so we start with size here and then reduce the
5401           length after conversion to the true value.
5402           (but if the error callback returns a long replacement string
5403           we'll have to allocate more space) */
5404        if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
5405            goto onError;
5406    }
5407
5408    if (size == 0)
5409        return _PyUnicodeWriter_Finish(&writer);
5410    end = s + size;
5411
5412    while (s < end) {
5413        unsigned char c;
5414        Py_UCS4 x;
5415        int digits;
5416
5417        /* Non-escape characters are interpreted as Unicode ordinals */
5418        if (*s != '\\') {
5419            x = (unsigned char)*s;
5420            s++;
5421            if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
5422                goto onError;
5423            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5424            writer.pos++;
5425            continue;
5426        }
5427
5428        startinpos = s-starts;
5429        /* \ - Escapes */
5430        s++;
5431        c = *s++;
5432        if (s > end)
5433            c = '\0'; /* Invalid after \ */
5434
5435        /* The only case in which i == ascii_length is a backslash
5436           followed by a newline. */
5437        assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
5438
5439        switch (c) {
5440
5441            /* \x escapes */
5442#define WRITECHAR(ch)                                                      \
5443            do {                                                           \
5444                if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)        \
5445                    goto onError;                                          \
5446                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); \
5447                writer.pos++;                                              \
5448            } while(0)
5449
5450        case '\n': break;
5451        case '\\': WRITECHAR('\\'); break;
5452        case '\'': WRITECHAR('\''); break;
5453        case '\"': WRITECHAR('\"'); break;
5454        case 'b': WRITECHAR('\b'); break;
5455        /* FF */
5456        case 'f': WRITECHAR('\014'); break;
5457        case 't': WRITECHAR('\t'); break;
5458        case 'n': WRITECHAR('\n'); break;
5459        case 'r': WRITECHAR('\r'); break;
5460        /* VT */
5461        case 'v': WRITECHAR('\013'); break;
5462        /* BEL, not classic C */
5463        case 'a': WRITECHAR('\007'); break;
5464
5465            /* \OOO (octal) escapes */
5466        case '0': case '1': case '2': case '3':
5467        case '4': case '5': case '6': case '7':
5468            x = s[-1] - '0';
5469            if (s < end && '0' <= *s && *s <= '7') {
5470                x = (x<<3) + *s++ - '0';
5471                if (s < end && '0' <= *s && *s <= '7')
5472                    x = (x<<3) + *s++ - '0';
5473            }
5474            WRITECHAR(x);
5475            break;
5476
5477            /* hex escapes */
5478            /* \xXX */
5479        case 'x':
5480            digits = 2;
5481            message = "truncated \\xXX escape";
5482            goto hexescape;
5483
5484            /* \uXXXX */
5485        case 'u':
5486            digits = 4;
5487            message = "truncated \\uXXXX escape";
5488            goto hexescape;
5489
5490            /* \UXXXXXXXX */
5491        case 'U':
5492            digits = 8;
5493            message = "truncated \\UXXXXXXXX escape";
5494        hexescape:
5495            chr = 0;
5496            if (s+digits>end) {
5497                endinpos = size;
5498                if (unicode_decode_call_errorhandler_writer(
5499                        errors, &errorHandler,
5500                        "unicodeescape", "end of string in escape sequence",
5501                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5502                        &writer))
5503                    goto onError;
5504                goto nextByte;
5505            }
5506            for (j = 0; j < digits; ++j) {
5507                c = (unsigned char) s[j];
5508                if (!Py_ISXDIGIT(c)) {
5509                    endinpos = (s+j+1)-starts;
5510                    if (unicode_decode_call_errorhandler_writer(
5511                            errors, &errorHandler,
5512                            "unicodeescape", message,
5513                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5514                            &writer))
5515                        goto onError;
5516                    goto nextByte;
5517                }
5518                chr = (chr<<4) & ~0xF;
5519                if (c >= '0' && c <= '9')
5520                    chr += c - '0';
5521                else if (c >= 'a' && c <= 'f')
5522                    chr += 10 + c - 'a';
5523                else
5524                    chr += 10 + c - 'A';
5525            }
5526            s += j;
5527            if (chr == 0xffffffff && PyErr_Occurred())
5528                /* _decoding_error will have already written into the
5529                   target buffer. */
5530                break;
5531        store:
5532            /* when we get here, chr is a 32-bit unicode character */
5533            if (chr <= MAX_UNICODE) {
5534                WRITECHAR(chr);
5535            } else {
5536                endinpos = s-starts;
5537                if (unicode_decode_call_errorhandler_writer(
5538                        errors, &errorHandler,
5539                        "unicodeescape", "illegal Unicode character",
5540                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5541                        &writer))
5542                    goto onError;
5543            }
5544            break;
5545
5546            /* \N{name} */
5547        case 'N':
5548            message = "malformed \\N character escape";
5549            if (ucnhash_CAPI == NULL) {
5550                /* load the unicode data module */
5551                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5552                                                PyUnicodeData_CAPSULE_NAME, 1);
5553                if (ucnhash_CAPI == NULL)
5554                    goto ucnhashError;
5555            }
5556            if (*s == '{') {
5557                const char *start = s+1;
5558                /* look for the closing brace */
5559                while (*s != '}' && s < end)
5560                    s++;
5561                if (s > start && s < end && *s == '}') {
5562                    /* found a name.  look it up in the unicode database */
5563                    message = "unknown Unicode character name";
5564                    s++;
5565                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5566                                              &chr, 0))
5567                        goto store;
5568                }
5569            }
5570            endinpos = s-starts;
5571            if (unicode_decode_call_errorhandler_writer(
5572                    errors, &errorHandler,
5573                    "unicodeescape", message,
5574                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5575                    &writer))
5576                goto onError;
5577            break;
5578
5579        default:
5580            if (s > end) {
5581                message = "\\ at end of string";
5582                s--;
5583                endinpos = s-starts;
5584                if (unicode_decode_call_errorhandler_writer(
5585                        errors, &errorHandler,
5586                        "unicodeescape", message,
5587                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5588                        &writer))
5589                    goto onError;
5590            }
5591            else {
5592                WRITECHAR('\\');
5593                WRITECHAR(s[-1]);
5594            }
5595            break;
5596        }
5597      nextByte:
5598        ;
5599    }
5600#undef WRITECHAR
5601
5602    Py_XDECREF(errorHandler);
5603    Py_XDECREF(exc);
5604    return _PyUnicodeWriter_Finish(&writer);
5605
5606  ucnhashError:
5607    PyErr_SetString(
5608        PyExc_UnicodeError,
5609        "\\N escapes not supported (can't load unicodedata module)"
5610        );
5611    _PyUnicodeWriter_Dealloc(&writer);
5612    Py_XDECREF(errorHandler);
5613    Py_XDECREF(exc);
5614    return NULL;
5615
5616  onError:
5617    _PyUnicodeWriter_Dealloc(&writer);
5618    Py_XDECREF(errorHandler);
5619    Py_XDECREF(exc);
5620    return NULL;
5621}
5622
5623/* Return a Unicode-Escape string version of the Unicode object.
5624
5625   If quotes is true, the string is enclosed in u"" or u'' quotes as
5626   appropriate.
5627
5628*/
5629
5630PyObject *
5631PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5632{
5633    Py_ssize_t i, len;
5634    PyObject *repr;
5635    char *p;
5636    int kind;
5637    void *data;
5638    Py_ssize_t expandsize = 0;
5639
5640    /* Initial allocation is based on the longest-possible character
5641       escape.
5642
5643       For UCS1 strings it's '\xxx', 4 bytes per source character.
5644       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5645       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
5646    */
5647
5648    if (!PyUnicode_Check(unicode)) {
5649        PyErr_BadArgument();
5650        return NULL;
5651    }
5652    if (PyUnicode_READY(unicode) == -1)
5653        return NULL;
5654    len = PyUnicode_GET_LENGTH(unicode);
5655    kind = PyUnicode_KIND(unicode);
5656    data = PyUnicode_DATA(unicode);
5657    switch (kind) {
5658    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5659    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5660    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5661    }
5662
5663    if (len == 0)
5664        return PyBytes_FromStringAndSize(NULL, 0);
5665
5666    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5667        return PyErr_NoMemory();
5668
5669    repr = PyBytes_FromStringAndSize(NULL,
5670                                     2
5671                                     + expandsize*len
5672                                     + 1);
5673    if (repr == NULL)
5674        return NULL;
5675
5676    p = PyBytes_AS_STRING(repr);
5677
5678    for (i = 0; i < len; i++) {
5679        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5680
5681        /* Escape backslashes */
5682        if (ch == '\\') {
5683            *p++ = '\\';
5684            *p++ = (char) ch;
5685            continue;
5686        }
5687
5688        /* Map 21-bit characters to '\U00xxxxxx' */
5689        else if (ch >= 0x10000) {
5690            assert(ch <= MAX_UNICODE);
5691            *p++ = '\\';
5692            *p++ = 'U';
5693            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5694            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5695            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5696            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5697            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5698            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5699            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5700            *p++ = Py_hexdigits[ch & 0x0000000F];
5701            continue;
5702        }
5703
5704        /* Map 16-bit characters to '\uxxxx' */
5705        if (ch >= 256) {
5706            *p++ = '\\';
5707            *p++ = 'u';
5708            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5709            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5710            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5711            *p++ = Py_hexdigits[ch & 0x000F];
5712        }
5713
5714        /* Map special whitespace to '\t', \n', '\r' */
5715        else if (ch == '\t') {
5716            *p++ = '\\';
5717            *p++ = 't';
5718        }
5719        else if (ch == '\n') {
5720            *p++ = '\\';
5721            *p++ = 'n';
5722        }
5723        else if (ch == '\r') {
5724            *p++ = '\\';
5725            *p++ = 'r';
5726        }
5727
5728        /* Map non-printable US ASCII to '\xhh' */
5729        else if (ch < ' ' || ch >= 0x7F) {
5730            *p++ = '\\';
5731            *p++ = 'x';
5732            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5733            *p++ = Py_hexdigits[ch & 0x000F];
5734        }
5735
5736        /* Copy everything else as-is */
5737        else
5738            *p++ = (char) ch;
5739    }
5740
5741    assert(p - PyBytes_AS_STRING(repr) > 0);
5742    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5743        return NULL;
5744    return repr;
5745}
5746
5747PyObject *
5748PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5749                              Py_ssize_t size)
5750{
5751    PyObject *result;
5752    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5753    if (tmp == NULL)
5754        return NULL;
5755    result = PyUnicode_AsUnicodeEscapeString(tmp);
5756    Py_DECREF(tmp);
5757    return result;
5758}
5759
5760/* --- Raw Unicode Escape Codec ------------------------------------------- */
5761
5762PyObject *
5763PyUnicode_DecodeRawUnicodeEscape(const char *s,
5764                                 Py_ssize_t size,
5765                                 const char *errors)
5766{
5767    const char *starts = s;
5768    Py_ssize_t startinpos;
5769    Py_ssize_t endinpos;
5770    _PyUnicodeWriter writer;
5771    const char *end;
5772    const char *bs;
5773    PyObject *errorHandler = NULL;
5774    PyObject *exc = NULL;
5775
5776    if (size == 0) {
5777        Py_INCREF(unicode_empty);
5778        return unicode_empty;
5779    }
5780
5781    /* Escaped strings will always be longer than the resulting
5782       Unicode string, so we start with size here and then reduce the
5783       length after conversion to the true value. (But decoding error
5784       handler might have to resize the string) */
5785    _PyUnicodeWriter_Init(&writer, 1);
5786    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
5787        goto onError;
5788
5789    end = s + size;
5790    while (s < end) {
5791        unsigned char c;
5792        Py_UCS4 x;
5793        int i;
5794        int count;
5795
5796        /* Non-escape characters are interpreted as Unicode ordinals */
5797        if (*s != '\\') {
5798            x = (unsigned char)*s++;
5799            if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
5800                goto onError;
5801            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5802            writer.pos++;
5803            continue;
5804        }
5805        startinpos = s-starts;
5806
5807        /* \u-escapes are only interpreted iff the number of leading
5808           backslashes if odd */
5809        bs = s;
5810        for (;s < end;) {
5811            if (*s != '\\')
5812                break;
5813            x = (unsigned char)*s++;
5814            if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
5815                goto onError;
5816            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5817            writer.pos++;
5818        }
5819        if (((s - bs) & 1) == 0 ||
5820            s >= end ||
5821            (*s != 'u' && *s != 'U')) {
5822            continue;
5823        }
5824        writer.pos--;
5825        count = *s=='u' ? 4 : 8;
5826        s++;
5827
5828        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5829        for (x = 0, i = 0; i < count; ++i, ++s) {
5830            c = (unsigned char)*s;
5831            if (!Py_ISXDIGIT(c)) {
5832                endinpos = s-starts;
5833                if (unicode_decode_call_errorhandler_writer(
5834                        errors, &errorHandler,
5835                        "rawunicodeescape", "truncated \\uXXXX",
5836                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5837                        &writer))
5838                    goto onError;
5839                goto nextByte;
5840            }
5841            x = (x<<4) & ~0xF;
5842            if (c >= '0' && c <= '9')
5843                x += c - '0';
5844            else if (c >= 'a' && c <= 'f')
5845                x += 10 + c - 'a';
5846            else
5847                x += 10 + c - 'A';
5848        }
5849        if (x <= MAX_UNICODE) {
5850            if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
5851                goto onError;
5852            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5853            writer.pos++;
5854        }
5855        else {
5856            endinpos = s-starts;
5857            if (unicode_decode_call_errorhandler_writer(
5858                    errors, &errorHandler,
5859                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5860                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5861                    &writer))
5862                goto onError;
5863        }
5864      nextByte:
5865        ;
5866    }
5867    Py_XDECREF(errorHandler);
5868    Py_XDECREF(exc);
5869    return _PyUnicodeWriter_Finish(&writer);
5870
5871  onError:
5872    _PyUnicodeWriter_Dealloc(&writer);
5873    Py_XDECREF(errorHandler);
5874    Py_XDECREF(exc);
5875    return NULL;
5876}
5877
5878
5879PyObject *
5880PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
5881{
5882    PyObject *repr;
5883    char *p;
5884    char *q;
5885    Py_ssize_t expandsize, pos;
5886    int kind;
5887    void *data;
5888    Py_ssize_t len;
5889
5890    if (!PyUnicode_Check(unicode)) {
5891        PyErr_BadArgument();
5892        return NULL;
5893    }
5894    if (PyUnicode_READY(unicode) == -1)
5895        return NULL;
5896    kind = PyUnicode_KIND(unicode);
5897    data = PyUnicode_DATA(unicode);
5898    len = PyUnicode_GET_LENGTH(unicode);
5899    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5900       bytes, and 1 byte characters 4. */
5901    expandsize = kind * 2 + 2;
5902
5903    if (len > PY_SSIZE_T_MAX / expandsize)
5904        return PyErr_NoMemory();
5905
5906    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
5907    if (repr == NULL)
5908        return NULL;
5909    if (len == 0)
5910        return repr;
5911
5912    p = q = PyBytes_AS_STRING(repr);
5913    for (pos = 0; pos < len; pos++) {
5914        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
5915        /* Map 32-bit characters to '\Uxxxxxxxx' */
5916        if (ch >= 0x10000) {
5917            assert(ch <= MAX_UNICODE);
5918            *p++ = '\\';
5919            *p++ = 'U';
5920            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5921            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5922            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5923            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5924            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5925            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5926            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5927            *p++ = Py_hexdigits[ch & 15];
5928        }
5929        /* Map 16-bit characters to '\uxxxx' */
5930        else if (ch >= 256) {
5931            *p++ = '\\';
5932            *p++ = 'u';
5933            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5934            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5935            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5936            *p++ = Py_hexdigits[ch & 15];
5937        }
5938        /* Copy everything else as-is */
5939        else
5940            *p++ = (char) ch;
5941    }
5942
5943    assert(p > q);
5944    if (_PyBytes_Resize(&repr, p - q) < 0)
5945        return NULL;
5946    return repr;
5947}
5948
5949PyObject *
5950PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5951                                 Py_ssize_t size)
5952{
5953    PyObject *result;
5954    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5955    if (tmp == NULL)
5956        return NULL;
5957    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5958    Py_DECREF(tmp);
5959    return result;
5960}
5961
5962/* --- Unicode Internal Codec ------------------------------------------- */
5963
5964PyObject *
5965_PyUnicode_DecodeUnicodeInternal(const char *s,
5966                                 Py_ssize_t size,
5967                                 const char *errors)
5968{
5969    const char *starts = s;
5970    Py_ssize_t startinpos;
5971    Py_ssize_t endinpos;
5972    _PyUnicodeWriter writer;
5973    const char *end;
5974    const char *reason;
5975    PyObject *errorHandler = NULL;
5976    PyObject *exc = NULL;
5977
5978    if (PyErr_WarnEx(PyExc_DeprecationWarning,
5979                     "unicode_internal codec has been deprecated",
5980                     1))
5981        return NULL;
5982
5983    if (size == 0) {
5984        Py_INCREF(unicode_empty);
5985        return unicode_empty;
5986    }
5987
5988    /* XXX overflow detection missing */
5989    _PyUnicodeWriter_Init(&writer, 0);
5990    if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
5991        goto onError;
5992    end = s + size;
5993
5994    while (s < end) {
5995        Py_UNICODE uch;
5996        Py_UCS4 ch;
5997        /* We copy the raw representation one byte at a time because the
5998           pointer may be unaligned (see test_codeccallbacks). */
5999        ((char *) &uch)[0] = s[0];
6000        ((char *) &uch)[1] = s[1];
6001#ifdef Py_UNICODE_WIDE
6002        ((char *) &uch)[2] = s[2];
6003        ((char *) &uch)[3] = s[3];
6004#endif
6005        ch = uch;
6006
6007        /* We have to sanity check the raw data, otherwise doom looms for
6008           some malformed UCS-4 data. */
6009        if (
6010#ifdef Py_UNICODE_WIDE
6011            ch > 0x10ffff ||
6012#endif
6013            end-s < Py_UNICODE_SIZE
6014            )
6015        {
6016            startinpos = s - starts;
6017            if (end-s < Py_UNICODE_SIZE) {
6018                endinpos = end-starts;
6019                reason = "truncated input";
6020            }
6021            else {
6022                endinpos = s - starts + Py_UNICODE_SIZE;
6023                reason = "illegal code point (> 0x10FFFF)";
6024            }
6025            if (unicode_decode_call_errorhandler_writer(
6026                    errors, &errorHandler,
6027                    "unicode_internal", reason,
6028                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6029                    &writer))
6030                goto onError;
6031            continue;
6032        }
6033
6034        s += Py_UNICODE_SIZE;
6035#ifndef Py_UNICODE_WIDE
6036        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
6037        {
6038            Py_UNICODE uch2;
6039            ((char *) &uch2)[0] = s[0];
6040            ((char *) &uch2)[1] = s[1];
6041            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6042            {
6043                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6044                s += Py_UNICODE_SIZE;
6045            }
6046        }
6047#endif
6048
6049        if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
6050            goto onError;
6051        PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
6052        writer.pos++;
6053    }
6054
6055    Py_XDECREF(errorHandler);
6056    Py_XDECREF(exc);
6057    return _PyUnicodeWriter_Finish(&writer);
6058
6059  onError:
6060    _PyUnicodeWriter_Dealloc(&writer);
6061    Py_XDECREF(errorHandler);
6062    Py_XDECREF(exc);
6063    return NULL;
6064}
6065
6066/* --- Latin-1 Codec ------------------------------------------------------ */
6067
6068PyObject *
6069PyUnicode_DecodeLatin1(const char *s,
6070                       Py_ssize_t size,
6071                       const char *errors)
6072{
6073    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6074    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6075}
6076
6077/* create or adjust a UnicodeEncodeError */
6078static void
6079make_encode_exception(PyObject **exceptionObject,
6080                      const char *encoding,
6081                      PyObject *unicode,
6082                      Py_ssize_t startpos, Py_ssize_t endpos,
6083                      const char *reason)
6084{
6085    if (*exceptionObject == NULL) {
6086        *exceptionObject = PyObject_CallFunction(
6087            PyExc_UnicodeEncodeError, "sOnns",
6088            encoding, unicode, startpos, endpos, reason);
6089    }
6090    else {
6091        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6092            goto onError;
6093        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6094            goto onError;
6095        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6096            goto onError;
6097        return;
6098      onError:
6099        Py_DECREF(*exceptionObject);
6100        *exceptionObject = NULL;
6101    }
6102}
6103
6104/* raises a UnicodeEncodeError */
6105static void
6106raise_encode_exception(PyObject **exceptionObject,
6107                       const char *encoding,
6108                       PyObject *unicode,
6109                       Py_ssize_t startpos, Py_ssize_t endpos,
6110                       const char *reason)
6111{
6112    make_encode_exception(exceptionObject,
6113                          encoding, unicode, startpos, endpos, reason);
6114    if (*exceptionObject != NULL)
6115        PyCodec_StrictErrors(*exceptionObject);
6116}
6117
6118/* error handling callback helper:
6119   build arguments, call the callback and check the arguments,
6120   put the result into newpos and return the replacement string, which
6121   has to be freed by the caller */
6122static PyObject *
6123unicode_encode_call_errorhandler(const char *errors,
6124                                 PyObject **errorHandler,
6125                                 const char *encoding, const char *reason,
6126                                 PyObject *unicode, PyObject **exceptionObject,
6127                                 Py_ssize_t startpos, Py_ssize_t endpos,
6128                                 Py_ssize_t *newpos)
6129{
6130    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6131    Py_ssize_t len;
6132    PyObject *restuple;
6133    PyObject *resunicode;
6134
6135    if (*errorHandler == NULL) {
6136        *errorHandler = PyCodec_LookupError(errors);
6137        if (*errorHandler == NULL)
6138            return NULL;
6139    }
6140
6141    if (PyUnicode_READY(unicode) == -1)
6142        return NULL;
6143    len = PyUnicode_GET_LENGTH(unicode);
6144
6145    make_encode_exception(exceptionObject,
6146                          encoding, unicode, startpos, endpos, reason);
6147    if (*exceptionObject == NULL)
6148        return NULL;
6149
6150    restuple = PyObject_CallFunctionObjArgs(
6151        *errorHandler, *exceptionObject, NULL);
6152    if (restuple == NULL)
6153        return NULL;
6154    if (!PyTuple_Check(restuple)) {
6155        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6156        Py_DECREF(restuple);
6157        return NULL;
6158    }
6159    if (!PyArg_ParseTuple(restuple, argparse,
6160                          &resunicode, newpos)) {
6161        Py_DECREF(restuple);
6162        return NULL;
6163    }
6164    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6165        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6166        Py_DECREF(restuple);
6167        return NULL;
6168    }
6169    if (*newpos<0)
6170        *newpos = len + *newpos;
6171    if (*newpos<0 || *newpos>len) {
6172        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6173        Py_DECREF(restuple);
6174        return NULL;
6175    }
6176    Py_INCREF(resunicode);
6177    Py_DECREF(restuple);
6178    return resunicode;
6179}
6180
6181static PyObject *
6182unicode_encode_ucs1(PyObject *unicode,
6183                    const char *errors,
6184                    unsigned int limit)
6185{
6186    /* input state */
6187    Py_ssize_t pos=0, size;
6188    int kind;
6189    void *data;
6190    /* output object */
6191    PyObject *res;
6192    /* pointer into the output */
6193    char *str;
6194    /* current output position */
6195    Py_ssize_t ressize;
6196    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6197    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6198    PyObject *errorHandler = NULL;
6199    PyObject *exc = NULL;
6200    /* the following variable is used for caching string comparisons
6201     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6202    int known_errorHandler = -1;
6203
6204    if (PyUnicode_READY(unicode) == -1)
6205        return NULL;
6206    size = PyUnicode_GET_LENGTH(unicode);
6207    kind = PyUnicode_KIND(unicode);
6208    data = PyUnicode_DATA(unicode);
6209    /* allocate enough for a simple encoding without
6210       replacements, if we need more, we'll resize */
6211    if (size == 0)
6212        return PyBytes_FromStringAndSize(NULL, 0);
6213    res = PyBytes_FromStringAndSize(NULL, size);
6214    if (res == NULL)
6215        return NULL;
6216    str = PyBytes_AS_STRING(res);
6217    ressize = size;
6218
6219    while (pos < size) {
6220        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6221
6222        /* can we encode this? */
6223        if (c<limit) {
6224            /* no overflow check, because we know that the space is enough */
6225            *str++ = (char)c;
6226            ++pos;
6227        }
6228        else {
6229            Py_ssize_t requiredsize;
6230            PyObject *repunicode;
6231            Py_ssize_t repsize, newpos, respos, i;
6232            /* startpos for collecting unencodable chars */
6233            Py_ssize_t collstart = pos;
6234            Py_ssize_t collend = pos;
6235            /* find all unecodable characters */
6236            while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6237                ++collend;
6238            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6239            if (known_errorHandler==-1) {
6240                if ((errors==NULL) || (!strcmp(errors, "strict")))
6241                    known_errorHandler = 1;
6242                else if (!strcmp(errors, "replace"))
6243                    known_errorHandler = 2;
6244                else if (!strcmp(errors, "ignore"))
6245                    known_errorHandler = 3;
6246                else if (!strcmp(errors, "xmlcharrefreplace"))
6247                    known_errorHandler = 4;
6248                else
6249                    known_errorHandler = 0;
6250            }
6251            switch (known_errorHandler) {
6252            case 1: /* strict */
6253                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6254                goto onError;
6255            case 2: /* replace */
6256                while (collstart++<collend)
6257                    *str++ = '?'; /* fall through */
6258            case 3: /* ignore */
6259                pos = collend;
6260                break;
6261            case 4: /* xmlcharrefreplace */
6262                respos = str - PyBytes_AS_STRING(res);
6263                /* determine replacement size */
6264                for (i = collstart, repsize = 0; i < collend; ++i) {
6265                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6266                    if (ch < 10)
6267                        repsize += 2+1+1;
6268                    else if (ch < 100)
6269                        repsize += 2+2+1;
6270                    else if (ch < 1000)
6271                        repsize += 2+3+1;
6272                    else if (ch < 10000)
6273                        repsize += 2+4+1;
6274                    else if (ch < 100000)
6275                        repsize += 2+5+1;
6276                    else if (ch < 1000000)
6277                        repsize += 2+6+1;
6278                    else {
6279                        assert(ch <= MAX_UNICODE);
6280                        repsize += 2+7+1;
6281                    }
6282                }
6283                requiredsize = respos+repsize+(size-collend);
6284                if (requiredsize > ressize) {
6285                    if (requiredsize<2*ressize)
6286                        requiredsize = 2*ressize;
6287                    if (_PyBytes_Resize(&res, requiredsize))
6288                        goto onError;
6289                    str = PyBytes_AS_STRING(res) + respos;
6290                    ressize = requiredsize;
6291                }
6292                /* generate replacement */
6293                for (i = collstart; i < collend; ++i) {
6294                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6295                }
6296                pos = collend;
6297                break;
6298            default:
6299                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6300                                                              encoding, reason, unicode, &exc,
6301                                                              collstart, collend, &newpos);
6302                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6303                                           PyUnicode_READY(repunicode) == -1))
6304                    goto onError;
6305                if (PyBytes_Check(repunicode)) {
6306                    /* Directly copy bytes result to output. */
6307                    repsize = PyBytes_Size(repunicode);
6308                    if (repsize > 1) {
6309                        /* Make room for all additional bytes. */
6310                        respos = str - PyBytes_AS_STRING(res);
6311                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6312                            Py_DECREF(repunicode);
6313                            goto onError;
6314                        }
6315                        str = PyBytes_AS_STRING(res) + respos;
6316                        ressize += repsize-1;
6317                    }
6318                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6319                    str += repsize;
6320                    pos = newpos;
6321                    Py_DECREF(repunicode);
6322                    break;
6323                }
6324                /* need more space? (at least enough for what we
6325                   have+the replacement+the rest of the string, so
6326                   we won't have to check space for encodable characters) */
6327                respos = str - PyBytes_AS_STRING(res);
6328                repsize = PyUnicode_GET_LENGTH(repunicode);
6329                requiredsize = respos+repsize+(size-collend);
6330                if (requiredsize > ressize) {
6331                    if (requiredsize<2*ressize)
6332                        requiredsize = 2*ressize;
6333                    if (_PyBytes_Resize(&res, requiredsize)) {
6334                        Py_DECREF(repunicode);
6335                        goto onError;
6336                    }
6337                    str = PyBytes_AS_STRING(res) + respos;
6338                    ressize = requiredsize;
6339                }
6340                /* check if there is anything unencodable in the replacement
6341                   and copy it to the output */
6342                for (i = 0; repsize-->0; ++i, ++str) {
6343                    c = PyUnicode_READ_CHAR(repunicode, i);
6344                    if (c >= limit) {
6345                        raise_encode_exception(&exc, encoding, unicode,
6346                                               pos, pos+1, reason);
6347                        Py_DECREF(repunicode);
6348                        goto onError;
6349                    }
6350                    *str = (char)c;
6351                }
6352                pos = newpos;
6353                Py_DECREF(repunicode);
6354            }
6355        }
6356    }
6357    /* Resize if we allocated to much */
6358    size = str - PyBytes_AS_STRING(res);
6359    if (size < ressize) { /* If this falls res will be NULL */
6360        assert(size >= 0);
6361        if (_PyBytes_Resize(&res, size) < 0)
6362            goto onError;
6363    }
6364
6365    Py_XDECREF(errorHandler);
6366    Py_XDECREF(exc);
6367    return res;
6368
6369  onError:
6370    Py_XDECREF(res);
6371    Py_XDECREF(errorHandler);
6372    Py_XDECREF(exc);
6373    return NULL;
6374}
6375
6376/* Deprecated */
6377PyObject *
6378PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6379                       Py_ssize_t size,
6380                       const char *errors)
6381{
6382    PyObject *result;
6383    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6384    if (unicode == NULL)
6385        return NULL;
6386    result = unicode_encode_ucs1(unicode, errors, 256);
6387    Py_DECREF(unicode);
6388    return result;
6389}
6390
6391PyObject *
6392_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6393{
6394    if (!PyUnicode_Check(unicode)) {
6395        PyErr_BadArgument();
6396        return NULL;
6397    }
6398    if (PyUnicode_READY(unicode) == -1)
6399        return NULL;
6400    /* Fast path: if it is a one-byte string, construct
6401       bytes object directly. */
6402    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6403        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6404                                         PyUnicode_GET_LENGTH(unicode));
6405    /* Non-Latin-1 characters present. Defer to above function to
6406       raise the exception. */
6407    return unicode_encode_ucs1(unicode, errors, 256);
6408}
6409
6410PyObject*
6411PyUnicode_AsLatin1String(PyObject *unicode)
6412{
6413    return _PyUnicode_AsLatin1String(unicode, NULL);
6414}
6415
6416/* --- 7-bit ASCII Codec -------------------------------------------------- */
6417
6418PyObject *
6419PyUnicode_DecodeASCII(const char *s,
6420                      Py_ssize_t size,
6421                      const char *errors)
6422{
6423    const char *starts = s;
6424    _PyUnicodeWriter writer;
6425    int kind;
6426    void *data;
6427    Py_ssize_t startinpos;
6428    Py_ssize_t endinpos;
6429    Py_ssize_t outpos;
6430    const char *e;
6431    PyObject *errorHandler = NULL;
6432    PyObject *exc = NULL;
6433
6434    if (size == 0) {
6435        Py_INCREF(unicode_empty);
6436        return unicode_empty;
6437    }
6438
6439    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6440    if (size == 1 && (unsigned char)s[0] < 128)
6441        return get_latin1_char((unsigned char)s[0]);
6442
6443    _PyUnicodeWriter_Init(&writer, 0);
6444    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
6445        goto onError;
6446
6447    e = s + size;
6448    data = writer.data;
6449    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6450    writer.pos = outpos;
6451    if (writer.pos == size)
6452        return _PyUnicodeWriter_Finish(&writer);
6453
6454    s += writer.pos;
6455    kind = writer.kind;
6456    while (s < e) {
6457        register unsigned char c = (unsigned char)*s;
6458        if (c < 128) {
6459            PyUnicode_WRITE(kind, data, writer.pos, c);
6460            writer.pos++;
6461            ++s;
6462        }
6463        else {
6464            startinpos = s-starts;
6465            endinpos = startinpos + 1;
6466            if (unicode_decode_call_errorhandler_writer(
6467                    errors, &errorHandler,
6468                    "ascii", "ordinal not in range(128)",
6469                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6470                    &writer))
6471                goto onError;
6472            kind = writer.kind;
6473            data = writer.data;
6474        }
6475    }
6476    Py_XDECREF(errorHandler);
6477    Py_XDECREF(exc);
6478    return _PyUnicodeWriter_Finish(&writer);
6479
6480  onError:
6481    _PyUnicodeWriter_Dealloc(&writer);
6482    Py_XDECREF(errorHandler);
6483    Py_XDECREF(exc);
6484    return NULL;
6485}
6486
6487/* Deprecated */
6488PyObject *
6489PyUnicode_EncodeASCII(const Py_UNICODE *p,
6490                      Py_ssize_t size,
6491                      const char *errors)
6492{
6493    PyObject *result;
6494    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6495    if (unicode == NULL)
6496        return NULL;
6497    result = unicode_encode_ucs1(unicode, errors, 128);
6498    Py_DECREF(unicode);
6499    return result;
6500}
6501
6502PyObject *
6503_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6504{
6505    if (!PyUnicode_Check(unicode)) {
6506        PyErr_BadArgument();
6507        return NULL;
6508    }
6509    if (PyUnicode_READY(unicode) == -1)
6510        return NULL;
6511    /* Fast path: if it is an ASCII-only string, construct bytes object
6512       directly. Else defer to above function to raise the exception. */
6513    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6514        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6515                                         PyUnicode_GET_LENGTH(unicode));
6516    return unicode_encode_ucs1(unicode, errors, 128);
6517}
6518
6519PyObject *
6520PyUnicode_AsASCIIString(PyObject *unicode)
6521{
6522    return _PyUnicode_AsASCIIString(unicode, NULL);
6523}
6524
6525#ifdef HAVE_MBCS
6526
6527/* --- MBCS codecs for Windows -------------------------------------------- */
6528
6529#if SIZEOF_INT < SIZEOF_SIZE_T
6530#define NEED_RETRY
6531#endif
6532
6533#ifndef WC_ERR_INVALID_CHARS
6534#  define WC_ERR_INVALID_CHARS 0x0080
6535#endif
6536
6537static char*
6538code_page_name(UINT code_page, PyObject **obj)
6539{
6540    *obj = NULL;
6541    if (code_page == CP_ACP)
6542        return "mbcs";
6543    if (code_page == CP_UTF7)
6544        return "CP_UTF7";
6545    if (code_page == CP_UTF8)
6546        return "CP_UTF8";
6547
6548    *obj = PyBytes_FromFormat("cp%u", code_page);
6549    if (*obj == NULL)
6550        return NULL;
6551    return PyBytes_AS_STRING(*obj);
6552}
6553
6554static int
6555is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6556{
6557    const char *curr = s + offset;
6558    const char *prev;
6559
6560    if (!IsDBCSLeadByteEx(code_page, *curr))
6561        return 0;
6562
6563    prev = CharPrevExA(code_page, s, curr, 0);
6564    if (prev == curr)
6565        return 1;
6566    /* FIXME: This code is limited to "true" double-byte encodings,
6567       as it assumes an incomplete character consists of a single
6568       byte. */
6569    if (curr - prev == 2)
6570        return 1;
6571    if (!IsDBCSLeadByteEx(code_page, *prev))
6572        return 1;
6573    return 0;
6574}
6575
6576static DWORD
6577decode_code_page_flags(UINT code_page)
6578{
6579    if (code_page == CP_UTF7) {
6580        /* The CP_UTF7 decoder only supports flags=0 */
6581        return 0;
6582    }
6583    else
6584        return MB_ERR_INVALID_CHARS;
6585}
6586
6587/*
6588 * Decode a byte string from a Windows code page into unicode object in strict
6589 * mode.
6590 *
6591 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6592 * WindowsError and returns -1 on other error.
6593 */
6594static int
6595decode_code_page_strict(UINT code_page,
6596                        PyObject **v,
6597                        const char *in,
6598                        int insize)
6599{
6600    const DWORD flags = decode_code_page_flags(code_page);
6601    wchar_t *out;
6602    DWORD outsize;
6603
6604    /* First get the size of the result */
6605    assert(insize > 0);
6606    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6607    if (outsize <= 0)
6608        goto error;
6609
6610    if (*v == NULL) {
6611        /* Create unicode object */
6612        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6613        *v = (PyObject*)_PyUnicode_New(outsize);
6614        if (*v == NULL)
6615            return -1;
6616        out = PyUnicode_AS_UNICODE(*v);
6617    }
6618    else {
6619        /* Extend unicode object */
6620        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6621        if (unicode_resize(v, n + outsize) < 0)
6622            return -1;
6623        out = PyUnicode_AS_UNICODE(*v) + n;
6624    }
6625
6626    /* Do the conversion */
6627    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6628    if (outsize <= 0)
6629        goto error;
6630    return insize;
6631
6632error:
6633    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6634        return -2;
6635    PyErr_SetFromWindowsErr(0);
6636    return -1;
6637}
6638
6639/*
6640 * Decode a byte string from a code page into unicode object with an error
6641 * handler.
6642 *
6643 * Returns consumed size if succeed, or raise a WindowsError or
6644 * UnicodeDecodeError exception and returns -1 on error.
6645 */
6646static int
6647decode_code_page_errors(UINT code_page,
6648                        PyObject **v,
6649                        const char *in, const int size,
6650                        const char *errors)
6651{
6652    const char *startin = in;
6653    const char *endin = in + size;
6654    const DWORD flags = decode_code_page_flags(code_page);
6655    /* Ideally, we should get reason from FormatMessage. This is the Windows
6656       2000 English version of the message. */
6657    const char *reason = "No mapping for the Unicode character exists "
6658                         "in the target code page.";
6659    /* each step cannot decode more than 1 character, but a character can be
6660       represented as a surrogate pair */
6661    wchar_t buffer[2], *startout, *out;
6662    int insize, outsize;
6663    PyObject *errorHandler = NULL;
6664    PyObject *exc = NULL;
6665    PyObject *encoding_obj = NULL;
6666    char *encoding;
6667    DWORD err;
6668    int ret = -1;
6669
6670    assert(size > 0);
6671
6672    encoding = code_page_name(code_page, &encoding_obj);
6673    if (encoding == NULL)
6674        return -1;
6675
6676    if (errors == NULL || strcmp(errors, "strict") == 0) {
6677        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6678           UnicodeDecodeError. */
6679        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6680        if (exc != NULL) {
6681            PyCodec_StrictErrors(exc);
6682            Py_CLEAR(exc);
6683        }
6684        goto error;
6685    }
6686
6687    if (*v == NULL) {
6688        /* Create unicode object */
6689        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6690            PyErr_NoMemory();
6691            goto error;
6692        }
6693        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6694        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6695        if (*v == NULL)
6696            goto error;
6697        startout = PyUnicode_AS_UNICODE(*v);
6698    }
6699    else {
6700        /* Extend unicode object */
6701        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6702        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6703            PyErr_NoMemory();
6704            goto error;
6705        }
6706        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6707            goto error;
6708        startout = PyUnicode_AS_UNICODE(*v) + n;
6709    }
6710
6711    /* Decode the byte string character per character */
6712    out = startout;
6713    while (in < endin)
6714    {
6715        /* Decode a character */
6716        insize = 1;
6717        do
6718        {
6719            outsize = MultiByteToWideChar(code_page, flags,
6720                                          in, insize,
6721                                          buffer, Py_ARRAY_LENGTH(buffer));
6722            if (outsize > 0)
6723                break;
6724            err = GetLastError();
6725            if (err != ERROR_NO_UNICODE_TRANSLATION
6726                && err != ERROR_INSUFFICIENT_BUFFER)
6727            {
6728                PyErr_SetFromWindowsErr(0);
6729                goto error;
6730            }
6731            insize++;
6732        }
6733        /* 4=maximum length of a UTF-8 sequence */
6734        while (insize <= 4 && (in + insize) <= endin);
6735
6736        if (outsize <= 0) {
6737            Py_ssize_t startinpos, endinpos, outpos;
6738
6739            startinpos = in - startin;
6740            endinpos = startinpos + 1;
6741            outpos = out - PyUnicode_AS_UNICODE(*v);
6742            if (unicode_decode_call_errorhandler_wchar(
6743                    errors, &errorHandler,
6744                    encoding, reason,
6745                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
6746                    v, &outpos))
6747            {
6748                goto error;
6749            }
6750            out = PyUnicode_AS_UNICODE(*v) + outpos;
6751        }
6752        else {
6753            in += insize;
6754            memcpy(out, buffer, outsize * sizeof(wchar_t));
6755            out += outsize;
6756        }
6757    }
6758
6759    /* write a NUL character at the end */
6760    *out = 0;
6761
6762    /* Extend unicode object */
6763    outsize = out - startout;
6764    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
6765    if (unicode_resize(v, outsize) < 0)
6766        goto error;
6767    ret = size;
6768
6769error:
6770    Py_XDECREF(encoding_obj);
6771    Py_XDECREF(errorHandler);
6772    Py_XDECREF(exc);
6773    return ret;
6774}
6775
6776static PyObject *
6777decode_code_page_stateful(int code_page,
6778                          const char *s, Py_ssize_t size,
6779                          const char *errors, Py_ssize_t *consumed)
6780{
6781    PyObject *v = NULL;
6782    int chunk_size, final, converted, done;
6783
6784    if (code_page < 0) {
6785        PyErr_SetString(PyExc_ValueError, "invalid code page number");
6786        return NULL;
6787    }
6788
6789    if (consumed)
6790        *consumed = 0;
6791
6792    do
6793    {
6794#ifdef NEED_RETRY
6795        if (size > INT_MAX) {
6796            chunk_size = INT_MAX;
6797            final = 0;
6798            done = 0;
6799        }
6800        else
6801#endif
6802        {
6803            chunk_size = (int)size;
6804            final = (consumed == NULL);
6805            done = 1;
6806        }
6807
6808        /* Skip trailing lead-byte unless 'final' is set */
6809        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6810            --chunk_size;
6811
6812        if (chunk_size == 0 && done) {
6813            if (v != NULL)
6814                break;
6815            Py_INCREF(unicode_empty);
6816            return unicode_empty;
6817        }
6818
6819
6820        converted = decode_code_page_strict(code_page, &v,
6821                                            s, chunk_size);
6822        if (converted == -2)
6823            converted = decode_code_page_errors(code_page, &v,
6824                                                s, chunk_size,
6825                                                errors);
6826        assert(converted != 0);
6827
6828        if (converted < 0) {
6829            Py_XDECREF(v);
6830            return NULL;
6831        }
6832
6833        if (consumed)
6834            *consumed += converted;
6835
6836        s += converted;
6837        size -= converted;
6838    } while (!done);
6839
6840    return unicode_result(v);
6841}
6842
6843PyObject *
6844PyUnicode_DecodeCodePageStateful(int code_page,
6845                                 const char *s,
6846                                 Py_ssize_t size,
6847                                 const char *errors,
6848                                 Py_ssize_t *consumed)
6849{
6850    return decode_code_page_stateful(code_page, s, size, errors, consumed);
6851}
6852
6853PyObject *
6854PyUnicode_DecodeMBCSStateful(const char *s,
6855                             Py_ssize_t size,
6856                             const char *errors,
6857                             Py_ssize_t *consumed)
6858{
6859    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6860}
6861
6862PyObject *
6863PyUnicode_DecodeMBCS(const char *s,
6864                     Py_ssize_t size,
6865                     const char *errors)
6866{
6867    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6868}
6869
6870static DWORD
6871encode_code_page_flags(UINT code_page, const char *errors)
6872{
6873    if (code_page == CP_UTF8) {
6874        if (winver.dwMajorVersion >= 6)
6875            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6876               and later */
6877            return WC_ERR_INVALID_CHARS;
6878        else
6879            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6880            return 0;
6881    }
6882    else if (code_page == CP_UTF7) {
6883        /* CP_UTF7 only supports flags=0 */
6884        return 0;
6885    }
6886    else {
6887        if (errors != NULL && strcmp(errors, "replace") == 0)
6888            return 0;
6889        else
6890            return WC_NO_BEST_FIT_CHARS;
6891    }
6892}
6893
6894/*
6895 * Encode a Unicode string to a Windows code page into a byte string in strict
6896 * mode.
6897 *
6898 * Returns consumed characters if succeed, returns -2 on encode error, or raise
6899 * a WindowsError and returns -1 on other error.
6900 */
6901static int
6902encode_code_page_strict(UINT code_page, PyObject **outbytes,
6903                        PyObject *unicode, Py_ssize_t offset, int len,
6904                        const char* errors)
6905{
6906    BOOL usedDefaultChar = FALSE;
6907    BOOL *pusedDefaultChar = &usedDefaultChar;
6908    int outsize;
6909    PyObject *exc = NULL;
6910    wchar_t *p;
6911    Py_ssize_t size;
6912    const DWORD flags = encode_code_page_flags(code_page, NULL);
6913    char *out;
6914    /* Create a substring so that we can get the UTF-16 representation
6915       of just the slice under consideration. */
6916    PyObject *substring;
6917
6918    assert(len > 0);
6919
6920    if (code_page != CP_UTF8 && code_page != CP_UTF7)
6921        pusedDefaultChar = &usedDefaultChar;
6922    else
6923        pusedDefaultChar = NULL;
6924
6925    substring = PyUnicode_Substring(unicode, offset, offset+len);
6926    if (substring == NULL)
6927        return -1;
6928    p = PyUnicode_AsUnicodeAndSize(substring, &size);
6929    if (p == NULL) {
6930        Py_DECREF(substring);
6931        return -1;
6932    }
6933
6934    /* First get the size of the result */
6935    outsize = WideCharToMultiByte(code_page, flags,
6936                                  p, size,
6937                                  NULL, 0,
6938                                  NULL, pusedDefaultChar);
6939    if (outsize <= 0)
6940        goto error;
6941    /* If we used a default char, then we failed! */
6942    if (pusedDefaultChar && *pusedDefaultChar) {
6943        Py_DECREF(substring);
6944        return -2;
6945    }
6946
6947    if (*outbytes == NULL) {
6948        /* Create string object */
6949        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
6950        if (*outbytes == NULL) {
6951            Py_DECREF(substring);
6952            return -1;
6953        }
6954        out = PyBytes_AS_STRING(*outbytes);
6955    }
6956    else {
6957        /* Extend string object */
6958        const Py_ssize_t n = PyBytes_Size(*outbytes);
6959        if (outsize > PY_SSIZE_T_MAX - n) {
6960            PyErr_NoMemory();
6961            Py_DECREF(substring);
6962            return -1;
6963        }
6964        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6965            Py_DECREF(substring);
6966            return -1;
6967        }
6968        out = PyBytes_AS_STRING(*outbytes) + n;
6969    }
6970
6971    /* Do the conversion */
6972    outsize = WideCharToMultiByte(code_page, flags,
6973                                  p, size,
6974                                  out, outsize,
6975                                  NULL, pusedDefaultChar);
6976    Py_CLEAR(substring);
6977    if (outsize <= 0)
6978        goto error;
6979    if (pusedDefaultChar && *pusedDefaultChar)
6980        return -2;
6981    return 0;
6982
6983error:
6984    Py_XDECREF(substring);
6985    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6986        return -2;
6987    PyErr_SetFromWindowsErr(0);
6988    return -1;
6989}
6990
6991/*
6992 * Encode a Unicode string to a Windows code page into a byte string using a
6993 * error handler.
6994 *
6995 * Returns consumed characters if succeed, or raise a WindowsError and returns
6996 * -1 on other error.
6997 */
6998static int
6999encode_code_page_errors(UINT code_page, PyObject **outbytes,
7000                        PyObject *unicode, Py_ssize_t unicode_offset,
7001                        Py_ssize_t insize, const char* errors)
7002{
7003    const DWORD flags = encode_code_page_flags(code_page, errors);
7004    Py_ssize_t pos = unicode_offset;
7005    Py_ssize_t endin = unicode_offset + insize;
7006    /* Ideally, we should get reason from FormatMessage. This is the Windows
7007       2000 English version of the message. */
7008    const char *reason = "invalid character";
7009    /* 4=maximum length of a UTF-8 sequence */
7010    char buffer[4];
7011    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7012    Py_ssize_t outsize;
7013    char *out;
7014    PyObject *errorHandler = NULL;
7015    PyObject *exc = NULL;
7016    PyObject *encoding_obj = NULL;
7017    char *encoding;
7018    Py_ssize_t newpos, newoutsize;
7019    PyObject *rep;
7020    int ret = -1;
7021
7022    assert(insize > 0);
7023
7024    encoding = code_page_name(code_page, &encoding_obj);
7025    if (encoding == NULL)
7026        return -1;
7027
7028    if (errors == NULL || strcmp(errors, "strict") == 0) {
7029        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7030           then we raise a UnicodeEncodeError. */
7031        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7032        if (exc != NULL) {
7033            PyCodec_StrictErrors(exc);
7034            Py_DECREF(exc);
7035        }
7036        Py_XDECREF(encoding_obj);
7037        return -1;
7038    }
7039
7040    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7041        pusedDefaultChar = &usedDefaultChar;
7042    else
7043        pusedDefaultChar = NULL;
7044
7045    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7046        PyErr_NoMemory();
7047        goto error;
7048    }
7049    outsize = insize * Py_ARRAY_LENGTH(buffer);
7050
7051    if (*outbytes == NULL) {
7052        /* Create string object */
7053        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7054        if (*outbytes == NULL)
7055            goto error;
7056        out = PyBytes_AS_STRING(*outbytes);
7057    }
7058    else {
7059        /* Extend string object */
7060        Py_ssize_t n = PyBytes_Size(*outbytes);
7061        if (n > PY_SSIZE_T_MAX - outsize) {
7062            PyErr_NoMemory();
7063            goto error;
7064        }
7065        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7066            goto error;
7067        out = PyBytes_AS_STRING(*outbytes) + n;
7068    }
7069
7070    /* Encode the string character per character */
7071    while (pos < endin)
7072    {
7073        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7074        wchar_t chars[2];
7075        int charsize;
7076        if (ch < 0x10000) {
7077            chars[0] = (wchar_t)ch;
7078            charsize = 1;
7079        }
7080        else {
7081            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7082            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7083            charsize = 2;
7084        }
7085
7086        outsize = WideCharToMultiByte(code_page, flags,
7087                                      chars, charsize,
7088                                      buffer, Py_ARRAY_LENGTH(buffer),
7089                                      NULL, pusedDefaultChar);
7090        if (outsize > 0) {
7091            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7092            {
7093                pos++;
7094                memcpy(out, buffer, outsize);
7095                out += outsize;
7096                continue;
7097            }
7098        }
7099        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7100            PyErr_SetFromWindowsErr(0);
7101            goto error;
7102        }
7103
7104        rep = unicode_encode_call_errorhandler(
7105                  errors, &errorHandler, encoding, reason,
7106                  unicode, &exc,
7107                  pos, pos + 1, &newpos);
7108        if (rep == NULL)
7109            goto error;
7110        pos = newpos;
7111
7112        if (PyBytes_Check(rep)) {
7113            outsize = PyBytes_GET_SIZE(rep);
7114            if (outsize != 1) {
7115                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7116                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7117                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7118                    Py_DECREF(rep);
7119                    goto error;
7120                }
7121                out = PyBytes_AS_STRING(*outbytes) + offset;
7122            }
7123            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7124            out += outsize;
7125        }
7126        else {
7127            Py_ssize_t i;
7128            enum PyUnicode_Kind kind;
7129            void *data;
7130
7131            if (PyUnicode_READY(rep) == -1) {
7132                Py_DECREF(rep);
7133                goto error;
7134            }
7135
7136            outsize = PyUnicode_GET_LENGTH(rep);
7137            if (outsize != 1) {
7138                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7139                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7140                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7141                    Py_DECREF(rep);
7142                    goto error;
7143                }
7144                out = PyBytes_AS_STRING(*outbytes) + offset;
7145            }
7146            kind = PyUnicode_KIND(rep);
7147            data = PyUnicode_DATA(rep);
7148            for (i=0; i < outsize; i++) {
7149                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7150                if (ch > 127) {
7151                    raise_encode_exception(&exc,
7152                        encoding, unicode,
7153                        pos, pos + 1,
7154                        "unable to encode error handler result to ASCII");
7155                    Py_DECREF(rep);
7156                    goto error;
7157                }
7158                *out = (unsigned char)ch;
7159                out++;
7160            }
7161        }
7162        Py_DECREF(rep);
7163    }
7164    /* write a NUL byte */
7165    *out = 0;
7166    outsize = out - PyBytes_AS_STRING(*outbytes);
7167    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7168    if (_PyBytes_Resize(outbytes, outsize) < 0)
7169        goto error;
7170    ret = 0;
7171
7172error:
7173    Py_XDECREF(encoding_obj);
7174    Py_XDECREF(errorHandler);
7175    Py_XDECREF(exc);
7176    return ret;
7177}
7178
7179static PyObject *
7180encode_code_page(int code_page,
7181                 PyObject *unicode,
7182                 const char *errors)
7183{
7184    Py_ssize_t len;
7185    PyObject *outbytes = NULL;
7186    Py_ssize_t offset;
7187    int chunk_len, ret, done;
7188
7189    if (PyUnicode_READY(unicode) == -1)
7190        return NULL;
7191    len = PyUnicode_GET_LENGTH(unicode);
7192
7193    if (code_page < 0) {
7194        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7195        return NULL;
7196    }
7197
7198    if (len == 0)
7199        return PyBytes_FromStringAndSize(NULL, 0);
7200
7201    offset = 0;
7202    do
7203    {
7204#ifdef NEED_RETRY
7205        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7206           chunks. */
7207        if (len > INT_MAX/2) {
7208            chunk_len = INT_MAX/2;
7209            done = 0;
7210        }
7211        else
7212#endif
7213        {
7214            chunk_len = (int)len;
7215            done = 1;
7216        }
7217
7218        ret = encode_code_page_strict(code_page, &outbytes,
7219                                      unicode, offset, chunk_len,
7220                                      errors);
7221        if (ret == -2)
7222            ret = encode_code_page_errors(code_page, &outbytes,
7223                                          unicode, offset,
7224                                          chunk_len, errors);
7225        if (ret < 0) {
7226            Py_XDECREF(outbytes);
7227            return NULL;
7228        }
7229
7230        offset += chunk_len;
7231        len -= chunk_len;
7232    } while (!done);
7233
7234    return outbytes;
7235}
7236
7237PyObject *
7238PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7239                     Py_ssize_t size,
7240                     const char *errors)
7241{
7242    PyObject *unicode, *res;
7243    unicode = PyUnicode_FromUnicode(p, size);
7244    if (unicode == NULL)
7245        return NULL;
7246    res = encode_code_page(CP_ACP, unicode, errors);
7247    Py_DECREF(unicode);
7248    return res;
7249}
7250
7251PyObject *
7252PyUnicode_EncodeCodePage(int code_page,
7253                         PyObject *unicode,
7254                         const char *errors)
7255{
7256    return encode_code_page(code_page, unicode, errors);
7257}
7258
7259PyObject *
7260PyUnicode_AsMBCSString(PyObject *unicode)
7261{
7262    if (!PyUnicode_Check(unicode)) {
7263        PyErr_BadArgument();
7264        return NULL;
7265    }
7266    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7267}
7268
7269#undef NEED_RETRY
7270
7271#endif /* HAVE_MBCS */
7272
7273/* --- Character Mapping Codec -------------------------------------------- */
7274
7275PyObject *
7276PyUnicode_DecodeCharmap(const char *s,
7277                        Py_ssize_t size,
7278                        PyObject *mapping,
7279                        const char *errors)
7280{
7281    const char *starts = s;
7282    Py_ssize_t startinpos;
7283    Py_ssize_t endinpos;
7284    const char *e;
7285    _PyUnicodeWriter writer;
7286    PyObject *errorHandler = NULL;
7287    PyObject *exc = NULL;
7288
7289    /* Default to Latin-1 */
7290    if (mapping == NULL)
7291        return PyUnicode_DecodeLatin1(s, size, errors);
7292
7293    if (size == 0) {
7294        Py_INCREF(unicode_empty);
7295        return unicode_empty;
7296    }
7297    _PyUnicodeWriter_Init(&writer, 0);
7298    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
7299        goto onError;
7300
7301    e = s + size;
7302    if (PyUnicode_CheckExact(mapping)) {
7303        Py_ssize_t maplen;
7304        enum PyUnicode_Kind mapkind;
7305        void *mapdata;
7306        Py_UCS4 x;
7307
7308        if (PyUnicode_READY(mapping) == -1)
7309            return NULL;
7310
7311        maplen = PyUnicode_GET_LENGTH(mapping);
7312        mapdata = PyUnicode_DATA(mapping);
7313        mapkind = PyUnicode_KIND(mapping);
7314        while (s < e) {
7315            unsigned char ch;
7316            if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7317                enum PyUnicode_Kind outkind = writer.kind;
7318                void *outdata = writer.data;
7319                if (outkind == PyUnicode_1BYTE_KIND) {
7320                    Py_UCS4 maxchar = writer.maxchar;
7321                    while (s < e) {
7322                        unsigned char ch = *s;
7323                        x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7324                        if (x > maxchar)
7325                            goto Error;
7326                        PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, writer.pos, x);
7327                        writer.pos++;
7328                        ++s;
7329                    }
7330                    break;
7331                }
7332                else if (outkind == PyUnicode_2BYTE_KIND) {
7333                    while (s < e) {
7334                        unsigned char ch = *s;
7335                        x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7336                        if (x == 0xFFFE)
7337                            goto Error;
7338                        PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, writer.pos, x);
7339                        writer.pos++;
7340                        ++s;
7341                    }
7342                    break;
7343                }
7344            }
7345            ch = *s;
7346
7347            if (ch < maplen)
7348                x = PyUnicode_READ(mapkind, mapdata, ch);
7349            else
7350                x = 0xfffe; /* invalid value */
7351Error:
7352            if (x == 0xfffe)
7353            {
7354                /* undefined mapping */
7355                startinpos = s-starts;
7356                endinpos = startinpos+1;
7357                if (unicode_decode_call_errorhandler_writer(
7358                        errors, &errorHandler,
7359                        "charmap", "character maps to <undefined>",
7360                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7361                        &writer)) {
7362                    goto onError;
7363                }
7364                continue;
7365            }
7366
7367            if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
7368                goto onError;
7369            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
7370            writer.pos++;
7371            ++s;
7372        }
7373    }
7374    else {
7375        while (s < e) {
7376            unsigned char ch = *s;
7377            PyObject *w, *x;
7378
7379            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7380            w = PyLong_FromLong((long)ch);
7381            if (w == NULL)
7382                goto onError;
7383            x = PyObject_GetItem(mapping, w);
7384            Py_DECREF(w);
7385            if (x == NULL) {
7386                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7387                    /* No mapping found means: mapping is undefined. */
7388                    PyErr_Clear();
7389                    x = Py_None;
7390                    Py_INCREF(x);
7391                } else
7392                    goto onError;
7393            }
7394
7395            /* Apply mapping */
7396            if (PyLong_Check(x)) {
7397                long value = PyLong_AS_LONG(x);
7398                if (value < 0 || value > MAX_UNICODE) {
7399                    PyErr_Format(PyExc_TypeError,
7400                                 "character mapping must be in range(0x%lx)",
7401                                 (unsigned long)MAX_UNICODE + 1);
7402                    Py_DECREF(x);
7403                    goto onError;
7404                }
7405
7406                if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1)
7407                    goto onError;
7408                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7409                writer.pos++;
7410            }
7411            else if (x == Py_None) {
7412                /* undefined mapping */
7413                startinpos = s-starts;
7414                endinpos = startinpos+1;
7415                if (unicode_decode_call_errorhandler_writer(
7416                        errors, &errorHandler,
7417                        "charmap", "character maps to <undefined>",
7418                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7419                        &writer)) {
7420                    Py_DECREF(x);
7421                    goto onError;
7422                }
7423                Py_DECREF(x);
7424                continue;
7425            }
7426            else if (PyUnicode_Check(x)) {
7427                writer.overallocate = 1;
7428                if (_PyUnicodeWriter_WriteStr(&writer, x) == -1)
7429                    goto onError;
7430            }
7431            else {
7432                /* wrong return value */
7433                PyErr_SetString(PyExc_TypeError,
7434                                "character mapping must return integer, None or str");
7435                Py_DECREF(x);
7436                goto onError;
7437            }
7438            Py_DECREF(x);
7439            ++s;
7440        }
7441    }
7442    Py_XDECREF(errorHandler);
7443    Py_XDECREF(exc);
7444    return _PyUnicodeWriter_Finish(&writer);
7445
7446  onError:
7447    Py_XDECREF(errorHandler);
7448    Py_XDECREF(exc);
7449    _PyUnicodeWriter_Dealloc(&writer);
7450    return NULL;
7451}
7452
7453/* Charmap encoding: the lookup table */
7454
7455struct encoding_map {
7456    PyObject_HEAD
7457    unsigned char level1[32];
7458    int count2, count3;
7459    unsigned char level23[1];
7460};
7461
7462static PyObject*
7463encoding_map_size(PyObject *obj, PyObject* args)
7464{
7465    struct encoding_map *map = (struct encoding_map*)obj;
7466    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7467                           128*map->count3);
7468}
7469
7470static PyMethodDef encoding_map_methods[] = {
7471    {"size", encoding_map_size, METH_NOARGS,
7472     PyDoc_STR("Return the size (in bytes) of this object") },
7473    { 0 }
7474};
7475
/* tp_dealloc of EncodingMapType: release the memory of the object with
   PyObject_FREE(). */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7481
/* Type object of struct encoding_map instances.  Only tp_dealloc and
   tp_methods (the size() method) are filled in; every other slot is left
   empty. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7525
7526PyObject*
7527PyUnicode_BuildEncodingMap(PyObject* string)
7528{
7529    PyObject *result;
7530    struct encoding_map *mresult;
7531    int i;
7532    int need_dict = 0;
7533    unsigned char level1[32];
7534    unsigned char level2[512];
7535    unsigned char *mlevel1, *mlevel2, *mlevel3;
7536    int count2 = 0, count3 = 0;
7537    int kind;
7538    void *data;
7539    Py_ssize_t length;
7540    Py_UCS4 ch;
7541
7542    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7543        PyErr_BadArgument();
7544        return NULL;
7545    }
7546    kind = PyUnicode_KIND(string);
7547    data = PyUnicode_DATA(string);
7548    length = PyUnicode_GET_LENGTH(string);
7549    length = Py_MIN(length, 256);
7550    memset(level1, 0xFF, sizeof level1);
7551    memset(level2, 0xFF, sizeof level2);
7552
7553    /* If there isn't a one-to-one mapping of NULL to \0,
7554       or if there are non-BMP characters, we need to use
7555       a mapping dictionary. */
7556    if (PyUnicode_READ(kind, data, 0) != 0)
7557        need_dict = 1;
7558    for (i = 1; i < length; i++) {
7559        int l1, l2;
7560        ch = PyUnicode_READ(kind, data, i);
7561        if (ch == 0 || ch > 0xFFFF) {
7562            need_dict = 1;
7563            break;
7564        }
7565        if (ch == 0xFFFE)
7566            /* unmapped character */
7567            continue;
7568        l1 = ch >> 11;
7569        l2 = ch >> 7;
7570        if (level1[l1] == 0xFF)
7571            level1[l1] = count2++;
7572        if (level2[l2] == 0xFF)
7573            level2[l2] = count3++;
7574    }
7575
7576    if (count2 >= 0xFF || count3 >= 0xFF)
7577        need_dict = 1;
7578
7579    if (need_dict) {
7580        PyObject *result = PyDict_New();
7581        PyObject *key, *value;
7582        if (!result)
7583            return NULL;
7584        for (i = 0; i < length; i++) {
7585            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7586            value = PyLong_FromLong(i);
7587            if (!key || !value)
7588                goto failed1;
7589            if (PyDict_SetItem(result, key, value) == -1)
7590                goto failed1;
7591            Py_DECREF(key);
7592            Py_DECREF(value);
7593        }
7594        return result;
7595      failed1:
7596        Py_XDECREF(key);
7597        Py_XDECREF(value);
7598        Py_DECREF(result);
7599        return NULL;
7600    }
7601
7602    /* Create a three-level trie */
7603    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7604                             16*count2 + 128*count3 - 1);
7605    if (!result)
7606        return PyErr_NoMemory();
7607    PyObject_Init(result, &EncodingMapType);
7608    mresult = (struct encoding_map*)result;
7609    mresult->count2 = count2;
7610    mresult->count3 = count3;
7611    mlevel1 = mresult->level1;
7612    mlevel2 = mresult->level23;
7613    mlevel3 = mresult->level23 + 16*count2;
7614    memcpy(mlevel1, level1, 32);
7615    memset(mlevel2, 0xFF, 16*count2);
7616    memset(mlevel3, 0, 128*count3);
7617    count3 = 0;
7618    for (i = 1; i < length; i++) {
7619        int o1, o2, o3, i2, i3;
7620        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7621        if (ch == 0xFFFE)
7622            /* unmapped character */
7623            continue;
7624        o1 = ch>>11;
7625        o2 = (ch>>7) & 0xF;
7626        i2 = 16*mlevel1[o1] + o2;
7627        if (mlevel2[i2] == 0xFF)
7628            mlevel2[i2] = count3++;
7629        o3 = ch & 0x7F;
7630        i3 = 128*mlevel2[i2] + o3;
7631        mlevel3[i3] = i;
7632    }
7633    return result;
7634}
7635
7636static int
7637encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7638{
7639    struct encoding_map *map = (struct encoding_map*)mapping;
7640    int l1 = c>>11;
7641    int l2 = (c>>7) & 0xF;
7642    int l3 = c & 0x7F;
7643    int i;
7644
7645    if (c > 0xFFFF)
7646        return -1;
7647    if (c == 0)
7648        return 0;
7649    /* level 1*/
7650    i = map->level1[l1];
7651    if (i == 0xFF) {
7652        return -1;
7653    }
7654    /* level 2*/
7655    i = map->level23[16*i+l2];
7656    if (i == 0xFF) {
7657        return -1;
7658    }
7659    /* level 3 */
7660    i = map->level23[16*map->count2 + 128*i + l3];
7661    if (i == 0) {
7662        return -1;
7663    }
7664    return i;
7665}
7666
7667/* Lookup the character ch in the mapping. If the character
7668   can't be found, Py_None is returned (or NULL, if another
7669   error occurred). */
7670static PyObject *
7671charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7672{
7673    PyObject *w = PyLong_FromLong((long)c);
7674    PyObject *x;
7675
7676    if (w == NULL)
7677        return NULL;
7678    x = PyObject_GetItem(mapping, w);
7679    Py_DECREF(w);
7680    if (x == NULL) {
7681        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7682            /* No mapping found means: mapping is undefined. */
7683            PyErr_Clear();
7684            x = Py_None;
7685            Py_INCREF(x);
7686            return x;
7687        } else
7688            return NULL;
7689    }
7690    else if (x == Py_None)
7691        return x;
7692    else if (PyLong_Check(x)) {
7693        long value = PyLong_AS_LONG(x);
7694        if (value < 0 || value > 255) {
7695            PyErr_SetString(PyExc_TypeError,
7696                            "character mapping must be in range(256)");
7697            Py_DECREF(x);
7698            return NULL;
7699        }
7700        return x;
7701    }
7702    else if (PyBytes_Check(x))
7703        return x;
7704    else {
7705        /* wrong return value */
7706        PyErr_Format(PyExc_TypeError,
7707                     "character mapping must return integer, bytes or None, not %.400s",
7708                     x->ob_type->tp_name);
7709        Py_DECREF(x);
7710        return NULL;
7711    }
7712}
7713
7714static int
7715charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7716{
7717    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7718    /* exponentially overallocate to minimize reallocations */
7719    if (requiredsize < 2*outsize)
7720        requiredsize = 2*outsize;
7721    if (_PyBytes_Resize(outobj, requiredsize))
7722        return -1;
7723    return 0;
7724}
7725
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
7729/* lookup the character, put the result in the output string and adjust
7730   various state variables. Resize the output bytes object if not enough
7731   space is available. Return a new reference to the object that
7732   was put in the output buffer, or Py_None, if the mapping was undefined
7733   (in which case no character was written) or NULL, if a
7734   reallocation error occurred. The caller must decref the result */
7735static charmapencode_result
7736charmapencode_output(Py_UCS4 c, PyObject *mapping,
7737                     PyObject **outobj, Py_ssize_t *outpos)
7738{
7739    PyObject *rep;
7740    char *outstart;
7741    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7742
7743    if (Py_TYPE(mapping) == &EncodingMapType) {
7744        int res = encoding_map_lookup(c, mapping);
7745        Py_ssize_t requiredsize = *outpos+1;
7746        if (res == -1)
7747            return enc_FAILED;
7748        if (outsize<requiredsize)
7749            if (charmapencode_resize(outobj, outpos, requiredsize))
7750                return enc_EXCEPTION;
7751        outstart = PyBytes_AS_STRING(*outobj);
7752        outstart[(*outpos)++] = (char)res;
7753        return enc_SUCCESS;
7754    }
7755
7756    rep = charmapencode_lookup(c, mapping);
7757    if (rep==NULL)
7758        return enc_EXCEPTION;
7759    else if (rep==Py_None) {
7760        Py_DECREF(rep);
7761        return enc_FAILED;
7762    } else {
7763        if (PyLong_Check(rep)) {
7764            Py_ssize_t requiredsize = *outpos+1;
7765            if (outsize<requiredsize)
7766                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7767                    Py_DECREF(rep);
7768                    return enc_EXCEPTION;
7769                }
7770            outstart = PyBytes_AS_STRING(*outobj);
7771            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7772        }
7773        else {
7774            const char *repchars = PyBytes_AS_STRING(rep);
7775            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7776            Py_ssize_t requiredsize = *outpos+repsize;
7777            if (outsize<requiredsize)
7778                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7779                    Py_DECREF(rep);
7780                    return enc_EXCEPTION;
7781                }
7782            outstart = PyBytes_AS_STRING(*outobj);
7783            memcpy(outstart + *outpos, repchars, repsize);
7784            *outpos += repsize;
7785        }
7786    }
7787    Py_DECREF(rep);
7788    return enc_SUCCESS;
7789}
7790
7791/* handle an error in PyUnicode_EncodeCharmap
7792   Return 0 on success, -1 on error */
7793static int
7794charmap_encoding_error(
7795    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
7796    PyObject **exceptionObject,
7797    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7798    PyObject **res, Py_ssize_t *respos)
7799{
7800    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7801    Py_ssize_t size, repsize;
7802    Py_ssize_t newpos;
7803    enum PyUnicode_Kind kind;
7804    void *data;
7805    Py_ssize_t index;
7806    /* startpos for collecting unencodable chars */
7807    Py_ssize_t collstartpos = *inpos;
7808    Py_ssize_t collendpos = *inpos+1;
7809    Py_ssize_t collpos;
7810    char *encoding = "charmap";
7811    char *reason = "character maps to <undefined>";
7812    charmapencode_result x;
7813    Py_UCS4 ch;
7814    int val;
7815
7816    if (PyUnicode_READY(unicode) == -1)
7817        return -1;
7818    size = PyUnicode_GET_LENGTH(unicode);
7819    /* find all unencodable characters */
7820    while (collendpos < size) {
7821        PyObject *rep;
7822        if (Py_TYPE(mapping) == &EncodingMapType) {
7823            ch = PyUnicode_READ_CHAR(unicode, collendpos);
7824            val = encoding_map_lookup(ch, mapping);
7825            if (val != -1)
7826                break;
7827            ++collendpos;
7828            continue;
7829        }
7830
7831        ch = PyUnicode_READ_CHAR(unicode, collendpos);
7832        rep = charmapencode_lookup(ch, mapping);
7833        if (rep==NULL)
7834            return -1;
7835        else if (rep!=Py_None) {
7836            Py_DECREF(rep);
7837            break;
7838        }
7839        Py_DECREF(rep);
7840        ++collendpos;
7841    }
7842    /* cache callback name lookup
7843     * (if not done yet, i.e. it's the first error) */
7844    if (*known_errorHandler==-1) {
7845        if ((errors==NULL) || (!strcmp(errors, "strict")))
7846            *known_errorHandler = 1;
7847        else if (!strcmp(errors, "replace"))
7848            *known_errorHandler = 2;
7849        else if (!strcmp(errors, "ignore"))
7850            *known_errorHandler = 3;
7851        else if (!strcmp(errors, "xmlcharrefreplace"))
7852            *known_errorHandler = 4;
7853        else
7854            *known_errorHandler = 0;
7855    }
7856    switch (*known_errorHandler) {
7857    case 1: /* strict */
7858        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
7859        return -1;
7860    case 2: /* replace */
7861        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
7862            x = charmapencode_output('?', mapping, res, respos);
7863            if (x==enc_EXCEPTION) {
7864                return -1;
7865            }
7866            else if (x==enc_FAILED) {
7867                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
7868                return -1;
7869            }
7870        }
7871        /* fall through */
7872    case 3: /* ignore */
7873        *inpos = collendpos;
7874        break;
7875    case 4: /* xmlcharrefreplace */
7876        /* generate replacement (temporarily (mis)uses p) */
7877        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
7878            char buffer[2+29+1+1];
7879            char *cp;
7880            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
7881            for (cp = buffer; *cp; ++cp) {
7882                x = charmapencode_output(*cp, mapping, res, respos);
7883                if (x==enc_EXCEPTION)
7884                    return -1;
7885                else if (x==enc_FAILED) {
7886                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
7887                    return -1;
7888                }
7889            }
7890        }
7891        *inpos = collendpos;
7892        break;
7893    default:
7894        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
7895                                                      encoding, reason, unicode, exceptionObject,
7896                                                      collstartpos, collendpos, &newpos);
7897        if (repunicode == NULL)
7898            return -1;
7899        if (PyBytes_Check(repunicode)) {
7900            /* Directly copy bytes result to output. */
7901            Py_ssize_t outsize = PyBytes_Size(*res);
7902            Py_ssize_t requiredsize;
7903            repsize = PyBytes_Size(repunicode);
7904            requiredsize = *respos + repsize;
7905            if (requiredsize > outsize)
7906                /* Make room for all additional bytes. */
7907                if (charmapencode_resize(res, respos, requiredsize)) {
7908                    Py_DECREF(repunicode);
7909                    return -1;
7910                }
7911            memcpy(PyBytes_AsString(*res) + *respos,
7912                   PyBytes_AsString(repunicode),  repsize);
7913            *respos += repsize;
7914            *inpos = newpos;
7915            Py_DECREF(repunicode);
7916            break;
7917        }
7918        /* generate replacement  */
7919        if (PyUnicode_READY(repunicode) == -1) {
7920            Py_DECREF(repunicode);
7921            return -1;
7922        }
7923        repsize = PyUnicode_GET_LENGTH(repunicode);
7924        data = PyUnicode_DATA(repunicode);
7925        kind = PyUnicode_KIND(repunicode);
7926        for (index = 0; index < repsize; index++) {
7927            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7928            x = charmapencode_output(repch, mapping, res, respos);
7929            if (x==enc_EXCEPTION) {
7930                Py_DECREF(repunicode);
7931                return -1;
7932            }
7933            else if (x==enc_FAILED) {
7934                Py_DECREF(repunicode);
7935                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
7936                return -1;
7937            }
7938        }
7939        *inpos = newpos;
7940        Py_DECREF(repunicode);
7941    }
7942    return 0;
7943}
7944
7945PyObject *
7946_PyUnicode_EncodeCharmap(PyObject *unicode,
7947                         PyObject *mapping,
7948                         const char *errors)
7949{
7950    /* output object */
7951    PyObject *res = NULL;
7952    /* current input position */
7953    Py_ssize_t inpos = 0;
7954    Py_ssize_t size;
7955    /* current output position */
7956    Py_ssize_t respos = 0;
7957    PyObject *errorHandler = NULL;
7958    PyObject *exc = NULL;
7959    /* the following variable is used for caching string comparisons
7960     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7961     * 3=ignore, 4=xmlcharrefreplace */
7962    int known_errorHandler = -1;
7963
7964    if (PyUnicode_READY(unicode) == -1)
7965        return NULL;
7966    size = PyUnicode_GET_LENGTH(unicode);
7967
7968    /* Default to Latin-1 */
7969    if (mapping == NULL)
7970        return unicode_encode_ucs1(unicode, errors, 256);
7971
7972    /* allocate enough for a simple encoding without
7973       replacements, if we need more, we'll resize */
7974    res = PyBytes_FromStringAndSize(NULL, size);
7975    if (res == NULL)
7976        goto onError;
7977    if (size == 0)
7978        return res;
7979
7980    while (inpos<size) {
7981        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
7982        /* try to encode it */
7983        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
7984        if (x==enc_EXCEPTION) /* error */
7985            goto onError;
7986        if (x==enc_FAILED) { /* unencodable character */
7987            if (charmap_encoding_error(unicode, &inpos, mapping,
7988                                       &exc,
7989                                       &known_errorHandler, &errorHandler, errors,
7990                                       &res, &respos)) {
7991                goto onError;
7992            }
7993        }
7994        else
7995            /* done with this character => adjust input position */
7996            ++inpos;
7997    }
7998
7999    /* Resize if we allocated to much */
8000    if (respos<PyBytes_GET_SIZE(res))
8001        if (_PyBytes_Resize(&res, respos) < 0)
8002            goto onError;
8003
8004    Py_XDECREF(exc);
8005    Py_XDECREF(errorHandler);
8006    return res;
8007
8008  onError:
8009    Py_XDECREF(res);
8010    Py_XDECREF(exc);
8011    Py_XDECREF(errorHandler);
8012    return NULL;
8013}
8014
8015/* Deprecated */
8016PyObject *
8017PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8018                        Py_ssize_t size,
8019                        PyObject *mapping,
8020                        const char *errors)
8021{
8022    PyObject *result;
8023    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8024    if (unicode == NULL)
8025        return NULL;
8026    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8027    Py_DECREF(unicode);
8028    return result;
8029}
8030
8031PyObject *
8032PyUnicode_AsCharmapString(PyObject *unicode,
8033                          PyObject *mapping)
8034{
8035    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8036        PyErr_BadArgument();
8037        return NULL;
8038    }
8039    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8040}
8041
8042/* create or adjust a UnicodeTranslateError */
8043static void
8044make_translate_exception(PyObject **exceptionObject,
8045                         PyObject *unicode,
8046                         Py_ssize_t startpos, Py_ssize_t endpos,
8047                         const char *reason)
8048{
8049    if (*exceptionObject == NULL) {
8050        *exceptionObject = _PyUnicodeTranslateError_Create(
8051            unicode, startpos, endpos, reason);
8052    }
8053    else {
8054        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8055            goto onError;
8056        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8057            goto onError;
8058        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8059            goto onError;
8060        return;
8061      onError:
8062        Py_DECREF(*exceptionObject);
8063        *exceptionObject = NULL;
8064    }
8065}
8066
8067/* error handling callback helper:
8068   build arguments, call the callback and check the arguments,
8069   put the result into newpos and return the replacement string, which
8070   has to be freed by the caller */
8071static PyObject *
8072unicode_translate_call_errorhandler(const char *errors,
8073                                    PyObject **errorHandler,
8074                                    const char *reason,
8075                                    PyObject *unicode, PyObject **exceptionObject,
8076                                    Py_ssize_t startpos, Py_ssize_t endpos,
8077                                    Py_ssize_t *newpos)
8078{
8079    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8080
8081    Py_ssize_t i_newpos;
8082    PyObject *restuple;
8083    PyObject *resunicode;
8084
8085    if (*errorHandler == NULL) {
8086        *errorHandler = PyCodec_LookupError(errors);
8087        if (*errorHandler == NULL)
8088            return NULL;
8089    }
8090
8091    make_translate_exception(exceptionObject,
8092                             unicode, startpos, endpos, reason);
8093    if (*exceptionObject == NULL)
8094        return NULL;
8095
8096    restuple = PyObject_CallFunctionObjArgs(
8097        *errorHandler, *exceptionObject, NULL);
8098    if (restuple == NULL)
8099        return NULL;
8100    if (!PyTuple_Check(restuple)) {
8101        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8102        Py_DECREF(restuple);
8103        return NULL;
8104    }
8105    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8106                          &resunicode, &i_newpos)) {
8107        Py_DECREF(restuple);
8108        return NULL;
8109    }
8110    if (i_newpos<0)
8111        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8112    else
8113        *newpos = i_newpos;
8114    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8115        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8116        Py_DECREF(restuple);
8117        return NULL;
8118    }
8119    Py_INCREF(resunicode);
8120    Py_DECREF(restuple);
8121    return resunicode;
8122}
8123
8124/* Lookup the character ch in the mapping and put the result in result,
8125   which must be decrefed by the caller.
8126   Return 0 on success, -1 on error */
8127static int
8128charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8129{
8130    PyObject *w = PyLong_FromLong((long)c);
8131    PyObject *x;
8132
8133    if (w == NULL)
8134        return -1;
8135    x = PyObject_GetItem(mapping, w);
8136    Py_DECREF(w);
8137    if (x == NULL) {
8138        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8139            /* No mapping found means: use 1:1 mapping. */
8140            PyErr_Clear();
8141            *result = NULL;
8142            return 0;
8143        } else
8144            return -1;
8145    }
8146    else if (x == Py_None) {
8147        *result = x;
8148        return 0;
8149    }
8150    else if (PyLong_Check(x)) {
8151        long value = PyLong_AS_LONG(x);
8152        long max = PyUnicode_GetMax();
8153        if (value < 0 || value > max) {
8154            PyErr_Format(PyExc_TypeError,
8155                         "character mapping must be in range(0x%x)", max+1);
8156            Py_DECREF(x);
8157            return -1;
8158        }
8159        *result = x;
8160        return 0;
8161    }
8162    else if (PyUnicode_Check(x)) {
8163        *result = x;
8164        return 0;
8165    }
8166    else {
8167        /* wrong return value */
8168        PyErr_SetString(PyExc_TypeError,
8169                        "character mapping must return integer, None or str");
8170        Py_DECREF(x);
8171        return -1;
8172    }
8173}
8174/* ensure that *outobj is at least requiredsize characters long,
8175   if not reallocate and adjust various state variables.
8176   Return 0 on success, -1 on error */
8177static int
8178charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8179                               Py_ssize_t requiredsize)
8180{
8181    Py_ssize_t oldsize = *psize;
8182    Py_UCS4 *new_outobj;
8183    if (requiredsize > oldsize) {
8184        /* exponentially overallocate to minimize reallocations */
8185        if (requiredsize < 2 * oldsize)
8186            requiredsize = 2 * oldsize;
8187        new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8188        if (new_outobj == 0)
8189            return -1;
8190        *outobj = new_outobj;
8191        *psize = requiredsize;
8192    }
8193    return 0;
8194}
8195/* lookup the character, put the result in the output string and adjust
8196   various state variables. Return a new reference to the object that
8197   was put in the output buffer in *result, or Py_None, if the mapping was
8198   undefined (in which case no character was written).
8199   The called must decref result.
8200   Return 0 on success, -1 on error. */
8201static int
8202charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8203                        PyObject *mapping, Py_UCS4 **output,
8204                        Py_ssize_t *osize, Py_ssize_t *opos,
8205                        PyObject **res)
8206{
8207    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8208    if (charmaptranslate_lookup(curinp, mapping, res))
8209        return -1;
8210    if (*res==NULL) {
8211        /* not found => default to 1:1 mapping */
8212        (*output)[(*opos)++] = curinp;
8213    }
8214    else if (*res==Py_None)
8215        ;
8216    else if (PyLong_Check(*res)) {
8217        /* no overflow check, because we know that the space is enough */
8218        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8219    }
8220    else if (PyUnicode_Check(*res)) {
8221        Py_ssize_t repsize;
8222        if (PyUnicode_READY(*res) == -1)
8223            return -1;
8224        repsize = PyUnicode_GET_LENGTH(*res);
8225        if (repsize==1) {
8226            /* no overflow check, because we know that the space is enough */
8227            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8228        }
8229        else if (repsize!=0) {
8230            /* more than one character */
8231            Py_ssize_t requiredsize = *opos +
8232                (PyUnicode_GET_LENGTH(input) - ipos) +
8233                repsize - 1;
8234            Py_ssize_t i;
8235            if (charmaptranslate_makespace(output, osize, requiredsize))
8236                return -1;
8237            for(i = 0; i < repsize; i++)
8238                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8239        }
8240    }
8241    else
8242        return -1;
8243    return 0;
8244}
8245
8246PyObject *
8247_PyUnicode_TranslateCharmap(PyObject *input,
8248                            PyObject *mapping,
8249                            const char *errors)
8250{
8251    /* input object */
8252    char *idata;
8253    Py_ssize_t size, i;
8254    int kind;
8255    /* output buffer */
8256    Py_UCS4 *output = NULL;
8257    Py_ssize_t osize;
8258    PyObject *res;
8259    /* current output position */
8260    Py_ssize_t opos;
8261    char *reason = "character maps to <undefined>";
8262    PyObject *errorHandler = NULL;
8263    PyObject *exc = NULL;
8264    /* the following variable is used for caching string comparisons
8265     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8266     * 3=ignore, 4=xmlcharrefreplace */
8267    int known_errorHandler = -1;
8268
8269    if (mapping == NULL) {
8270        PyErr_BadArgument();
8271        return NULL;
8272    }
8273
8274    if (PyUnicode_READY(input) == -1)
8275        return NULL;
8276    idata = (char*)PyUnicode_DATA(input);
8277    kind = PyUnicode_KIND(input);
8278    size = PyUnicode_GET_LENGTH(input);
8279    i = 0;
8280
8281    if (size == 0) {
8282        Py_INCREF(input);
8283        return input;
8284    }
8285
8286    /* allocate enough for a simple 1:1 translation without
8287       replacements, if we need more, we'll resize */
8288    osize = size;
8289    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8290    opos = 0;
8291    if (output == NULL) {
8292        PyErr_NoMemory();
8293        goto onError;
8294    }
8295
8296    while (i<size) {
8297        /* try to encode it */
8298        PyObject *x = NULL;
8299        if (charmaptranslate_output(input, i, mapping,
8300                                    &output, &osize, &opos, &x)) {
8301            Py_XDECREF(x);
8302            goto onError;
8303        }
8304        Py_XDECREF(x);
8305        if (x!=Py_None) /* it worked => adjust input pointer */
8306            ++i;
8307        else { /* untranslatable character */
8308            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8309            Py_ssize_t repsize;
8310            Py_ssize_t newpos;
8311            Py_ssize_t uni2;
8312            /* startpos for collecting untranslatable chars */
8313            Py_ssize_t collstart = i;
8314            Py_ssize_t collend = i+1;
8315            Py_ssize_t coll;
8316
8317            /* find all untranslatable characters */
8318            while (collend < size) {
8319                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8320                    goto onError;
8321                Py_XDECREF(x);
8322                if (x!=Py_None)
8323                    break;
8324                ++collend;
8325            }
8326            /* cache callback name lookup
8327             * (if not done yet, i.e. it's the first error) */
8328            if (known_errorHandler==-1) {
8329                if ((errors==NULL) || (!strcmp(errors, "strict")))
8330                    known_errorHandler = 1;
8331                else if (!strcmp(errors, "replace"))
8332                    known_errorHandler = 2;
8333                else if (!strcmp(errors, "ignore"))
8334                    known_errorHandler = 3;
8335                else if (!strcmp(errors, "xmlcharrefreplace"))
8336                    known_errorHandler = 4;
8337                else
8338                    known_errorHandler = 0;
8339            }
8340            switch (known_errorHandler) {
8341            case 1: /* strict */
8342                make_translate_exception(&exc,
8343                                         input, collstart, collend, reason);
8344                if (exc != NULL)
8345                    PyCodec_StrictErrors(exc);
8346                goto onError;
8347            case 2: /* replace */
8348                /* No need to check for space, this is a 1:1 replacement */
8349                for (coll = collstart; coll<collend; coll++)
8350                    output[opos++] = '?';
8351                /* fall through */
8352            case 3: /* ignore */
8353                i = collend;
8354                break;
8355            case 4: /* xmlcharrefreplace */
8356                /* generate replacement (temporarily (mis)uses i) */
8357                for (i = collstart; i < collend; ++i) {
8358                    char buffer[2+29+1+1];
8359                    char *cp;
8360                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8361                    if (charmaptranslate_makespace(&output, &osize,
8362                                                   opos+strlen(buffer)+(size-collend)))
8363                        goto onError;
8364                    for (cp = buffer; *cp; ++cp)
8365                        output[opos++] = *cp;
8366                }
8367                i = collend;
8368                break;
8369            default:
8370                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8371                                                                 reason, input, &exc,
8372                                                                 collstart, collend, &newpos);
8373                if (repunicode == NULL)
8374                    goto onError;
8375                if (PyUnicode_READY(repunicode) == -1) {
8376                    Py_DECREF(repunicode);
8377                    goto onError;
8378                }
8379                /* generate replacement  */
8380                repsize = PyUnicode_GET_LENGTH(repunicode);
8381                if (charmaptranslate_makespace(&output, &osize,
8382                                               opos+repsize+(size-collend))) {
8383                    Py_DECREF(repunicode);
8384                    goto onError;
8385                }
8386                for (uni2 = 0; repsize-->0; ++uni2)
8387                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8388                i = newpos;
8389                Py_DECREF(repunicode);
8390            }
8391        }
8392    }
8393    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8394    if (!res)
8395        goto onError;
8396    PyMem_Free(output);
8397    Py_XDECREF(exc);
8398    Py_XDECREF(errorHandler);
8399    return res;
8400
8401  onError:
8402    PyMem_Free(output);
8403    Py_XDECREF(exc);
8404    Py_XDECREF(errorHandler);
8405    return NULL;
8406}
8407
8408/* Deprecated. Use PyUnicode_Translate instead. */
8409PyObject *
8410PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8411                           Py_ssize_t size,
8412                           PyObject *mapping,
8413                           const char *errors)
8414{
8415    PyObject *result;
8416    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8417    if (!unicode)
8418        return NULL;
8419    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8420    Py_DECREF(unicode);
8421    return result;
8422}
8423
8424PyObject *
8425PyUnicode_Translate(PyObject *str,
8426                    PyObject *mapping,
8427                    const char *errors)
8428{
8429    PyObject *result;
8430
8431    str = PyUnicode_FromObject(str);
8432    if (str == NULL)
8433        return NULL;
8434    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8435    Py_DECREF(str);
8436    return result;
8437}
8438
8439static Py_UCS4
8440fix_decimal_and_space_to_ascii(PyObject *self)
8441{
8442    /* No need to call PyUnicode_READY(self) because this function is only
8443       called as a callback from fixup() which does it already. */
8444    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8445    const int kind = PyUnicode_KIND(self);
8446    void *data = PyUnicode_DATA(self);
8447    Py_UCS4 maxchar = 127, ch, fixed;
8448    int modified = 0;
8449    Py_ssize_t i;
8450
8451    for (i = 0; i < len; ++i) {
8452        ch = PyUnicode_READ(kind, data, i);
8453        fixed = 0;
8454        if (ch > 127) {
8455            if (Py_UNICODE_ISSPACE(ch))
8456                fixed = ' ';
8457            else {
8458                const int decimal = Py_UNICODE_TODECIMAL(ch);
8459                if (decimal >= 0)
8460                    fixed = '0' + decimal;
8461            }
8462            if (fixed != 0) {
8463                modified = 1;
8464                maxchar = MAX_MAXCHAR(maxchar, fixed);
8465                PyUnicode_WRITE(kind, data, i, fixed);
8466            }
8467            else
8468                maxchar = MAX_MAXCHAR(maxchar, ch);
8469        }
8470    }
8471
8472    return (modified) ? maxchar : 0;
8473}
8474
8475PyObject *
8476_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8477{
8478    if (!PyUnicode_Check(unicode)) {
8479        PyErr_BadInternalCall();
8480        return NULL;
8481    }
8482    if (PyUnicode_READY(unicode) == -1)
8483        return NULL;
8484    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8485        /* If the string is already ASCII, just return the same string */
8486        Py_INCREF(unicode);
8487        return unicode;
8488    }
8489    return fixup(unicode, fix_decimal_and_space_to_ascii);
8490}
8491
8492PyObject *
8493PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8494                                  Py_ssize_t length)
8495{
8496    PyObject *decimal;
8497    Py_ssize_t i;
8498    Py_UCS4 maxchar;
8499    enum PyUnicode_Kind kind;
8500    void *data;
8501
8502    maxchar = 127;
8503    for (i = 0; i < length; i++) {
8504        Py_UNICODE ch = s[i];
8505        if (ch > 127) {
8506            int decimal = Py_UNICODE_TODECIMAL(ch);
8507            if (decimal >= 0)
8508                ch = '0' + decimal;
8509            maxchar = MAX_MAXCHAR(maxchar, ch);
8510        }
8511    }
8512
8513    /* Copy to a new string */
8514    decimal = PyUnicode_New(length, maxchar);
8515    if (decimal == NULL)
8516        return decimal;
8517    kind = PyUnicode_KIND(decimal);
8518    data = PyUnicode_DATA(decimal);
8519    /* Iterate over code points */
8520    for (i = 0; i < length; i++) {
8521        Py_UNICODE ch = s[i];
8522        if (ch > 127) {
8523            int decimal = Py_UNICODE_TODECIMAL(ch);
8524            if (decimal >= 0)
8525                ch = '0' + decimal;
8526        }
8527        PyUnicode_WRITE(kind, data, i, ch);
8528    }
8529    return unicode_result(decimal);
8530}
8531/* --- Decimal Encoder ---------------------------------------------------- */
8532
8533int
8534PyUnicode_EncodeDecimal(Py_UNICODE *s,
8535                        Py_ssize_t length,
8536                        char *output,
8537                        const char *errors)
8538{
8539    PyObject *unicode;
8540    Py_ssize_t i;
8541    enum PyUnicode_Kind kind;
8542    void *data;
8543
8544    if (output == NULL) {
8545        PyErr_BadArgument();
8546        return -1;
8547    }
8548
8549    unicode = PyUnicode_FromUnicode(s, length);
8550    if (unicode == NULL)
8551        return -1;
8552
8553    if (PyUnicode_READY(unicode) == -1) {
8554        Py_DECREF(unicode);
8555        return -1;
8556    }
8557    kind = PyUnicode_KIND(unicode);
8558    data = PyUnicode_DATA(unicode);
8559
8560    for (i=0; i < length; ) {
8561        PyObject *exc;
8562        Py_UCS4 ch;
8563        int decimal;
8564        Py_ssize_t startpos;
8565
8566        ch = PyUnicode_READ(kind, data, i);
8567
8568        if (Py_UNICODE_ISSPACE(ch)) {
8569            *output++ = ' ';
8570            i++;
8571            continue;
8572        }
8573        decimal = Py_UNICODE_TODECIMAL(ch);
8574        if (decimal >= 0) {
8575            *output++ = '0' + decimal;
8576            i++;
8577            continue;
8578        }
8579        if (0 < ch && ch < 256) {
8580            *output++ = (char)ch;
8581            i++;
8582            continue;
8583        }
8584
8585        startpos = i;
8586        exc = NULL;
8587        raise_encode_exception(&exc, "decimal", unicode,
8588                               startpos, startpos+1,
8589                               "invalid decimal Unicode string");
8590        Py_XDECREF(exc);
8591        Py_DECREF(unicode);
8592        return -1;
8593    }
8594    /* 0-terminate the output string */
8595    *output++ = '\0';
8596    Py_DECREF(unicode);
8597    return 0;
8598}
8599
8600/* --- Helpers ------------------------------------------------------------ */
8601
8602static Py_ssize_t
8603any_find_slice(int direction, PyObject* s1, PyObject* s2,
8604               Py_ssize_t start,
8605               Py_ssize_t end)
8606{
8607    int kind1, kind2, kind;
8608    void *buf1, *buf2;
8609    Py_ssize_t len1, len2, result;
8610
8611    kind1 = PyUnicode_KIND(s1);
8612    kind2 = PyUnicode_KIND(s2);
8613    kind = kind1 > kind2 ? kind1 : kind2;
8614    buf1 = PyUnicode_DATA(s1);
8615    buf2 = PyUnicode_DATA(s2);
8616    if (kind1 != kind)
8617        buf1 = _PyUnicode_AsKind(s1, kind);
8618    if (!buf1)
8619        return -2;
8620    if (kind2 != kind)
8621        buf2 = _PyUnicode_AsKind(s2, kind);
8622    if (!buf2) {
8623        if (kind1 != kind) PyMem_Free(buf1);
8624        return -2;
8625    }
8626    len1 = PyUnicode_GET_LENGTH(s1);
8627    len2 = PyUnicode_GET_LENGTH(s2);
8628
8629    if (direction > 0) {
8630        switch (kind) {
8631        case PyUnicode_1BYTE_KIND:
8632            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8633                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8634            else
8635                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8636            break;
8637        case PyUnicode_2BYTE_KIND:
8638            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8639            break;
8640        case PyUnicode_4BYTE_KIND:
8641            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8642            break;
8643        default:
8644            assert(0); result = -2;
8645        }
8646    }
8647    else {
8648        switch (kind) {
8649        case PyUnicode_1BYTE_KIND:
8650            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8651                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8652            else
8653                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8654            break;
8655        case PyUnicode_2BYTE_KIND:
8656            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8657            break;
8658        case PyUnicode_4BYTE_KIND:
8659            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8660            break;
8661        default:
8662            assert(0); result = -2;
8663        }
8664    }
8665
8666    if (kind1 != kind)
8667        PyMem_Free(buf1);
8668    if (kind2 != kind)
8669        PyMem_Free(buf2);
8670
8671    return result;
8672}
8673
8674Py_ssize_t
8675_PyUnicode_InsertThousandsGrouping(
8676    PyObject *unicode, Py_ssize_t index,
8677    Py_ssize_t n_buffer,
8678    void *digits, Py_ssize_t n_digits,
8679    Py_ssize_t min_width,
8680    const char *grouping, PyObject *thousands_sep,
8681    Py_UCS4 *maxchar)
8682{
8683    unsigned int kind, thousands_sep_kind;
8684    char *data, *thousands_sep_data;
8685    Py_ssize_t thousands_sep_len;
8686    Py_ssize_t len;
8687
8688    if (unicode != NULL) {
8689        kind = PyUnicode_KIND(unicode);
8690        data = (char *) PyUnicode_DATA(unicode) + index * kind;
8691    }
8692    else {
8693        kind = PyUnicode_1BYTE_KIND;
8694        data = NULL;
8695    }
8696    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8697    thousands_sep_data = PyUnicode_DATA(thousands_sep);
8698    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8699    if (unicode != NULL && thousands_sep_kind != kind) {
8700        if (thousands_sep_kind < kind) {
8701            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8702            if (!thousands_sep_data)
8703                return -1;
8704        }
8705        else {
8706            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8707            if (!data)
8708                return -1;
8709        }
8710    }
8711
8712    switch (kind) {
8713    case PyUnicode_1BYTE_KIND:
8714        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8715            len = asciilib_InsertThousandsGrouping(
8716                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
8717                min_width, grouping,
8718                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
8719        else
8720            len = ucs1lib_InsertThousandsGrouping(
8721                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8722                min_width, grouping,
8723                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
8724        break;
8725    case PyUnicode_2BYTE_KIND:
8726        len = ucs2lib_InsertThousandsGrouping(
8727            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
8728            min_width, grouping,
8729            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
8730        break;
8731    case PyUnicode_4BYTE_KIND:
8732        len = ucs4lib_InsertThousandsGrouping(
8733            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
8734            min_width, grouping,
8735            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
8736        break;
8737    default:
8738        assert(0);
8739        return -1;
8740    }
8741    if (unicode != NULL && thousands_sep_kind != kind) {
8742        if (thousands_sep_kind < kind)
8743            PyMem_Free(thousands_sep_data);
8744        else
8745            PyMem_Free(data);
8746    }
8747    if (unicode == NULL) {
8748        *maxchar = 127;
8749        if (len != n_digits) {
8750            *maxchar = MAX_MAXCHAR(*maxchar,
8751                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
8752        }
8753    }
8754    return len;
8755}
8756
8757
8758/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len` when it is larger; a negative `end` or `start`
   has `len` added to it and is then clamped to 0.  `start` is not clamped
   against `len`.

   The expansion is a bare statement sequence (it is not wrapped in
   do { } while (0)) and its arguments are evaluated several times, so use
   it only as a full statement inside a braced block and pass
   side-effect-free arguments; `start` and `end` must be modifiable.

   NOTE(review): if another definition of ADJUST_INDICES is in scope (for
   instance from an included header), C requires both replacement lists to
   be identical -- confirm before changing any token of this macro. */
8759#define ADJUST_INDICES(start, end, len)         \
8760    if (end > len)                              \
8761        end = len;                              \
8762    else if (end < 0) {                         \
8763        end += len;                             \
8764        if (end < 0)                            \
8765            end = 0;                            \
8766    }                                           \
8767    if (start < 0) {                            \
8768        start += len;                           \
8769        if (start < 0)                          \
8770            start = 0;                          \
8771    }
8772
8773Py_ssize_t
8774PyUnicode_Count(PyObject *str,
8775                PyObject *substr,
8776                Py_ssize_t start,
8777                Py_ssize_t end)
8778{
8779    Py_ssize_t result;
8780    PyObject* str_obj;
8781    PyObject* sub_obj;
8782    int kind1, kind2, kind;
8783    void *buf1 = NULL, *buf2 = NULL;
8784    Py_ssize_t len1, len2;
8785
8786    str_obj = PyUnicode_FromObject(str);
8787    if (!str_obj)
8788        return -1;
8789    sub_obj = PyUnicode_FromObject(substr);
8790    if (!sub_obj) {
8791        Py_DECREF(str_obj);
8792        return -1;
8793    }
8794    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
8795        Py_DECREF(sub_obj);
8796        Py_DECREF(str_obj);
8797        return -1;
8798    }
8799
8800    kind1 = PyUnicode_KIND(str_obj);
8801    kind2 = PyUnicode_KIND(sub_obj);
8802    kind = kind1;
8803    buf1 = PyUnicode_DATA(str_obj);
8804    buf2 = PyUnicode_DATA(sub_obj);
8805    if (kind2 != kind) {
8806        if (kind2 > kind) {
8807            Py_DECREF(sub_obj);
8808            Py_DECREF(str_obj);
8809            return 0;
8810        }
8811        buf2 = _PyUnicode_AsKind(sub_obj, kind);
8812    }
8813    if (!buf2)
8814        goto onError;
8815    len1 = PyUnicode_GET_LENGTH(str_obj);
8816    len2 = PyUnicode_GET_LENGTH(sub_obj);
8817
8818    ADJUST_INDICES(start, end, len1);
8819    switch (kind) {
8820    case PyUnicode_1BYTE_KIND:
8821        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8822            result = asciilib_count(
8823                ((Py_UCS1*)buf1) + start, end - start,
8824                buf2, len2, PY_SSIZE_T_MAX
8825                );
8826        else
8827            result = ucs1lib_count(
8828                ((Py_UCS1*)buf1) + start, end - start,
8829                buf2, len2, PY_SSIZE_T_MAX
8830                );
8831        break;
8832    case PyUnicode_2BYTE_KIND:
8833        result = ucs2lib_count(
8834            ((Py_UCS2*)buf1) + start, end - start,
8835            buf2, len2, PY_SSIZE_T_MAX
8836            );
8837        break;
8838    case PyUnicode_4BYTE_KIND:
8839        result = ucs4lib_count(
8840            ((Py_UCS4*)buf1) + start, end - start,
8841            buf2, len2, PY_SSIZE_T_MAX
8842            );
8843        break;
8844    default:
8845        assert(0); result = 0;
8846    }
8847
8848    Py_DECREF(sub_obj);
8849    Py_DECREF(str_obj);
8850
8851    if (kind2 != kind)
8852        PyMem_Free(buf2);
8853
8854    return result;
8855  onError:
8856    Py_DECREF(sub_obj);
8857    Py_DECREF(str_obj);
8858    if (kind2 != kind && buf2)
8859        PyMem_Free(buf2);
8860    return -1;
8861}
8862
8863Py_ssize_t
8864PyUnicode_Find(PyObject *str,
8865               PyObject *sub,
8866               Py_ssize_t start,
8867               Py_ssize_t end,
8868               int direction)
8869{
8870    Py_ssize_t result;
8871
8872    str = PyUnicode_FromObject(str);
8873    if (!str)
8874        return -2;
8875    sub = PyUnicode_FromObject(sub);
8876    if (!sub) {
8877        Py_DECREF(str);
8878        return -2;
8879    }
8880    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8881        Py_DECREF(sub);
8882        Py_DECREF(str);
8883        return -2;
8884    }
8885
8886    result = any_find_slice(direction,
8887        str, sub, start, end
8888        );
8889
8890    Py_DECREF(str);
8891    Py_DECREF(sub);
8892
8893    return result;
8894}
8895
8896Py_ssize_t
8897PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8898                   Py_ssize_t start, Py_ssize_t end,
8899                   int direction)
8900{
8901    int kind;
8902    Py_ssize_t result;
8903    if (PyUnicode_READY(str) == -1)
8904        return -2;
8905    if (start < 0 || end < 0) {
8906        PyErr_SetString(PyExc_IndexError, "string index out of range");
8907        return -2;
8908    }
8909    if (end > PyUnicode_GET_LENGTH(str))
8910        end = PyUnicode_GET_LENGTH(str);
8911    kind = PyUnicode_KIND(str);
8912    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8913                      kind, end-start, ch, direction);
8914    if (result == -1)
8915        return -1;
8916    else
8917        return start + result;
8918}
8919
8920static int
8921tailmatch(PyObject *self,
8922          PyObject *substring,
8923          Py_ssize_t start,
8924          Py_ssize_t end,
8925          int direction)
8926{
8927    int kind_self;
8928    int kind_sub;
8929    void *data_self;
8930    void *data_sub;
8931    Py_ssize_t offset;
8932    Py_ssize_t i;
8933    Py_ssize_t end_sub;
8934
8935    if (PyUnicode_READY(self) == -1 ||
8936        PyUnicode_READY(substring) == -1)
8937        return 0;
8938
8939    if (PyUnicode_GET_LENGTH(substring) == 0)
8940        return 1;
8941
8942    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8943    end -= PyUnicode_GET_LENGTH(substring);
8944    if (end < start)
8945        return 0;
8946
8947    kind_self = PyUnicode_KIND(self);
8948    data_self = PyUnicode_DATA(self);
8949    kind_sub = PyUnicode_KIND(substring);
8950    data_sub = PyUnicode_DATA(substring);
8951    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8952
8953    if (direction > 0)
8954        offset = end;
8955    else
8956        offset = start;
8957
8958    if (PyUnicode_READ(kind_self, data_self, offset) ==
8959        PyUnicode_READ(kind_sub, data_sub, 0) &&
8960        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8961        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8962        /* If both are of the same kind, memcmp is sufficient */
8963        if (kind_self == kind_sub) {
8964            return ! memcmp((char *)data_self +
8965                                (offset * PyUnicode_KIND(substring)),
8966                            data_sub,
8967                            PyUnicode_GET_LENGTH(substring) *
8968                                PyUnicode_KIND(substring));
8969        }
8970        /* otherwise we have to compare each character by first accesing it */
8971        else {
8972            /* We do not need to compare 0 and len(substring)-1 because
8973               the if statement above ensured already that they are equal
8974               when we end up here. */
8975            /* TODO: honor direction and do a forward or backwards search */
8976            for (i = 1; i < end_sub; ++i) {
8977                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8978                    PyUnicode_READ(kind_sub, data_sub, i))
8979                    return 0;
8980            }
8981            return 1;
8982        }
8983    }
8984
8985    return 0;
8986}
8987
8988Py_ssize_t
8989PyUnicode_Tailmatch(PyObject *str,
8990                    PyObject *substr,
8991                    Py_ssize_t start,
8992                    Py_ssize_t end,
8993                    int direction)
8994{
8995    Py_ssize_t result;
8996
8997    str = PyUnicode_FromObject(str);
8998    if (str == NULL)
8999        return -1;
9000    substr = PyUnicode_FromObject(substr);
9001    if (substr == NULL) {
9002        Py_DECREF(str);
9003        return -1;
9004    }
9005
9006    result = tailmatch(str, substr,
9007                       start, end, direction);
9008    Py_DECREF(str);
9009    Py_DECREF(substr);
9010    return result;
9011}
9012
9013/* Apply fixfct filter to the Unicode object self and return a
9014   reference to the modified object */
9015
9016static PyObject *
9017fixup(PyObject *self,
9018      Py_UCS4 (*fixfct)(PyObject *s))
9019{
9020    PyObject *u;
9021    Py_UCS4 maxchar_old, maxchar_new = 0;
9022    PyObject *v;
9023
9024    u = _PyUnicode_Copy(self);
9025    if (u == NULL)
9026        return NULL;
9027    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9028
9029    /* fix functions return the new maximum character in a string,
9030       if the kind of the resulting unicode object does not change,
9031       everything is fine.  Otherwise we need to change the string kind
9032       and re-run the fix function. */
9033    maxchar_new = fixfct(u);
9034
9035    if (maxchar_new == 0) {
9036        /* no changes */;
9037        if (PyUnicode_CheckExact(self)) {
9038            Py_DECREF(u);
9039            Py_INCREF(self);
9040            return self;
9041        }
9042        else
9043            return u;
9044    }
9045
9046    maxchar_new = align_maxchar(maxchar_new);
9047
9048    if (maxchar_new == maxchar_old)
9049        return u;
9050
9051    /* In case the maximum character changed, we need to
9052       convert the string to the new category. */
9053    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9054    if (v == NULL) {
9055        Py_DECREF(u);
9056        return NULL;
9057    }
9058    if (maxchar_new > maxchar_old) {
9059        /* If the maxchar increased so that the kind changed, not all
9060           characters are representable anymore and we need to fix the
9061           string again. This only happens in very few cases. */
9062        _PyUnicode_FastCopyCharacters(v, 0,
9063                                      self, 0, PyUnicode_GET_LENGTH(self));
9064        maxchar_old = fixfct(v);
9065        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9066    }
9067    else {
9068        _PyUnicode_FastCopyCharacters(v, 0,
9069                                      u, 0, PyUnicode_GET_LENGTH(self));
9070    }
9071    Py_DECREF(u);
9072    assert(_PyUnicode_CheckConsistency(v, 1));
9073    return v;
9074}
9075
9076static PyObject *
9077ascii_upper_or_lower(PyObject *self, int lower)
9078{
9079    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9080    char *resdata, *data = PyUnicode_DATA(self);
9081    PyObject *res;
9082
9083    res = PyUnicode_New(len, 127);
9084    if (res == NULL)
9085        return NULL;
9086    resdata = PyUnicode_DATA(res);
9087    if (lower)
9088        _Py_bytes_lower(resdata, data, len);
9089    else
9090        _Py_bytes_upper(resdata, data, len);
9091    return res;
9092}
9093
9094static Py_UCS4
9095handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9096{
9097    Py_ssize_t j;
9098    int final_sigma;
9099    Py_UCS4 c;
9100    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9101
9102     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9103
9104    where ! is a negation and \p{xxx} is a character with property xxx.
9105    */
9106    for (j = i - 1; j >= 0; j--) {
9107        c = PyUnicode_READ(kind, data, j);
9108        if (!_PyUnicode_IsCaseIgnorable(c))
9109            break;
9110    }
9111    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9112    if (final_sigma) {
9113        for (j = i + 1; j < length; j++) {
9114            c = PyUnicode_READ(kind, data, j);
9115            if (!_PyUnicode_IsCaseIgnorable(c))
9116                break;
9117        }
9118        final_sigma = j == length || !_PyUnicode_IsCased(c);
9119    }
9120    return (final_sigma) ? 0x3C2 : 0x3C3;
9121}
9122
9123static int
9124lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9125           Py_UCS4 c, Py_UCS4 *mapped)
9126{
9127    /* Obscure special case. */
9128    if (c == 0x3A3) {
9129        mapped[0] = handle_capital_sigma(kind, data, length, i);
9130        return 1;
9131    }
9132    return _PyUnicode_ToLowerFull(c, mapped);
9133}
9134
9135static Py_ssize_t
9136do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9137{
9138    Py_ssize_t i, k = 0;
9139    int n_res, j;
9140    Py_UCS4 c, mapped[3];
9141
9142    c = PyUnicode_READ(kind, data, 0);
9143    n_res = _PyUnicode_ToUpperFull(c, mapped);
9144    for (j = 0; j < n_res; j++) {
9145        *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9146        res[k++] = mapped[j];
9147    }
9148    for (i = 1; i < length; i++) {
9149        c = PyUnicode_READ(kind, data, i);
9150        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9151        for (j = 0; j < n_res; j++) {
9152            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9153            res[k++] = mapped[j];
9154        }
9155    }
9156    return k;
9157}
9158
9159static Py_ssize_t
9160do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9161    Py_ssize_t i, k = 0;
9162
9163    for (i = 0; i < length; i++) {
9164        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9165        int n_res, j;
9166        if (Py_UNICODE_ISUPPER(c)) {
9167            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9168        }
9169        else if (Py_UNICODE_ISLOWER(c)) {
9170            n_res = _PyUnicode_ToUpperFull(c, mapped);
9171        }
9172        else {
9173            n_res = 1;
9174            mapped[0] = c;
9175        }
9176        for (j = 0; j < n_res; j++) {
9177            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9178            res[k++] = mapped[j];
9179        }
9180    }
9181    return k;
9182}
9183
9184static Py_ssize_t
9185do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9186                  Py_UCS4 *maxchar, int lower)
9187{
9188    Py_ssize_t i, k = 0;
9189
9190    for (i = 0; i < length; i++) {
9191        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9192        int n_res, j;
9193        if (lower)
9194            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9195        else
9196            n_res = _PyUnicode_ToUpperFull(c, mapped);
9197        for (j = 0; j < n_res; j++) {
9198            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9199            res[k++] = mapped[j];
9200        }
9201    }
9202    return k;
9203}
9204
9205static Py_ssize_t
9206do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9207{
9208    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9209}
9210
9211static Py_ssize_t
9212do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9213{
9214    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9215}
9216
9217static Py_ssize_t
9218do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9219{
9220    Py_ssize_t i, k = 0;
9221
9222    for (i = 0; i < length; i++) {
9223        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9224        Py_UCS4 mapped[3];
9225        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9226        for (j = 0; j < n_res; j++) {
9227            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9228            res[k++] = mapped[j];
9229        }
9230    }
9231    return k;
9232}
9233
9234static Py_ssize_t
9235do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9236{
9237    Py_ssize_t i, k = 0;
9238    int previous_is_cased;
9239
9240    previous_is_cased = 0;
9241    for (i = 0; i < length; i++) {
9242        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9243        Py_UCS4 mapped[3];
9244        int n_res, j;
9245
9246        if (previous_is_cased)
9247            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9248        else
9249            n_res = _PyUnicode_ToTitleFull(c, mapped);
9250
9251        for (j = 0; j < n_res; j++) {
9252            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9253            res[k++] = mapped[j];
9254        }
9255
9256        previous_is_cased = _PyUnicode_IsCased(c);
9257    }
9258    return k;
9259}
9260
9261static PyObject *
9262case_operation(PyObject *self,
9263               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9264{
9265    PyObject *res = NULL;
9266    Py_ssize_t length, newlength = 0;
9267    int kind, outkind;
9268    void *data, *outdata;
9269    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9270
9271    assert(PyUnicode_IS_READY(self));
9272
9273    kind = PyUnicode_KIND(self);
9274    data = PyUnicode_DATA(self);
9275    length = PyUnicode_GET_LENGTH(self);
9276    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9277    if (tmp == NULL)
9278        return PyErr_NoMemory();
9279    newlength = perform(kind, data, length, tmp, &maxchar);
9280    res = PyUnicode_New(newlength, maxchar);
9281    if (res == NULL)
9282        goto leave;
9283    tmpend = tmp + newlength;
9284    outdata = PyUnicode_DATA(res);
9285    outkind = PyUnicode_KIND(res);
9286    switch (outkind) {
9287    case PyUnicode_1BYTE_KIND:
9288        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9289        break;
9290    case PyUnicode_2BYTE_KIND:
9291        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9292        break;
9293    case PyUnicode_4BYTE_KIND:
9294        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9295        break;
9296    default:
9297        assert(0);
9298        break;
9299    }
9300  leave:
9301    PyMem_FREE(tmp);
9302    return res;
9303}
9304
9305PyObject *
9306PyUnicode_Join(PyObject *separator, PyObject *seq)
9307{
9308    PyObject *sep = NULL;
9309    Py_ssize_t seplen;
9310    PyObject *res = NULL; /* the result */
9311    PyObject *fseq;          /* PySequence_Fast(seq) */
9312    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9313    PyObject **items;
9314    PyObject *item;
9315    Py_ssize_t sz, i, res_offset;
9316    Py_UCS4 maxchar;
9317    Py_UCS4 item_maxchar;
9318    int use_memcpy;
9319    unsigned char *res_data = NULL, *sep_data = NULL;
9320    PyObject *last_obj;
9321    unsigned int kind = 0;
9322
9323    fseq = PySequence_Fast(seq, "");
9324    if (fseq == NULL) {
9325        return NULL;
9326    }
9327
9328    /* NOTE: the following code can't call back into Python code,
9329     * so we are sure that fseq won't be mutated.
9330     */
9331
9332    seqlen = PySequence_Fast_GET_SIZE(fseq);
9333    /* If empty sequence, return u"". */
9334    if (seqlen == 0) {
9335        Py_DECREF(fseq);
9336        Py_INCREF(unicode_empty);
9337        res = unicode_empty;
9338        return res;
9339    }
9340
9341    /* If singleton sequence with an exact Unicode, return that. */
9342    last_obj = NULL;
9343    items = PySequence_Fast_ITEMS(fseq);
9344    if (seqlen == 1) {
9345        if (PyUnicode_CheckExact(items[0])) {
9346            res = items[0];
9347            Py_INCREF(res);
9348            Py_DECREF(fseq);
9349            return res;
9350        }
9351        seplen = 0;
9352        maxchar = 0;
9353    }
9354    else {
9355        /* Set up sep and seplen */
9356        if (separator == NULL) {
9357            /* fall back to a blank space separator */
9358            sep = PyUnicode_FromOrdinal(' ');
9359            if (!sep)
9360                goto onError;
9361            seplen = 1;
9362            maxchar = 32;
9363        }
9364        else {
9365            if (!PyUnicode_Check(separator)) {
9366                PyErr_Format(PyExc_TypeError,
9367                             "separator: expected str instance,"
9368                             " %.80s found",
9369                             Py_TYPE(separator)->tp_name);
9370                goto onError;
9371            }
9372            if (PyUnicode_READY(separator))
9373                goto onError;
9374            sep = separator;
9375            seplen = PyUnicode_GET_LENGTH(separator);
9376            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9377            /* inc refcount to keep this code path symmetric with the
9378               above case of a blank separator */
9379            Py_INCREF(sep);
9380        }
9381        last_obj = sep;
9382    }
9383
9384    /* There are at least two things to join, or else we have a subclass
9385     * of str in the sequence.
9386     * Do a pre-pass to figure out the total amount of space we'll
9387     * need (sz), and see whether all argument are strings.
9388     */
9389    sz = 0;
9390#ifdef Py_DEBUG
9391    use_memcpy = 0;
9392#else
9393    use_memcpy = 1;
9394#endif
9395    for (i = 0; i < seqlen; i++) {
9396        const Py_ssize_t old_sz = sz;
9397        item = items[i];
9398        if (!PyUnicode_Check(item)) {
9399            PyErr_Format(PyExc_TypeError,
9400                         "sequence item %zd: expected str instance,"
9401                         " %.80s found",
9402                         i, Py_TYPE(item)->tp_name);
9403            goto onError;
9404        }
9405        if (PyUnicode_READY(item) == -1)
9406            goto onError;
9407        sz += PyUnicode_GET_LENGTH(item);
9408        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9409        maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
9410        if (i != 0)
9411            sz += seplen;
9412        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9413            PyErr_SetString(PyExc_OverflowError,
9414                            "join() result is too long for a Python string");
9415            goto onError;
9416        }
9417        if (use_memcpy && last_obj != NULL) {
9418            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9419                use_memcpy = 0;
9420        }
9421        last_obj = item;
9422    }
9423
9424    res = PyUnicode_New(sz, maxchar);
9425    if (res == NULL)
9426        goto onError;
9427
9428    /* Catenate everything. */
9429#ifdef Py_DEBUG
9430    use_memcpy = 0;
9431#else
9432    if (use_memcpy) {
9433        res_data = PyUnicode_1BYTE_DATA(res);
9434        kind = PyUnicode_KIND(res);
9435        if (seplen != 0)
9436            sep_data = PyUnicode_1BYTE_DATA(sep);
9437    }
9438#endif
9439    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9440        Py_ssize_t itemlen;
9441        item = items[i];
9442        /* Copy item, and maybe the separator. */
9443        if (i && seplen != 0) {
9444            if (use_memcpy) {
9445                Py_MEMCPY(res_data,
9446                          sep_data,
9447                          kind * seplen);
9448                res_data += kind * seplen;
9449            }
9450            else {
9451                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9452                res_offset += seplen;
9453            }
9454        }
9455        itemlen = PyUnicode_GET_LENGTH(item);
9456        if (itemlen != 0) {
9457            if (use_memcpy) {
9458                Py_MEMCPY(res_data,
9459                          PyUnicode_DATA(item),
9460                          kind * itemlen);
9461                res_data += kind * itemlen;
9462            }
9463            else {
9464                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9465                res_offset += itemlen;
9466            }
9467        }
9468    }
9469    if (use_memcpy)
9470        assert(res_data == PyUnicode_1BYTE_DATA(res)
9471                           + kind * PyUnicode_GET_LENGTH(res));
9472    else
9473        assert(res_offset == PyUnicode_GET_LENGTH(res));
9474
9475    Py_DECREF(fseq);
9476    Py_XDECREF(sep);
9477    assert(_PyUnicode_CheckConsistency(res, 1));
9478    return res;
9479
9480  onError:
9481    Py_DECREF(fseq);
9482    Py_XDECREF(sep);
9483    Py_XDECREF(res);
9484    return NULL;
9485}
9486
/* FILL(kind, data, value, start, length)
 *
 * Store the code point `value` into `length` consecutive characters of the
 * character buffer `data`, beginning at character index `start`.  `kind`
 * selects the element width: PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
 * PyUnicode_4BYTE_KIND.  PyUnicode_WCHAR_KIND and any other kind trip an
 * assertion.
 *
 * `value`, `start` and `length` are each evaluated exactly once, so
 * expressions (including ones with side effects) are safe to pass.  `kind`
 * is still evaluated twice (assertion and switch) and `data` once, in the
 * selected case.  The macro performs no bounds checking of its own.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_ssize_t start_ = (start); \
        const Py_ssize_t length_ = (length); \
        const Py_UCS4 value_ = (value); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + start_; \
            memset(to_, (unsigned char)value_, length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + start_; \
            for (; i_ < length_; ++i_, ++to_) *to_ = (Py_UCS2)value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + start_; \
            for (; i_ < length_; ++i_, ++to_) *to_ = value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9510
9511void
9512_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9513                    Py_UCS4 fill_char)
9514{
9515    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9516    const void *data = PyUnicode_DATA(unicode);
9517    assert(PyUnicode_IS_READY(unicode));
9518    assert(unicode_modifiable(unicode));
9519    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9520    assert(start >= 0);
9521    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9522    FILL(kind, data, fill_char, start, length);
9523}
9524
9525Py_ssize_t
9526PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9527               Py_UCS4 fill_char)
9528{
9529    Py_ssize_t maxlen;
9530
9531    if (!PyUnicode_Check(unicode)) {
9532        PyErr_BadInternalCall();
9533        return -1;
9534    }
9535    if (PyUnicode_READY(unicode) == -1)
9536        return -1;
9537    if (unicode_check_modifiable(unicode))
9538        return -1;
9539
9540    if (start < 0) {
9541        PyErr_SetString(PyExc_IndexError, "string index out of range");
9542        return -1;
9543    }
9544    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9545        PyErr_SetString(PyExc_ValueError,
9546                         "fill character is bigger than "
9547                         "the string maximum character");
9548        return -1;
9549    }
9550
9551    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9552    length = Py_MIN(maxlen, length);
9553    if (length <= 0)
9554        return 0;
9555
9556    _PyUnicode_FastFill(unicode, start, length, fill_char);
9557    return length;
9558}
9559
9560static PyObject *
9561pad(PyObject *self,
9562    Py_ssize_t left,
9563    Py_ssize_t right,
9564    Py_UCS4 fill)
9565{
9566    PyObject *u;
9567    Py_UCS4 maxchar;
9568    int kind;
9569    void *data;
9570
9571    if (left < 0)
9572        left = 0;
9573    if (right < 0)
9574        right = 0;
9575
9576    if (left == 0 && right == 0)
9577        return unicode_result_unchanged(self);
9578
9579    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9580        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9581        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9582        return NULL;
9583    }
9584    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9585    maxchar = MAX_MAXCHAR(maxchar, fill);
9586    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9587    if (!u)
9588        return NULL;
9589
9590    kind = PyUnicode_KIND(u);
9591    data = PyUnicode_DATA(u);
9592    if (left)
9593        FILL(kind, data, fill, 0, left);
9594    if (right)
9595        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9596    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9597    assert(_PyUnicode_CheckConsistency(u, 1));
9598    return u;
9599}
9600
9601PyObject *
9602PyUnicode_Splitlines(PyObject *string, int keepends)
9603{
9604    PyObject *list;
9605
9606    string = PyUnicode_FromObject(string);
9607    if (string == NULL)
9608        return NULL;
9609    if (PyUnicode_READY(string) == -1) {
9610        Py_DECREF(string);
9611        return NULL;
9612    }
9613
9614    switch (PyUnicode_KIND(string)) {
9615    case PyUnicode_1BYTE_KIND:
9616        if (PyUnicode_IS_ASCII(string))
9617            list = asciilib_splitlines(
9618                string, PyUnicode_1BYTE_DATA(string),
9619                PyUnicode_GET_LENGTH(string), keepends);
9620        else
9621            list = ucs1lib_splitlines(
9622                string, PyUnicode_1BYTE_DATA(string),
9623                PyUnicode_GET_LENGTH(string), keepends);
9624        break;
9625    case PyUnicode_2BYTE_KIND:
9626        list = ucs2lib_splitlines(
9627            string, PyUnicode_2BYTE_DATA(string),
9628            PyUnicode_GET_LENGTH(string), keepends);
9629        break;
9630    case PyUnicode_4BYTE_KIND:
9631        list = ucs4lib_splitlines(
9632            string, PyUnicode_4BYTE_DATA(string),
9633            PyUnicode_GET_LENGTH(string), keepends);
9634        break;
9635    default:
9636        assert(0);
9637        list = 0;
9638    }
9639    Py_DECREF(string);
9640    return list;
9641}
9642
9643static PyObject *
9644split(PyObject *self,
9645      PyObject *substring,
9646      Py_ssize_t maxcount)
9647{
9648    int kind1, kind2, kind;
9649    void *buf1, *buf2;
9650    Py_ssize_t len1, len2;
9651    PyObject* out;
9652
9653    if (maxcount < 0)
9654        maxcount = PY_SSIZE_T_MAX;
9655
9656    if (PyUnicode_READY(self) == -1)
9657        return NULL;
9658
9659    if (substring == NULL)
9660        switch (PyUnicode_KIND(self)) {
9661        case PyUnicode_1BYTE_KIND:
9662            if (PyUnicode_IS_ASCII(self))
9663                return asciilib_split_whitespace(
9664                    self,  PyUnicode_1BYTE_DATA(self),
9665                    PyUnicode_GET_LENGTH(self), maxcount
9666                    );
9667            else
9668                return ucs1lib_split_whitespace(
9669                    self,  PyUnicode_1BYTE_DATA(self),
9670                    PyUnicode_GET_LENGTH(self), maxcount
9671                    );
9672        case PyUnicode_2BYTE_KIND:
9673            return ucs2lib_split_whitespace(
9674                self,  PyUnicode_2BYTE_DATA(self),
9675                PyUnicode_GET_LENGTH(self), maxcount
9676                );
9677        case PyUnicode_4BYTE_KIND:
9678            return ucs4lib_split_whitespace(
9679                self,  PyUnicode_4BYTE_DATA(self),
9680                PyUnicode_GET_LENGTH(self), maxcount
9681                );
9682        default:
9683            assert(0);
9684            return NULL;
9685        }
9686
9687    if (PyUnicode_READY(substring) == -1)
9688        return NULL;
9689
9690    kind1 = PyUnicode_KIND(self);
9691    kind2 = PyUnicode_KIND(substring);
9692    kind = kind1 > kind2 ? kind1 : kind2;
9693    buf1 = PyUnicode_DATA(self);
9694    buf2 = PyUnicode_DATA(substring);
9695    if (kind1 != kind)
9696        buf1 = _PyUnicode_AsKind(self, kind);
9697    if (!buf1)
9698        return NULL;
9699    if (kind2 != kind)
9700        buf2 = _PyUnicode_AsKind(substring, kind);
9701    if (!buf2) {
9702        if (kind1 != kind) PyMem_Free(buf1);
9703        return NULL;
9704    }
9705    len1 = PyUnicode_GET_LENGTH(self);
9706    len2 = PyUnicode_GET_LENGTH(substring);
9707
9708    switch (kind) {
9709    case PyUnicode_1BYTE_KIND:
9710        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9711            out = asciilib_split(
9712                self,  buf1, len1, buf2, len2, maxcount);
9713        else
9714            out = ucs1lib_split(
9715                self,  buf1, len1, buf2, len2, maxcount);
9716        break;
9717    case PyUnicode_2BYTE_KIND:
9718        out = ucs2lib_split(
9719            self,  buf1, len1, buf2, len2, maxcount);
9720        break;
9721    case PyUnicode_4BYTE_KIND:
9722        out = ucs4lib_split(
9723            self,  buf1, len1, buf2, len2, maxcount);
9724        break;
9725    default:
9726        out = NULL;
9727    }
9728    if (kind1 != kind)
9729        PyMem_Free(buf1);
9730    if (kind2 != kind)
9731        PyMem_Free(buf2);
9732    return out;
9733}
9734
9735static PyObject *
9736rsplit(PyObject *self,
9737       PyObject *substring,
9738       Py_ssize_t maxcount)
9739{
9740    int kind1, kind2, kind;
9741    void *buf1, *buf2;
9742    Py_ssize_t len1, len2;
9743    PyObject* out;
9744
9745    if (maxcount < 0)
9746        maxcount = PY_SSIZE_T_MAX;
9747
9748    if (PyUnicode_READY(self) == -1)
9749        return NULL;
9750
9751    if (substring == NULL)
9752        switch (PyUnicode_KIND(self)) {
9753        case PyUnicode_1BYTE_KIND:
9754            if (PyUnicode_IS_ASCII(self))
9755                return asciilib_rsplit_whitespace(
9756                    self,  PyUnicode_1BYTE_DATA(self),
9757                    PyUnicode_GET_LENGTH(self), maxcount
9758                    );
9759            else
9760                return ucs1lib_rsplit_whitespace(
9761                    self,  PyUnicode_1BYTE_DATA(self),
9762                    PyUnicode_GET_LENGTH(self), maxcount
9763                    );
9764        case PyUnicode_2BYTE_KIND:
9765            return ucs2lib_rsplit_whitespace(
9766                self,  PyUnicode_2BYTE_DATA(self),
9767                PyUnicode_GET_LENGTH(self), maxcount
9768                );
9769        case PyUnicode_4BYTE_KIND:
9770            return ucs4lib_rsplit_whitespace(
9771                self,  PyUnicode_4BYTE_DATA(self),
9772                PyUnicode_GET_LENGTH(self), maxcount
9773                );
9774        default:
9775            assert(0);
9776            return NULL;
9777        }
9778
9779    if (PyUnicode_READY(substring) == -1)
9780        return NULL;
9781
9782    kind1 = PyUnicode_KIND(self);
9783    kind2 = PyUnicode_KIND(substring);
9784    kind = kind1 > kind2 ? kind1 : kind2;
9785    buf1 = PyUnicode_DATA(self);
9786    buf2 = PyUnicode_DATA(substring);
9787    if (kind1 != kind)
9788        buf1 = _PyUnicode_AsKind(self, kind);
9789    if (!buf1)
9790        return NULL;
9791    if (kind2 != kind)
9792        buf2 = _PyUnicode_AsKind(substring, kind);
9793    if (!buf2) {
9794        if (kind1 != kind) PyMem_Free(buf1);
9795        return NULL;
9796    }
9797    len1 = PyUnicode_GET_LENGTH(self);
9798    len2 = PyUnicode_GET_LENGTH(substring);
9799
9800    switch (kind) {
9801    case PyUnicode_1BYTE_KIND:
9802        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9803            out = asciilib_rsplit(
9804                self,  buf1, len1, buf2, len2, maxcount);
9805        else
9806            out = ucs1lib_rsplit(
9807                self,  buf1, len1, buf2, len2, maxcount);
9808        break;
9809    case PyUnicode_2BYTE_KIND:
9810        out = ucs2lib_rsplit(
9811            self,  buf1, len1, buf2, len2, maxcount);
9812        break;
9813    case PyUnicode_4BYTE_KIND:
9814        out = ucs4lib_rsplit(
9815            self,  buf1, len1, buf2, len2, maxcount);
9816        break;
9817    default:
9818        out = NULL;
9819    }
9820    if (kind1 != kind)
9821        PyMem_Free(buf1);
9822    if (kind2 != kind)
9823        PyMem_Free(buf2);
9824    return out;
9825}
9826
9827static Py_ssize_t
9828anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9829            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9830{
9831    switch (kind) {
9832    case PyUnicode_1BYTE_KIND:
9833        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9834            return asciilib_find(buf1, len1, buf2, len2, offset);
9835        else
9836            return ucs1lib_find(buf1, len1, buf2, len2, offset);
9837    case PyUnicode_2BYTE_KIND:
9838        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9839    case PyUnicode_4BYTE_KIND:
9840        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9841    }
9842    assert(0);
9843    return -1;
9844}
9845
9846static Py_ssize_t
9847anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9848             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9849{
9850    switch (kind) {
9851    case PyUnicode_1BYTE_KIND:
9852        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9853            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9854        else
9855            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9856    case PyUnicode_2BYTE_KIND:
9857        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9858    case PyUnicode_4BYTE_KIND:
9859        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9860    }
9861    assert(0);
9862    return 0;
9863}
9864
9865static PyObject *
9866replace(PyObject *self, PyObject *str1,
9867        PyObject *str2, Py_ssize_t maxcount)
9868{
9869    PyObject *u;
9870    char *sbuf = PyUnicode_DATA(self);
9871    char *buf1 = PyUnicode_DATA(str1);
9872    char *buf2 = PyUnicode_DATA(str2);
9873    int srelease = 0, release1 = 0, release2 = 0;
9874    int skind = PyUnicode_KIND(self);
9875    int kind1 = PyUnicode_KIND(str1);
9876    int kind2 = PyUnicode_KIND(str2);
9877    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9878    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9879    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9880    int mayshrink;
9881    Py_UCS4 maxchar, maxchar_str2;
9882
9883    if (maxcount < 0)
9884        maxcount = PY_SSIZE_T_MAX;
9885    else if (maxcount == 0 || slen == 0)
9886        goto nothing;
9887
9888    if (str1 == str2)
9889        goto nothing;
9890    if (skind < kind1)
9891        /* substring too wide to be present */
9892        goto nothing;
9893
9894    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9895    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9896    /* Replacing str1 with str2 may cause a maxchar reduction in the
9897       result string. */
9898    mayshrink = (maxchar_str2 < maxchar);
9899    maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
9900
9901    if (len1 == len2) {
9902        /* same length */
9903        if (len1 == 0)
9904            goto nothing;
9905        if (len1 == 1) {
9906            /* replace characters */
9907            Py_UCS4 u1, u2;
9908            int rkind;
9909            Py_ssize_t index, pos;
9910            char *src;
9911
9912            u1 = PyUnicode_READ_CHAR(str1, 0);
9913            pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9914            if (pos < 0)
9915                goto nothing;
9916            u2 = PyUnicode_READ_CHAR(str2, 0);
9917            u = PyUnicode_New(slen, maxchar);
9918            if (!u)
9919                goto error;
9920            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
9921            rkind = PyUnicode_KIND(u);
9922
9923            PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9924            index = 0;
9925            src = sbuf;
9926            while (--maxcount)
9927            {
9928                pos++;
9929                src += pos * PyUnicode_KIND(self);
9930                slen -= pos;
9931                index += pos;
9932                pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9933                if (pos < 0)
9934                    break;
9935                PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9936            }
9937        }
9938        else {
9939            int rkind = skind;
9940            char *res;
9941            Py_ssize_t i;
9942
9943            if (kind1 < rkind) {
9944                /* widen substring */
9945                buf1 = _PyUnicode_AsKind(str1, rkind);
9946                if (!buf1) goto error;
9947                release1 = 1;
9948            }
9949            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
9950            if (i < 0)
9951                goto nothing;
9952            if (rkind > kind2) {
9953                /* widen replacement */
9954                buf2 = _PyUnicode_AsKind(str2, rkind);
9955                if (!buf2) goto error;
9956                release2 = 1;
9957            }
9958            else if (rkind < kind2) {
9959                /* widen self and buf1 */
9960                rkind = kind2;
9961                if (release1) PyMem_Free(buf1);
9962                sbuf = _PyUnicode_AsKind(self, rkind);
9963                if (!sbuf) goto error;
9964                srelease = 1;
9965                buf1 = _PyUnicode_AsKind(str1, rkind);
9966                if (!buf1) goto error;
9967                release1 = 1;
9968            }
9969            u = PyUnicode_New(slen, maxchar);
9970            if (!u)
9971                goto error;
9972            assert(PyUnicode_KIND(u) == rkind);
9973            res = PyUnicode_DATA(u);
9974
9975            memcpy(res, sbuf, rkind * slen);
9976            /* change everything in-place, starting with this one */
9977            memcpy(res + rkind * i,
9978                   buf2,
9979                   rkind * len2);
9980            i += len1;
9981
9982            while ( --maxcount > 0) {
9983                i = anylib_find(rkind, self,
9984                                sbuf+rkind*i, slen-i,
9985                                str1, buf1, len1, i);
9986                if (i == -1)
9987                    break;
9988                memcpy(res + rkind * i,
9989                       buf2,
9990                       rkind * len2);
9991                i += len1;
9992            }
9993        }
9994    }
9995    else {
9996        Py_ssize_t n, i, j, ires;
9997        Py_ssize_t new_size;
9998        int rkind = skind;
9999        char *res;
10000
10001        if (kind1 < rkind) {
10002            /* widen substring */
10003            buf1 = _PyUnicode_AsKind(str1, rkind);
10004            if (!buf1) goto error;
10005            release1 = 1;
10006        }
10007        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10008        if (n == 0)
10009            goto nothing;
10010        if (kind2 < rkind) {
10011            /* widen replacement */
10012            buf2 = _PyUnicode_AsKind(str2, rkind);
10013            if (!buf2) goto error;
10014            release2 = 1;
10015        }
10016        else if (kind2 > rkind) {
10017            /* widen self and buf1 */
10018            rkind = kind2;
10019            sbuf = _PyUnicode_AsKind(self, rkind);
10020            if (!sbuf) goto error;
10021            srelease = 1;
10022            if (release1) PyMem_Free(buf1);
10023            buf1 = _PyUnicode_AsKind(str1, rkind);
10024            if (!buf1) goto error;
10025            release1 = 1;
10026        }
10027        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10028           PyUnicode_GET_LENGTH(str1))); */
10029        if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10030                PyErr_SetString(PyExc_OverflowError,
10031                                "replace string is too long");
10032                goto error;
10033        }
10034        new_size = slen + n * (len2 - len1);
10035        if (new_size == 0) {
10036            Py_INCREF(unicode_empty);
10037            u = unicode_empty;
10038            goto done;
10039        }
10040        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10041            PyErr_SetString(PyExc_OverflowError,
10042                            "replace string is too long");
10043            goto error;
10044        }
10045        u = PyUnicode_New(new_size, maxchar);
10046        if (!u)
10047            goto error;
10048        assert(PyUnicode_KIND(u) == rkind);
10049        res = PyUnicode_DATA(u);
10050        ires = i = 0;
10051        if (len1 > 0) {
10052            while (n-- > 0) {
10053                /* look for next match */
10054                j = anylib_find(rkind, self,
10055                                sbuf + rkind * i, slen-i,
10056                                str1, buf1, len1, i);
10057                if (j == -1)
10058                    break;
10059                else if (j > i) {
10060                    /* copy unchanged part [i:j] */
10061                    memcpy(res + rkind * ires,
10062                           sbuf + rkind * i,
10063                           rkind * (j-i));
10064                    ires += j - i;
10065                }
10066                /* copy substitution string */
10067                if (len2 > 0) {
10068                    memcpy(res + rkind * ires,
10069                           buf2,
10070                           rkind * len2);
10071                    ires += len2;
10072                }
10073                i = j + len1;
10074            }
10075            if (i < slen)
10076                /* copy tail [i:] */
10077                memcpy(res + rkind * ires,
10078                       sbuf + rkind * i,
10079                       rkind * (slen-i));
10080        }
10081        else {
10082            /* interleave */
10083            while (n > 0) {
10084                memcpy(res + rkind * ires,
10085                       buf2,
10086                       rkind * len2);
10087                ires += len2;
10088                if (--n <= 0)
10089                    break;
10090                memcpy(res + rkind * ires,
10091                       sbuf + rkind * i,
10092                       rkind);
10093                ires++;
10094                i++;
10095            }
10096            memcpy(res + rkind * ires,
10097                   sbuf + rkind * i,
10098                   rkind * (slen-i));
10099        }
10100    }
10101
10102    if (mayshrink) {
10103        unicode_adjust_maxchar(&u);
10104        if (u == NULL)
10105            goto error;
10106    }
10107
10108  done:
10109    if (srelease)
10110        PyMem_FREE(sbuf);
10111    if (release1)
10112        PyMem_FREE(buf1);
10113    if (release2)
10114        PyMem_FREE(buf2);
10115    assert(_PyUnicode_CheckConsistency(u, 1));
10116    return u;
10117
10118  nothing:
10119    /* nothing to replace; return original string (when possible) */
10120    if (srelease)
10121        PyMem_FREE(sbuf);
10122    if (release1)
10123        PyMem_FREE(buf1);
10124    if (release2)
10125        PyMem_FREE(buf2);
10126    return unicode_result_unchanged(self);
10127
10128  error:
10129    if (srelease && sbuf)
10130        PyMem_FREE(sbuf);
10131    if (release1 && buf1)
10132        PyMem_FREE(buf1);
10133    if (release2 && buf2)
10134        PyMem_FREE(buf2);
10135    return NULL;
10136}
10137
10138/* --- Unicode Object Methods --------------------------------------------- */
10139
10140PyDoc_STRVAR(title__doc__,
10141             "S.title() -> str\n\
10142\n\
10143Return a titlecased version of S, i.e. words start with title case\n\
10144characters, all remaining cased characters have lower case.");
10145
10146static PyObject*
10147unicode_title(PyObject *self)
10148{
10149    if (PyUnicode_READY(self) == -1)
10150        return NULL;
10151    return case_operation(self, do_title);
10152}
10153
10154PyDoc_STRVAR(capitalize__doc__,
10155             "S.capitalize() -> str\n\
10156\n\
10157Return a capitalized version of S, i.e. make the first character\n\
10158have upper case and the rest lower case.");
10159
10160static PyObject*
10161unicode_capitalize(PyObject *self)
10162{
10163    if (PyUnicode_READY(self) == -1)
10164        return NULL;
10165    if (PyUnicode_GET_LENGTH(self) == 0)
10166        return unicode_result_unchanged(self);
10167    return case_operation(self, do_capitalize);
10168}
10169
10170PyDoc_STRVAR(casefold__doc__,
10171             "S.casefold() -> str\n\
10172\n\
10173Return a version of S suitable for caseless comparisons.");
10174
10175static PyObject *
10176unicode_casefold(PyObject *self)
10177{
10178    if (PyUnicode_READY(self) == -1)
10179        return NULL;
10180    if (PyUnicode_IS_ASCII(self))
10181        return ascii_upper_or_lower(self, 1);
10182    return case_operation(self, do_casefold);
10183}
10184
10185
10186/* Argument converter.  Coerces to a single unicode character */
10187
10188static int
10189convert_uc(PyObject *obj, void *addr)
10190{
10191    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10192    PyObject *uniobj;
10193
10194    uniobj = PyUnicode_FromObject(obj);
10195    if (uniobj == NULL) {
10196        PyErr_SetString(PyExc_TypeError,
10197                        "The fill character cannot be converted to Unicode");
10198        return 0;
10199    }
10200    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10201        PyErr_SetString(PyExc_TypeError,
10202                        "The fill character must be exactly one character long");
10203        Py_DECREF(uniobj);
10204        return 0;
10205    }
10206    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10207    Py_DECREF(uniobj);
10208    return 1;
10209}
10210
10211PyDoc_STRVAR(center__doc__,
10212             "S.center(width[, fillchar]) -> str\n\
10213\n\
10214Return S centered in a string of length width. Padding is\n\
10215done using the specified fill character (default is a space)");
10216
10217static PyObject *
10218unicode_center(PyObject *self, PyObject *args)
10219{
10220    Py_ssize_t marg, left;
10221    Py_ssize_t width;
10222    Py_UCS4 fillchar = ' ';
10223
10224    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10225        return NULL;
10226
10227    if (PyUnicode_READY(self) == -1)
10228        return NULL;
10229
10230    if (PyUnicode_GET_LENGTH(self) >= width)
10231        return unicode_result_unchanged(self);
10232
10233    marg = width - PyUnicode_GET_LENGTH(self);
10234    left = marg / 2 + (marg & width & 1);
10235
10236    return pad(self, left, marg - left, fillchar);
10237}
10238
10239/* This function assumes that str1 and str2 are readied by the caller. */
10240
10241static int
10242unicode_compare(PyObject *str1, PyObject *str2)
10243{
10244    int kind1, kind2;
10245    void *data1, *data2;
10246    Py_ssize_t len1, len2;
10247    Py_ssize_t i, len;
10248
10249    /* a string is equal to itself */
10250    if (str1 == str2)
10251        return 0;
10252
10253    kind1 = PyUnicode_KIND(str1);
10254    kind2 = PyUnicode_KIND(str2);
10255    data1 = PyUnicode_DATA(str1);
10256    data2 = PyUnicode_DATA(str2);
10257    len1 = PyUnicode_GET_LENGTH(str1);
10258    len2 = PyUnicode_GET_LENGTH(str2);
10259    len = Py_MIN(len1, len2);
10260
10261    if (kind1 == 1 && kind2 == 1) {
10262        int cmp = memcmp(data1, data2, len);
10263        /* normalize result of memcmp() into the range [-1; 1] */
10264        if (cmp < 0)
10265            return -1;
10266        if (cmp > 0)
10267            return 1;
10268    }
10269    else {
10270        for (i = 0; i < len; ++i) {
10271            Py_UCS4 c1, c2;
10272            c1 = PyUnicode_READ(kind1, data1, i);
10273            c2 = PyUnicode_READ(kind2, data2, i);
10274
10275            if (c1 != c2)
10276                return (c1 < c2) ? -1 : 1;
10277        }
10278    }
10279
10280    if (len1 == len2)
10281        return 0;
10282    if (len1 < len2)
10283        return -1;
10284    else
10285        return 1;
10286}
10287
10288static int
10289unicode_compare_eq(PyObject *str1, PyObject *str2)
10290{
10291    int kind;
10292    void *data1, *data2;
10293    Py_ssize_t len;
10294    int cmp;
10295
10296    /* a string is equal to itself */
10297    if (str1 == str2)
10298        return 1;
10299
10300    len = PyUnicode_GET_LENGTH(str1);
10301    if (PyUnicode_GET_LENGTH(str2) != len)
10302        return 0;
10303    kind = PyUnicode_KIND(str1);
10304    if (PyUnicode_KIND(str2) != kind)
10305        return 0;
10306    data1 = PyUnicode_DATA(str1);
10307    data2 = PyUnicode_DATA(str2);
10308
10309    cmp = memcmp(data1, data2, len * kind);
10310    return (cmp == 0);
10311}
10312
10313
10314int
10315PyUnicode_Compare(PyObject *left, PyObject *right)
10316{
10317    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10318        if (PyUnicode_READY(left) == -1 ||
10319            PyUnicode_READY(right) == -1)
10320            return -1;
10321        return unicode_compare(left, right);
10322    }
10323    PyErr_Format(PyExc_TypeError,
10324                 "Can't compare %.100s and %.100s",
10325                 left->ob_type->tp_name,
10326                 right->ob_type->tp_name);
10327    return -1;
10328}
10329
10330int
10331PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10332{
10333    Py_ssize_t i;
10334    int kind;
10335    void *data;
10336    Py_UCS4 chr;
10337
10338    assert(_PyUnicode_CHECK(uni));
10339    if (PyUnicode_READY(uni) == -1)
10340        return -1;
10341    kind = PyUnicode_KIND(uni);
10342    data = PyUnicode_DATA(uni);
10343    /* Compare Unicode string and source character set string */
10344    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10345        if (chr != str[i])
10346            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10347    /* This check keeps Python strings that end in '\0' from comparing equal
10348     to C strings identical up to that point. */
10349    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10350        return 1; /* uni is longer */
10351    if (str[i])
10352        return -1; /* str is longer */
10353    return 0;
10354}
10355
10356
10357#define TEST_COND(cond)                         \
10358    ((cond) ? Py_True : Py_False)
10359
10360PyObject *
10361PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10362{
10363    int result;
10364    PyObject *v;
10365
10366    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10367        Py_RETURN_NOTIMPLEMENTED;
10368
10369    if (PyUnicode_READY(left) == -1 ||
10370        PyUnicode_READY(right) == -1)
10371        return NULL;
10372
10373    if (op == Py_EQ || op == Py_NE) {
10374        result = unicode_compare_eq(left, right);
10375        if (op == Py_EQ)
10376            v = TEST_COND(result);
10377        else
10378            v = TEST_COND(!result);
10379    }
10380    else {
10381        result = unicode_compare(left, right);
10382
10383        /* Convert the return value to a Boolean */
10384        switch (op) {
10385        case Py_LE:
10386            v = TEST_COND(result <= 0);
10387            break;
10388        case Py_GE:
10389            v = TEST_COND(result >= 0);
10390            break;
10391        case Py_LT:
10392            v = TEST_COND(result == -1);
10393            break;
10394        case Py_GT:
10395            v = TEST_COND(result == 1);
10396            break;
10397        default:
10398            PyErr_BadArgument();
10399            return NULL;
10400        }
10401    }
10402    Py_INCREF(v);
10403    return v;
10404}
10405
10406int
10407PyUnicode_Contains(PyObject *container, PyObject *element)
10408{
10409    PyObject *str, *sub;
10410    int kind1, kind2, kind;
10411    void *buf1, *buf2;
10412    Py_ssize_t len1, len2;
10413    int result;
10414
10415    /* Coerce the two arguments */
10416    sub = PyUnicode_FromObject(element);
10417    if (!sub) {
10418        PyErr_Format(PyExc_TypeError,
10419                     "'in <string>' requires string as left operand, not %s",
10420                     element->ob_type->tp_name);
10421        return -1;
10422    }
10423
10424    str = PyUnicode_FromObject(container);
10425    if (!str) {
10426        Py_DECREF(sub);
10427        return -1;
10428    }
10429    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10430        Py_DECREF(sub);
10431        Py_DECREF(str);
10432    }
10433
10434    kind1 = PyUnicode_KIND(str);
10435    kind2 = PyUnicode_KIND(sub);
10436    kind = kind1;
10437    buf1 = PyUnicode_DATA(str);
10438    buf2 = PyUnicode_DATA(sub);
10439    if (kind2 != kind) {
10440        if (kind2 > kind) {
10441            Py_DECREF(sub);
10442            Py_DECREF(str);
10443            return 0;
10444        }
10445        buf2 = _PyUnicode_AsKind(sub, kind);
10446    }
10447    if (!buf2) {
10448        Py_DECREF(sub);
10449        Py_DECREF(str);
10450        return -1;
10451    }
10452    len1 = PyUnicode_GET_LENGTH(str);
10453    len2 = PyUnicode_GET_LENGTH(sub);
10454
10455    switch (kind) {
10456    case PyUnicode_1BYTE_KIND:
10457        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10458        break;
10459    case PyUnicode_2BYTE_KIND:
10460        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10461        break;
10462    case PyUnicode_4BYTE_KIND:
10463        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10464        break;
10465    default:
10466        result = -1;
10467        assert(0);
10468    }
10469
10470    Py_DECREF(str);
10471    Py_DECREF(sub);
10472
10473    if (kind2 != kind)
10474        PyMem_Free(buf2);
10475
10476    return result;
10477}
10478
10479/* Concat to string or Unicode object giving a new Unicode object. */
10480
10481PyObject *
10482PyUnicode_Concat(PyObject *left, PyObject *right)
10483{
10484    PyObject *u = NULL, *v = NULL, *w;
10485    Py_UCS4 maxchar, maxchar2;
10486    Py_ssize_t u_len, v_len, new_len;
10487
10488    /* Coerce the two arguments */
10489    u = PyUnicode_FromObject(left);
10490    if (u == NULL)
10491        goto onError;
10492    v = PyUnicode_FromObject(right);
10493    if (v == NULL)
10494        goto onError;
10495
10496    /* Shortcuts */
10497    if (v == unicode_empty) {
10498        Py_DECREF(v);
10499        return u;
10500    }
10501    if (u == unicode_empty) {
10502        Py_DECREF(u);
10503        return v;
10504    }
10505
10506    u_len = PyUnicode_GET_LENGTH(u);
10507    v_len = PyUnicode_GET_LENGTH(v);
10508    if (u_len > PY_SSIZE_T_MAX - v_len) {
10509        PyErr_SetString(PyExc_OverflowError,
10510                        "strings are too large to concat");
10511        goto onError;
10512    }
10513    new_len = u_len + v_len;
10514
10515    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10516    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10517    maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10518
10519    /* Concat the two Unicode strings */
10520    w = PyUnicode_New(new_len, maxchar);
10521    if (w == NULL)
10522        goto onError;
10523    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10524    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
10525    Py_DECREF(u);
10526    Py_DECREF(v);
10527    assert(_PyUnicode_CheckConsistency(w, 1));
10528    return w;
10529
10530  onError:
10531    Py_XDECREF(u);
10532    Py_XDECREF(v);
10533    return NULL;
10534}
10535
10536void
10537PyUnicode_Append(PyObject **p_left, PyObject *right)
10538{
10539    PyObject *left, *res;
10540    Py_UCS4 maxchar, maxchar2;
10541    Py_ssize_t left_len, right_len, new_len;
10542
10543    if (p_left == NULL) {
10544        if (!PyErr_Occurred())
10545            PyErr_BadInternalCall();
10546        return;
10547    }
10548    left = *p_left;
10549    if (right == NULL || !PyUnicode_Check(left)) {
10550        if (!PyErr_Occurred())
10551            PyErr_BadInternalCall();
10552        goto error;
10553    }
10554
10555    if (PyUnicode_READY(left) == -1)
10556        goto error;
10557    if (PyUnicode_READY(right) == -1)
10558        goto error;
10559
10560    /* Shortcuts */
10561    if (left == unicode_empty) {
10562        Py_DECREF(left);
10563        Py_INCREF(right);
10564        *p_left = right;
10565        return;
10566    }
10567    if (right == unicode_empty)
10568        return;
10569
10570    left_len = PyUnicode_GET_LENGTH(left);
10571    right_len = PyUnicode_GET_LENGTH(right);
10572    if (left_len > PY_SSIZE_T_MAX - right_len) {
10573        PyErr_SetString(PyExc_OverflowError,
10574                        "strings are too large to concat");
10575        goto error;
10576    }
10577    new_len = left_len + right_len;
10578
10579    if (unicode_modifiable(left)
10580        && PyUnicode_CheckExact(right)
10581        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
10582        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10583           to change the structure size, but characters are stored just after
10584           the structure, and so it requires to move all characters which is
10585           not so different than duplicating the string. */
10586        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10587    {
10588        /* append inplace */
10589        if (unicode_resize(p_left, new_len) != 0) {
10590            /* XXX if _PyUnicode_Resize() fails, 'left' has been
10591             * deallocated so it cannot be put back into
10592             * 'variable'.  The MemoryError is raised when there
10593             * is no value in 'variable', which might (very
10594             * remotely) be a cause of incompatibilities.
10595             */
10596            goto error;
10597        }
10598        /* copy 'right' into the newly allocated area of 'left' */
10599        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
10600    }
10601    else {
10602        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10603        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10604        maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10605
10606        /* Concat the two Unicode strings */
10607        res = PyUnicode_New(new_len, maxchar);
10608        if (res == NULL)
10609            goto error;
10610        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10611        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
10612        Py_DECREF(left);
10613        *p_left = res;
10614    }
10615    assert(_PyUnicode_CheckConsistency(*p_left, 1));
10616    return;
10617
10618error:
10619    Py_CLEAR(*p_left);
10620}
10621
/* Append 'right' to *pleft via PyUnicode_Append(), then release one
   reference to 'right'.  Py_XDECREF is used, so a NULL 'right' is
   accepted here; the append itself is delegated unchanged. */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);
    Py_XDECREF(right);
}
10628
10629PyDoc_STRVAR(count__doc__,
10630             "S.count(sub[, start[, end]]) -> int\n\
10631\n\
10632Return the number of non-overlapping occurrences of substring sub in\n\
10633string S[start:end].  Optional arguments start and end are\n\
10634interpreted as in slice notation.");
10635
10636static PyObject *
10637unicode_count(PyObject *self, PyObject *args)
10638{
10639    PyObject *substring;
10640    Py_ssize_t start = 0;
10641    Py_ssize_t end = PY_SSIZE_T_MAX;
10642    PyObject *result;
10643    int kind1, kind2, kind;
10644    void *buf1, *buf2;
10645    Py_ssize_t len1, len2, iresult;
10646
10647    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10648                                            &start, &end))
10649        return NULL;
10650
10651    kind1 = PyUnicode_KIND(self);
10652    kind2 = PyUnicode_KIND(substring);
10653    if (kind2 > kind1)
10654        return PyLong_FromLong(0);
10655    kind = kind1;
10656    buf1 = PyUnicode_DATA(self);
10657    buf2 = PyUnicode_DATA(substring);
10658    if (kind2 != kind)
10659        buf2 = _PyUnicode_AsKind(substring, kind);
10660    if (!buf2) {
10661        Py_DECREF(substring);
10662        return NULL;
10663    }
10664    len1 = PyUnicode_GET_LENGTH(self);
10665    len2 = PyUnicode_GET_LENGTH(substring);
10666
10667    ADJUST_INDICES(start, end, len1);
10668    switch (kind) {
10669    case PyUnicode_1BYTE_KIND:
10670        iresult = ucs1lib_count(
10671            ((Py_UCS1*)buf1) + start, end - start,
10672            buf2, len2, PY_SSIZE_T_MAX
10673            );
10674        break;
10675    case PyUnicode_2BYTE_KIND:
10676        iresult = ucs2lib_count(
10677            ((Py_UCS2*)buf1) + start, end - start,
10678            buf2, len2, PY_SSIZE_T_MAX
10679            );
10680        break;
10681    case PyUnicode_4BYTE_KIND:
10682        iresult = ucs4lib_count(
10683            ((Py_UCS4*)buf1) + start, end - start,
10684            buf2, len2, PY_SSIZE_T_MAX
10685            );
10686        break;
10687    default:
10688        assert(0); iresult = 0;
10689    }
10690
10691    result = PyLong_FromSsize_t(iresult);
10692
10693    if (kind2 != kind)
10694        PyMem_Free(buf2);
10695
10696    Py_DECREF(substring);
10697
10698    return result;
10699}
10700
10701PyDoc_STRVAR(encode__doc__,
10702             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10703\n\
10704Encode S using the codec registered for encoding. Default encoding\n\
10705is 'utf-8'. errors may be given to set a different error\n\
10706handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10707a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10708'xmlcharrefreplace' as well as any other name registered with\n\
10709codecs.register_error that can handle UnicodeEncodeErrors.");
10710
10711static PyObject *
10712unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10713{
10714    static char *kwlist[] = {"encoding", "errors", 0};
10715    char *encoding = NULL;
10716    char *errors = NULL;
10717
10718    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10719                                     kwlist, &encoding, &errors))
10720        return NULL;
10721    return PyUnicode_AsEncodedString(self, encoding, errors);
10722}
10723
10724PyDoc_STRVAR(expandtabs__doc__,
10725             "S.expandtabs([tabsize]) -> str\n\
10726\n\
10727Return a copy of S where all tab characters are expanded using spaces.\n\
10728If tabsize is not given, a tab size of 8 characters is assumed.");
10729
10730static PyObject*
10731unicode_expandtabs(PyObject *self, PyObject *args)
10732{
10733    Py_ssize_t i, j, line_pos, src_len, incr;
10734    Py_UCS4 ch;
10735    PyObject *u;
10736    void *src_data, *dest_data;
10737    int tabsize = 8;
10738    int kind;
10739    int found;
10740
10741    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10742        return NULL;
10743
10744    if (PyUnicode_READY(self) == -1)
10745        return NULL;
10746
10747    /* First pass: determine size of output string */
10748    src_len = PyUnicode_GET_LENGTH(self);
10749    i = j = line_pos = 0;
10750    kind = PyUnicode_KIND(self);
10751    src_data = PyUnicode_DATA(self);
10752    found = 0;
10753    for (; i < src_len; i++) {
10754        ch = PyUnicode_READ(kind, src_data, i);
10755        if (ch == '\t') {
10756            found = 1;
10757            if (tabsize > 0) {
10758                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10759                if (j > PY_SSIZE_T_MAX - incr)
10760                    goto overflow;
10761                line_pos += incr;
10762                j += incr;
10763            }
10764        }
10765        else {
10766            if (j > PY_SSIZE_T_MAX - 1)
10767                goto overflow;
10768            line_pos++;
10769            j++;
10770            if (ch == '\n' || ch == '\r')
10771                line_pos = 0;
10772        }
10773    }
10774    if (!found)
10775        return unicode_result_unchanged(self);
10776
10777    /* Second pass: create output string and fill it */
10778    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10779    if (!u)
10780        return NULL;
10781    dest_data = PyUnicode_DATA(u);
10782
10783    i = j = line_pos = 0;
10784
10785    for (; i < src_len; i++) {
10786        ch = PyUnicode_READ(kind, src_data, i);
10787        if (ch == '\t') {
10788            if (tabsize > 0) {
10789                incr = tabsize - (line_pos % tabsize);
10790                line_pos += incr;
10791                FILL(kind, dest_data, ' ', j, incr);
10792                j += incr;
10793            }
10794        }
10795        else {
10796            line_pos++;
10797            PyUnicode_WRITE(kind, dest_data, j, ch);
10798            j++;
10799            if (ch == '\n' || ch == '\r')
10800                line_pos = 0;
10801        }
10802    }
10803    assert (j == PyUnicode_GET_LENGTH(u));
10804    return unicode_result(u);
10805
10806  overflow:
10807    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10808    return NULL;
10809}
10810
10811PyDoc_STRVAR(find__doc__,
10812             "S.find(sub[, start[, end]]) -> int\n\
10813\n\
10814Return the lowest index in S where substring sub is found,\n\
10815such that sub is contained within S[start:end].  Optional\n\
10816arguments start and end are interpreted as in slice notation.\n\
10817\n\
10818Return -1 on failure.");
10819
10820static PyObject *
10821unicode_find(PyObject *self, PyObject *args)
10822{
10823    PyObject *substring;
10824    Py_ssize_t start;
10825    Py_ssize_t end;
10826    Py_ssize_t result;
10827
10828    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10829                                            &start, &end))
10830        return NULL;
10831
10832    if (PyUnicode_READY(self) == -1)
10833        return NULL;
10834    if (PyUnicode_READY(substring) == -1)
10835        return NULL;
10836
10837    result = any_find_slice(1, self, substring, start, end);
10838
10839    Py_DECREF(substring);
10840
10841    if (result == -2)
10842        return NULL;
10843
10844    return PyLong_FromSsize_t(result);
10845}
10846
10847static PyObject *
10848unicode_getitem(PyObject *self, Py_ssize_t index)
10849{
10850    void *data;
10851    enum PyUnicode_Kind kind;
10852    Py_UCS4 ch;
10853    PyObject *res;
10854
10855    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10856        PyErr_BadArgument();
10857        return NULL;
10858    }
10859    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10860        PyErr_SetString(PyExc_IndexError, "string index out of range");
10861        return NULL;
10862    }
10863    kind = PyUnicode_KIND(self);
10864    data = PyUnicode_DATA(self);
10865    ch = PyUnicode_READ(kind, data, index);
10866    if (ch < 256)
10867        return get_latin1_char(ch);
10868
10869    res = PyUnicode_New(1, ch);
10870    if (res == NULL)
10871        return NULL;
10872    kind = PyUnicode_KIND(res);
10873    data = PyUnicode_DATA(res);
10874    PyUnicode_WRITE(kind, data, 0, ch);
10875    assert(_PyUnicode_CheckConsistency(res, 1));
10876    return res;
10877}
10878
/* Believe it or not, this produces the same value for ASCII strings
   as bytes_hash(). */
/* Compute the hash of a unicode object and store it in the object.

   A stored hash other than -1 is returned as is; -1 therefore acts as
   the "not computed yet" marker, and a computed value of -1 is replaced
   by -2 before it is stored.  Returns -1 when PyUnicode_READY() fails.
   The empty string always hashes to 0. */
static Py_hash_t
unicode_hash(PyObject *self)
{
    Py_ssize_t len;
    Py_uhash_t x;

#ifdef Py_DEBUG
    assert(_Py_HashSecret_Initialized);
#endif
    /* Fast path: reuse a previously stored hash */
    if (_PyUnicode_HASH(self) != -1)
        return _PyUnicode_HASH(self);
    if (PyUnicode_READY(self) == -1)
        return -1;
    len = PyUnicode_GET_LENGTH(self);
    /*
      We make the hash of the empty string be 0, rather than using
      (prefix ^ suffix), since this slightly obfuscates the hash secret
    */
    if (len == 0) {
        _PyUnicode_HASH(self) = 0;
        return 0;
    }

    /* The hash function as a macro, gets expanded three times below. */
    /* HASH(P) mixes the first element (shifted left by 7) into x, then
       folds in every element through P while counting 'len' down; both
       P and len are consumed by the expansion. */
#define HASH(P)                                            \
    x ^= (Py_uhash_t) *P << 7;                             \
    while (--len >= 0)                                     \
        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;  \

    /* Seed with the secret prefix, then hash the elements at the width
       given by the string's kind. */
    x = (Py_uhash_t) _Py_HashSecret.prefix;
    switch (PyUnicode_KIND(self)) {
    case PyUnicode_1BYTE_KIND: {
        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
        HASH(c);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
        HASH(s);
        break;
    }
    default: {
        Py_UCS4 *l;
        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
               "Impossible switch case in unicode_hash");
        l = PyUnicode_4BYTE_DATA(self);
        HASH(l);
        break;
    }
    }
    /* 'len' was consumed by HASH, so the length is read again here */
    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
    x ^= (Py_uhash_t) _Py_HashSecret.suffix;

    /* -1 is the error / "not computed" value, never a valid hash */
    if (x == -1)
        x = -2;
    _PyUnicode_HASH(self) = x;
    return x;
}
#undef HASH
10940
10941PyDoc_STRVAR(index__doc__,
10942             "S.index(sub[, start[, end]]) -> int\n\
10943\n\
10944Like S.find() but raise ValueError when the substring is not found.");
10945
10946static PyObject *
10947unicode_index(PyObject *self, PyObject *args)
10948{
10949    Py_ssize_t result;
10950    PyObject *substring;
10951    Py_ssize_t start;
10952    Py_ssize_t end;
10953
10954    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10955                                            &start, &end))
10956        return NULL;
10957
10958    if (PyUnicode_READY(self) == -1)
10959        return NULL;
10960    if (PyUnicode_READY(substring) == -1)
10961        return NULL;
10962
10963    result = any_find_slice(1, self, substring, start, end);
10964
10965    Py_DECREF(substring);
10966
10967    if (result == -2)
10968        return NULL;
10969
10970    if (result < 0) {
10971        PyErr_SetString(PyExc_ValueError, "substring not found");
10972        return NULL;
10973    }
10974
10975    return PyLong_FromSsize_t(result);
10976}
10977
10978PyDoc_STRVAR(islower__doc__,
10979             "S.islower() -> bool\n\
10980\n\
10981Return True if all cased characters in S are lowercase and there is\n\
10982at least one cased character in S, False otherwise.");
10983
10984static PyObject*
10985unicode_islower(PyObject *self)
10986{
10987    Py_ssize_t i, length;
10988    int kind;
10989    void *data;
10990    int cased;
10991
10992    if (PyUnicode_READY(self) == -1)
10993        return NULL;
10994    length = PyUnicode_GET_LENGTH(self);
10995    kind = PyUnicode_KIND(self);
10996    data = PyUnicode_DATA(self);
10997
10998    /* Shortcut for single character strings */
10999    if (length == 1)
11000        return PyBool_FromLong(
11001            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11002
11003    /* Special case for empty strings */
11004    if (length == 0)
11005        return PyBool_FromLong(0);
11006
11007    cased = 0;
11008    for (i = 0; i < length; i++) {
11009        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11010
11011        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11012            return PyBool_FromLong(0);
11013        else if (!cased && Py_UNICODE_ISLOWER(ch))
11014            cased = 1;
11015    }
11016    return PyBool_FromLong(cased);
11017}
11018
11019PyDoc_STRVAR(isupper__doc__,
11020             "S.isupper() -> bool\n\
11021\n\
11022Return True if all cased characters in S are uppercase and there is\n\
11023at least one cased character in S, False otherwise.");
11024
11025static PyObject*
11026unicode_isupper(PyObject *self)
11027{
11028    Py_ssize_t i, length;
11029    int kind;
11030    void *data;
11031    int cased;
11032
11033    if (PyUnicode_READY(self) == -1)
11034        return NULL;
11035    length = PyUnicode_GET_LENGTH(self);
11036    kind = PyUnicode_KIND(self);
11037    data = PyUnicode_DATA(self);
11038
11039    /* Shortcut for single character strings */
11040    if (length == 1)
11041        return PyBool_FromLong(
11042            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11043
11044    /* Special case for empty strings */
11045    if (length == 0)
11046        return PyBool_FromLong(0);
11047
11048    cased = 0;
11049    for (i = 0; i < length; i++) {
11050        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11051
11052        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11053            return PyBool_FromLong(0);
11054        else if (!cased && Py_UNICODE_ISUPPER(ch))
11055            cased = 1;
11056    }
11057    return PyBool_FromLong(cased);
11058}
11059
11060PyDoc_STRVAR(istitle__doc__,
11061             "S.istitle() -> bool\n\
11062\n\
11063Return True if S is a titlecased string and there is at least one\n\
11064character in S, i.e. upper- and titlecase characters may only\n\
11065follow uncased characters and lowercase characters only cased ones.\n\
11066Return False otherwise.");
11067
11068static PyObject*
11069unicode_istitle(PyObject *self)
11070{
11071    Py_ssize_t i, length;
11072    int kind;
11073    void *data;
11074    int cased, previous_is_cased;
11075
11076    if (PyUnicode_READY(self) == -1)
11077        return NULL;
11078    length = PyUnicode_GET_LENGTH(self);
11079    kind = PyUnicode_KIND(self);
11080    data = PyUnicode_DATA(self);
11081
11082    /* Shortcut for single character strings */
11083    if (length == 1) {
11084        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11085        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11086                               (Py_UNICODE_ISUPPER(ch) != 0));
11087    }
11088
11089    /* Special case for empty strings */
11090    if (length == 0)
11091        return PyBool_FromLong(0);
11092
11093    cased = 0;
11094    previous_is_cased = 0;
11095    for (i = 0; i < length; i++) {
11096        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11097
11098        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11099            if (previous_is_cased)
11100                return PyBool_FromLong(0);
11101            previous_is_cased = 1;
11102            cased = 1;
11103        }
11104        else if (Py_UNICODE_ISLOWER(ch)) {
11105            if (!previous_is_cased)
11106                return PyBool_FromLong(0);
11107            previous_is_cased = 1;
11108            cased = 1;
11109        }
11110        else
11111            previous_is_cased = 0;
11112    }
11113    return PyBool_FromLong(cased);
11114}
11115
11116PyDoc_STRVAR(isspace__doc__,
11117             "S.isspace() -> bool\n\
11118\n\
11119Return True if all characters in S are whitespace\n\
11120and there is at least one character in S, False otherwise.");
11121
11122static PyObject*
11123unicode_isspace(PyObject *self)
11124{
11125    Py_ssize_t i, length;
11126    int kind;
11127    void *data;
11128
11129    if (PyUnicode_READY(self) == -1)
11130        return NULL;
11131    length = PyUnicode_GET_LENGTH(self);
11132    kind = PyUnicode_KIND(self);
11133    data = PyUnicode_DATA(self);
11134
11135    /* Shortcut for single character strings */
11136    if (length == 1)
11137        return PyBool_FromLong(
11138            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11139
11140    /* Special case for empty strings */
11141    if (length == 0)
11142        return PyBool_FromLong(0);
11143
11144    for (i = 0; i < length; i++) {
11145        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11146        if (!Py_UNICODE_ISSPACE(ch))
11147            return PyBool_FromLong(0);
11148    }
11149    return PyBool_FromLong(1);
11150}
11151
11152PyDoc_STRVAR(isalpha__doc__,
11153             "S.isalpha() -> bool\n\
11154\n\
11155Return True if all characters in S are alphabetic\n\
11156and there is at least one character in S, False otherwise.");
11157
11158static PyObject*
11159unicode_isalpha(PyObject *self)
11160{
11161    Py_ssize_t i, length;
11162    int kind;
11163    void *data;
11164
11165    if (PyUnicode_READY(self) == -1)
11166        return NULL;
11167    length = PyUnicode_GET_LENGTH(self);
11168    kind = PyUnicode_KIND(self);
11169    data = PyUnicode_DATA(self);
11170
11171    /* Shortcut for single character strings */
11172    if (length == 1)
11173        return PyBool_FromLong(
11174            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11175
11176    /* Special case for empty strings */
11177    if (length == 0)
11178        return PyBool_FromLong(0);
11179
11180    for (i = 0; i < length; i++) {
11181        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11182            return PyBool_FromLong(0);
11183    }
11184    return PyBool_FromLong(1);
11185}
11186
11187PyDoc_STRVAR(isalnum__doc__,
11188             "S.isalnum() -> bool\n\
11189\n\
11190Return True if all characters in S are alphanumeric\n\
11191and there is at least one character in S, False otherwise.");
11192
11193static PyObject*
11194unicode_isalnum(PyObject *self)
11195{
11196    int kind;
11197    void *data;
11198    Py_ssize_t len, i;
11199
11200    if (PyUnicode_READY(self) == -1)
11201        return NULL;
11202
11203    kind = PyUnicode_KIND(self);
11204    data = PyUnicode_DATA(self);
11205    len = PyUnicode_GET_LENGTH(self);
11206
11207    /* Shortcut for single character strings */
11208    if (len == 1) {
11209        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11210        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11211    }
11212
11213    /* Special case for empty strings */
11214    if (len == 0)
11215        return PyBool_FromLong(0);
11216
11217    for (i = 0; i < len; i++) {
11218        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11219        if (!Py_UNICODE_ISALNUM(ch))
11220            return PyBool_FromLong(0);
11221    }
11222    return PyBool_FromLong(1);
11223}
11224
11225PyDoc_STRVAR(isdecimal__doc__,
11226             "S.isdecimal() -> bool\n\
11227\n\
11228Return True if there are only decimal characters in S,\n\
11229False otherwise.");
11230
11231static PyObject*
11232unicode_isdecimal(PyObject *self)
11233{
11234    Py_ssize_t i, length;
11235    int kind;
11236    void *data;
11237
11238    if (PyUnicode_READY(self) == -1)
11239        return NULL;
11240    length = PyUnicode_GET_LENGTH(self);
11241    kind = PyUnicode_KIND(self);
11242    data = PyUnicode_DATA(self);
11243
11244    /* Shortcut for single character strings */
11245    if (length == 1)
11246        return PyBool_FromLong(
11247            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11248
11249    /* Special case for empty strings */
11250    if (length == 0)
11251        return PyBool_FromLong(0);
11252
11253    for (i = 0; i < length; i++) {
11254        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11255            return PyBool_FromLong(0);
11256    }
11257    return PyBool_FromLong(1);
11258}
11259
11260PyDoc_STRVAR(isdigit__doc__,
11261             "S.isdigit() -> bool\n\
11262\n\
11263Return True if all characters in S are digits\n\
11264and there is at least one character in S, False otherwise.");
11265
11266static PyObject*
11267unicode_isdigit(PyObject *self)
11268{
11269    Py_ssize_t i, length;
11270    int kind;
11271    void *data;
11272
11273    if (PyUnicode_READY(self) == -1)
11274        return NULL;
11275    length = PyUnicode_GET_LENGTH(self);
11276    kind = PyUnicode_KIND(self);
11277    data = PyUnicode_DATA(self);
11278
11279    /* Shortcut for single character strings */
11280    if (length == 1) {
11281        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11282        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11283    }
11284
11285    /* Special case for empty strings */
11286    if (length == 0)
11287        return PyBool_FromLong(0);
11288
11289    for (i = 0; i < length; i++) {
11290        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11291            return PyBool_FromLong(0);
11292    }
11293    return PyBool_FromLong(1);
11294}
11295
11296PyDoc_STRVAR(isnumeric__doc__,
11297             "S.isnumeric() -> bool\n\
11298\n\
11299Return True if there are only numeric characters in S,\n\
11300False otherwise.");
11301
11302static PyObject*
11303unicode_isnumeric(PyObject *self)
11304{
11305    Py_ssize_t i, length;
11306    int kind;
11307    void *data;
11308
11309    if (PyUnicode_READY(self) == -1)
11310        return NULL;
11311    length = PyUnicode_GET_LENGTH(self);
11312    kind = PyUnicode_KIND(self);
11313    data = PyUnicode_DATA(self);
11314
11315    /* Shortcut for single character strings */
11316    if (length == 1)
11317        return PyBool_FromLong(
11318            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11319
11320    /* Special case for empty strings */
11321    if (length == 0)
11322        return PyBool_FromLong(0);
11323
11324    for (i = 0; i < length; i++) {
11325        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11326            return PyBool_FromLong(0);
11327    }
11328    return PyBool_FromLong(1);
11329}
11330
11331int
11332PyUnicode_IsIdentifier(PyObject *self)
11333{
11334    int kind;
11335    void *data;
11336    Py_ssize_t i;
11337    Py_UCS4 first;
11338
11339    if (PyUnicode_READY(self) == -1) {
11340        Py_FatalError("identifier not ready");
11341        return 0;
11342    }
11343
11344    /* Special case for empty strings */
11345    if (PyUnicode_GET_LENGTH(self) == 0)
11346        return 0;
11347    kind = PyUnicode_KIND(self);
11348    data = PyUnicode_DATA(self);
11349
11350    /* PEP 3131 says that the first character must be in
11351       XID_Start and subsequent characters in XID_Continue,
11352       and for the ASCII range, the 2.x rules apply (i.e
11353       start with letters and underscore, continue with
11354       letters, digits, underscore). However, given the current
11355       definition of XID_Start and XID_Continue, it is sufficient
11356       to check just for these, except that _ must be allowed
11357       as starting an identifier.  */
11358    first = PyUnicode_READ(kind, data, 0);
11359    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11360        return 0;
11361
11362    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11363        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11364            return 0;
11365    return 1;
11366}
11367
11368PyDoc_STRVAR(isidentifier__doc__,
11369             "S.isidentifier() -> bool\n\
11370\n\
11371Return True if S is a valid identifier according\n\
11372to the language definition.");
11373
11374static PyObject*
11375unicode_isidentifier(PyObject *self)
11376{
11377    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11378}
11379
11380PyDoc_STRVAR(isprintable__doc__,
11381             "S.isprintable() -> bool\n\
11382\n\
11383Return True if all characters in S are considered\n\
11384printable in repr() or S is empty, False otherwise.");
11385
11386static PyObject*
11387unicode_isprintable(PyObject *self)
11388{
11389    Py_ssize_t i, length;
11390    int kind;
11391    void *data;
11392
11393    if (PyUnicode_READY(self) == -1)
11394        return NULL;
11395    length = PyUnicode_GET_LENGTH(self);
11396    kind = PyUnicode_KIND(self);
11397    data = PyUnicode_DATA(self);
11398
11399    /* Shortcut for single character strings */
11400    if (length == 1)
11401        return PyBool_FromLong(
11402            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11403
11404    for (i = 0; i < length; i++) {
11405        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11406            Py_RETURN_FALSE;
11407        }
11408    }
11409    Py_RETURN_TRUE;
11410}
11411
11412PyDoc_STRVAR(join__doc__,
11413             "S.join(iterable) -> str\n\
11414\n\
11415Return a string which is the concatenation of the strings in the\n\
11416iterable.  The separator between elements is S.");
11417
11418static PyObject*
11419unicode_join(PyObject *self, PyObject *data)
11420{
11421    return PyUnicode_Join(self, data);
11422}
11423
11424static Py_ssize_t
11425unicode_length(PyObject *self)
11426{
11427    if (PyUnicode_READY(self) == -1)
11428        return -1;
11429    return PyUnicode_GET_LENGTH(self);
11430}
11431
11432PyDoc_STRVAR(ljust__doc__,
11433             "S.ljust(width[, fillchar]) -> str\n\
11434\n\
11435Return S left-justified in a Unicode string of length width. Padding is\n\
11436done using the specified fill character (default is a space).");
11437
11438static PyObject *
11439unicode_ljust(PyObject *self, PyObject *args)
11440{
11441    Py_ssize_t width;
11442    Py_UCS4 fillchar = ' ';
11443
11444    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11445        return NULL;
11446
11447    if (PyUnicode_READY(self) == -1)
11448        return NULL;
11449
11450    if (PyUnicode_GET_LENGTH(self) >= width)
11451        return unicode_result_unchanged(self);
11452
11453    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11454}
11455
11456PyDoc_STRVAR(lower__doc__,
11457             "S.lower() -> str\n\
11458\n\
11459Return a copy of the string S converted to lowercase.");
11460
11461static PyObject*
11462unicode_lower(PyObject *self)
11463{
11464    if (PyUnicode_READY(self) == -1)
11465        return NULL;
11466    if (PyUnicode_IS_ASCII(self))
11467        return ascii_upper_or_lower(self, 1);
11468    return case_operation(self, do_lower);
11469}
11470
/* Strip modes: which end(s) of the string get trimmed.  The values are
   also used as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip mode above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11479
11480/* externally visible for str.strip(unicode) */
11481PyObject *
11482_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11483{
11484    void *data;
11485    int kind;
11486    Py_ssize_t i, j, len;
11487    BLOOM_MASK sepmask;
11488
11489    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11490        return NULL;
11491
11492    kind = PyUnicode_KIND(self);
11493    data = PyUnicode_DATA(self);
11494    len = PyUnicode_GET_LENGTH(self);
11495    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11496                              PyUnicode_DATA(sepobj),
11497                              PyUnicode_GET_LENGTH(sepobj));
11498
11499    i = 0;
11500    if (striptype != RIGHTSTRIP) {
11501        while (i < len &&
11502               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11503            i++;
11504        }
11505    }
11506
11507    j = len;
11508    if (striptype != LEFTSTRIP) {
11509        do {
11510            j--;
11511        } while (j >= i &&
11512                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11513        j++;
11514    }
11515
11516    return PyUnicode_Substring(self, i, j);
11517}
11518
11519PyObject*
11520PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11521{
11522    unsigned char *data;
11523    int kind;
11524    Py_ssize_t length;
11525
11526    if (PyUnicode_READY(self) == -1)
11527        return NULL;
11528
11529    length = PyUnicode_GET_LENGTH(self);
11530    end = Py_MIN(end, length);
11531
11532    if (start == 0 && end == length)
11533        return unicode_result_unchanged(self);
11534
11535    if (start < 0 || end < 0) {
11536        PyErr_SetString(PyExc_IndexError, "string index out of range");
11537        return NULL;
11538    }
11539    if (start >= length || end < start) {
11540        Py_INCREF(unicode_empty);
11541        return unicode_empty;
11542    }
11543
11544    length = end - start;
11545    if (PyUnicode_IS_ASCII(self)) {
11546        data = PyUnicode_1BYTE_DATA(self);
11547        return _PyUnicode_FromASCII((char*)(data + start), length);
11548    }
11549    else {
11550        kind = PyUnicode_KIND(self);
11551        data = PyUnicode_1BYTE_DATA(self);
11552        return PyUnicode_FromKindAndData(kind,
11553                                         data + kind * start,
11554                                         length);
11555    }
11556}
11557
11558static PyObject *
11559do_strip(PyObject *self, int striptype)
11560{
11561    int kind;
11562    void *data;
11563    Py_ssize_t len, i, j;
11564
11565    if (PyUnicode_READY(self) == -1)
11566        return NULL;
11567
11568    kind = PyUnicode_KIND(self);
11569    data = PyUnicode_DATA(self);
11570    len = PyUnicode_GET_LENGTH(self);
11571
11572    i = 0;
11573    if (striptype != RIGHTSTRIP) {
11574        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11575            i++;
11576        }
11577    }
11578
11579    j = len;
11580    if (striptype != LEFTSTRIP) {
11581        do {
11582            j--;
11583        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11584        j++;
11585    }
11586
11587    return PyUnicode_Substring(self, i, j);
11588}
11589
11590
11591static PyObject *
11592do_argstrip(PyObject *self, int striptype, PyObject *args)
11593{
11594    PyObject *sep = NULL;
11595
11596    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11597        return NULL;
11598
11599    if (sep != NULL && sep != Py_None) {
11600        if (PyUnicode_Check(sep))
11601            return _PyUnicode_XStrip(self, striptype, sep);
11602        else {
11603            PyErr_Format(PyExc_TypeError,
11604                         "%s arg must be None or str",
11605                         STRIPNAME(striptype));
11606            return NULL;
11607        }
11608    }
11609
11610    return do_strip(self, striptype);
11611}
11612
11613
11614PyDoc_STRVAR(strip__doc__,
11615             "S.strip([chars]) -> str\n\
11616\n\
11617Return a copy of the string S with leading and trailing\n\
11618whitespace removed.\n\
11619If chars is given and not None, remove characters in chars instead.");
11620
11621static PyObject *
11622unicode_strip(PyObject *self, PyObject *args)
11623{
11624    if (PyTuple_GET_SIZE(args) == 0)
11625        return do_strip(self, BOTHSTRIP); /* Common case */
11626    else
11627        return do_argstrip(self, BOTHSTRIP, args);
11628}
11629
11630
11631PyDoc_STRVAR(lstrip__doc__,
11632             "S.lstrip([chars]) -> str\n\
11633\n\
11634Return a copy of the string S with leading whitespace removed.\n\
11635If chars is given and not None, remove characters in chars instead.");
11636
11637static PyObject *
11638unicode_lstrip(PyObject *self, PyObject *args)
11639{
11640    if (PyTuple_GET_SIZE(args) == 0)
11641        return do_strip(self, LEFTSTRIP); /* Common case */
11642    else
11643        return do_argstrip(self, LEFTSTRIP, args);
11644}
11645
11646
11647PyDoc_STRVAR(rstrip__doc__,
11648             "S.rstrip([chars]) -> str\n\
11649\n\
11650Return a copy of the string S with trailing whitespace removed.\n\
11651If chars is given and not None, remove characters in chars instead.");
11652
11653static PyObject *
11654unicode_rstrip(PyObject *self, PyObject *args)
11655{
11656    if (PyTuple_GET_SIZE(args) == 0)
11657        return do_strip(self, RIGHTSTRIP); /* Common case */
11658    else
11659        return do_argstrip(self, RIGHTSTRIP, args);
11660}
11661
11662
11663static PyObject*
11664unicode_repeat(PyObject *str, Py_ssize_t len)
11665{
11666    PyObject *u;
11667    Py_ssize_t nchars, n;
11668
11669    if (len < 1) {
11670        Py_INCREF(unicode_empty);
11671        return unicode_empty;
11672    }
11673
11674    /* no repeat, return original string */
11675    if (len == 1)
11676        return unicode_result_unchanged(str);
11677
11678    if (PyUnicode_READY(str) == -1)
11679        return NULL;
11680
11681    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11682        PyErr_SetString(PyExc_OverflowError,
11683                        "repeated string is too long");
11684        return NULL;
11685    }
11686    nchars = len * PyUnicode_GET_LENGTH(str);
11687
11688    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11689    if (!u)
11690        return NULL;
11691    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11692
11693    if (PyUnicode_GET_LENGTH(str) == 1) {
11694        const int kind = PyUnicode_KIND(str);
11695        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11696        if (kind == PyUnicode_1BYTE_KIND) {
11697            void *to = PyUnicode_DATA(u);
11698            memset(to, (unsigned char)fill_char, len);
11699        }
11700        else if (kind == PyUnicode_2BYTE_KIND) {
11701            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
11702            for (n = 0; n < len; ++n)
11703                ucs2[n] = fill_char;
11704        } else {
11705            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11706            assert(kind == PyUnicode_4BYTE_KIND);
11707            for (n = 0; n < len; ++n)
11708                ucs4[n] = fill_char;
11709        }
11710    }
11711    else {
11712        /* number of characters copied this far */
11713        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11714        const Py_ssize_t char_size = PyUnicode_KIND(str);
11715        char *to = (char *) PyUnicode_DATA(u);
11716        Py_MEMCPY(to, PyUnicode_DATA(str),
11717                  PyUnicode_GET_LENGTH(str) * char_size);
11718        while (done < nchars) {
11719            n = (done <= nchars-done) ? done : nchars-done;
11720            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11721            done += n;
11722        }
11723    }
11724
11725    assert(_PyUnicode_CheckConsistency(u, 1));
11726    return u;
11727}
11728
11729PyObject *
11730PyUnicode_Replace(PyObject *obj,
11731                  PyObject *subobj,
11732                  PyObject *replobj,
11733                  Py_ssize_t maxcount)
11734{
11735    PyObject *self;
11736    PyObject *str1;
11737    PyObject *str2;
11738    PyObject *result;
11739
11740    self = PyUnicode_FromObject(obj);
11741    if (self == NULL)
11742        return NULL;
11743    str1 = PyUnicode_FromObject(subobj);
11744    if (str1 == NULL) {
11745        Py_DECREF(self);
11746        return NULL;
11747    }
11748    str2 = PyUnicode_FromObject(replobj);
11749    if (str2 == NULL) {
11750        Py_DECREF(self);
11751        Py_DECREF(str1);
11752        return NULL;
11753    }
11754    if (PyUnicode_READY(self) == -1 ||
11755        PyUnicode_READY(str1) == -1 ||
11756        PyUnicode_READY(str2) == -1)
11757        result = NULL;
11758    else
11759        result = replace(self, str1, str2, maxcount);
11760    Py_DECREF(self);
11761    Py_DECREF(str1);
11762    Py_DECREF(str2);
11763    return result;
11764}
11765
11766PyDoc_STRVAR(replace__doc__,
11767             "S.replace(old, new[, count]) -> str\n\
11768\n\
11769Return a copy of S with all occurrences of substring\n\
11770old replaced by new.  If the optional argument count is\n\
11771given, only the first count occurrences are replaced.");
11772
11773static PyObject*
11774unicode_replace(PyObject *self, PyObject *args)
11775{
11776    PyObject *str1;
11777    PyObject *str2;
11778    Py_ssize_t maxcount = -1;
11779    PyObject *result;
11780
11781    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11782        return NULL;
11783    if (PyUnicode_READY(self) == -1)
11784        return NULL;
11785    str1 = PyUnicode_FromObject(str1);
11786    if (str1 == NULL)
11787        return NULL;
11788    str2 = PyUnicode_FromObject(str2);
11789    if (str2 == NULL) {
11790        Py_DECREF(str1);
11791        return NULL;
11792    }
11793    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11794        result = NULL;
11795    else
11796        result = replace(self, str1, str2, maxcount);
11797
11798    Py_DECREF(str1);
11799    Py_DECREF(str2);
11800    return result;
11801}
11802
11803static PyObject *
11804unicode_repr(PyObject *unicode)
11805{
11806    PyObject *repr;
11807    Py_ssize_t isize;
11808    Py_ssize_t osize, squote, dquote, i, o;
11809    Py_UCS4 max, quote;
11810    int ikind, okind;
11811    void *idata, *odata;
11812
11813    if (PyUnicode_READY(unicode) == -1)
11814        return NULL;
11815
11816    isize = PyUnicode_GET_LENGTH(unicode);
11817    idata = PyUnicode_DATA(unicode);
11818
11819    /* Compute length of output, quote characters, and
11820       maximum character */
11821    osize = 2; /* quotes */
11822    max = 127;
11823    squote = dquote = 0;
11824    ikind = PyUnicode_KIND(unicode);
11825    for (i = 0; i < isize; i++) {
11826        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11827        switch (ch) {
11828        case '\'': squote++; osize++; break;
11829        case '"':  dquote++; osize++; break;
11830        case '\\': case '\t': case '\r': case '\n':
11831            osize += 2; break;
11832        default:
11833            /* Fast-path ASCII */
11834            if (ch < ' ' || ch == 0x7f)
11835                osize += 4; /* \xHH */
11836            else if (ch < 0x7f)
11837                osize++;
11838            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11839                osize++;
11840                max = ch > max ? ch : max;
11841            }
11842            else if (ch < 0x100)
11843                osize += 4; /* \xHH */
11844            else if (ch < 0x10000)
11845                osize += 6; /* \uHHHH */
11846            else
11847                osize += 10; /* \uHHHHHHHH */
11848        }
11849    }
11850
11851    quote = '\'';
11852    if (squote) {
11853        if (dquote)
11854            /* Both squote and dquote present. Use squote,
11855               and escape them */
11856            osize += squote;
11857        else
11858            quote = '"';
11859    }
11860
11861    repr = PyUnicode_New(osize, max);
11862    if (repr == NULL)
11863        return NULL;
11864    okind = PyUnicode_KIND(repr);
11865    odata = PyUnicode_DATA(repr);
11866
11867    PyUnicode_WRITE(okind, odata, 0, quote);
11868    PyUnicode_WRITE(okind, odata, osize-1, quote);
11869
11870    for (i = 0, o = 1; i < isize; i++) {
11871        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11872
11873        /* Escape quotes and backslashes */
11874        if ((ch == quote) || (ch == '\\')) {
11875            PyUnicode_WRITE(okind, odata, o++, '\\');
11876            PyUnicode_WRITE(okind, odata, o++, ch);
11877            continue;
11878        }
11879
11880        /* Map special whitespace to '\t', \n', '\r' */
11881        if (ch == '\t') {
11882            PyUnicode_WRITE(okind, odata, o++, '\\');
11883            PyUnicode_WRITE(okind, odata, o++, 't');
11884        }
11885        else if (ch == '\n') {
11886            PyUnicode_WRITE(okind, odata, o++, '\\');
11887            PyUnicode_WRITE(okind, odata, o++, 'n');
11888        }
11889        else if (ch == '\r') {
11890            PyUnicode_WRITE(okind, odata, o++, '\\');
11891            PyUnicode_WRITE(okind, odata, o++, 'r');
11892        }
11893
11894        /* Map non-printable US ASCII to '\xhh' */
11895        else if (ch < ' ' || ch == 0x7F) {
11896            PyUnicode_WRITE(okind, odata, o++, '\\');
11897            PyUnicode_WRITE(okind, odata, o++, 'x');
11898            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11899            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
11900        }
11901
11902        /* Copy ASCII characters as-is */
11903        else if (ch < 0x7F) {
11904            PyUnicode_WRITE(okind, odata, o++, ch);
11905        }
11906
11907        /* Non-ASCII characters */
11908        else {
11909            /* Map Unicode whitespace and control characters
11910               (categories Z* and C* except ASCII space)
11911            */
11912            if (!Py_UNICODE_ISPRINTABLE(ch)) {
11913                PyUnicode_WRITE(okind, odata, o++, '\\');
11914                /* Map 8-bit characters to '\xhh' */
11915                if (ch <= 0xff) {
11916                    PyUnicode_WRITE(okind, odata, o++, 'x');
11917                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11918                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
11919                }
11920                /* Map 16-bit characters to '\uxxxx' */
11921                else if (ch <= 0xffff) {
11922                    PyUnicode_WRITE(okind, odata, o++, 'u');
11923                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11924                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11925                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11926                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
11927                }
11928                /* Map 21-bit characters to '\U00xxxxxx' */
11929                else {
11930                    PyUnicode_WRITE(okind, odata, o++, 'U');
11931                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11932                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11933                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11934                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11935                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11936                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11937                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11938                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
11939                }
11940            }
11941            /* Copy characters as-is */
11942            else {
11943                PyUnicode_WRITE(okind, odata, o++, ch);
11944            }
11945        }
11946    }
11947    /* Closing quote already added at the beginning */
11948    assert(_PyUnicode_CheckConsistency(repr, 1));
11949    return repr;
11950}
11951
11952PyDoc_STRVAR(rfind__doc__,
11953             "S.rfind(sub[, start[, end]]) -> int\n\
11954\n\
11955Return the highest index in S where substring sub is found,\n\
11956such that sub is contained within S[start:end].  Optional\n\
11957arguments start and end are interpreted as in slice notation.\n\
11958\n\
11959Return -1 on failure.");
11960
11961static PyObject *
11962unicode_rfind(PyObject *self, PyObject *args)
11963{
11964    PyObject *substring;
11965    Py_ssize_t start;
11966    Py_ssize_t end;
11967    Py_ssize_t result;
11968
11969    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11970                                            &start, &end))
11971        return NULL;
11972
11973    if (PyUnicode_READY(self) == -1)
11974        return NULL;
11975    if (PyUnicode_READY(substring) == -1)
11976        return NULL;
11977
11978    result = any_find_slice(-1, self, substring, start, end);
11979
11980    Py_DECREF(substring);
11981
11982    if (result == -2)
11983        return NULL;
11984
11985    return PyLong_FromSsize_t(result);
11986}
11987
11988PyDoc_STRVAR(rindex__doc__,
11989             "S.rindex(sub[, start[, end]]) -> int\n\
11990\n\
11991Like S.rfind() but raise ValueError when the substring is not found.");
11992
11993static PyObject *
11994unicode_rindex(PyObject *self, PyObject *args)
11995{
11996    PyObject *substring;
11997    Py_ssize_t start;
11998    Py_ssize_t end;
11999    Py_ssize_t result;
12000
12001    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12002                                            &start, &end))
12003        return NULL;
12004
12005    if (PyUnicode_READY(self) == -1)
12006        return NULL;
12007    if (PyUnicode_READY(substring) == -1)
12008        return NULL;
12009
12010    result = any_find_slice(-1, self, substring, start, end);
12011
12012    Py_DECREF(substring);
12013
12014    if (result == -2)
12015        return NULL;
12016
12017    if (result < 0) {
12018        PyErr_SetString(PyExc_ValueError, "substring not found");
12019        return NULL;
12020    }
12021
12022    return PyLong_FromSsize_t(result);
12023}
12024
12025PyDoc_STRVAR(rjust__doc__,
12026             "S.rjust(width[, fillchar]) -> str\n\
12027\n\
12028Return S right-justified in a string of length width. Padding is\n\
12029done using the specified fill character (default is a space).");
12030
12031static PyObject *
12032unicode_rjust(PyObject *self, PyObject *args)
12033{
12034    Py_ssize_t width;
12035    Py_UCS4 fillchar = ' ';
12036
12037    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12038        return NULL;
12039
12040    if (PyUnicode_READY(self) == -1)
12041        return NULL;
12042
12043    if (PyUnicode_GET_LENGTH(self) >= width)
12044        return unicode_result_unchanged(self);
12045
12046    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12047}
12048
12049PyObject *
12050PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12051{
12052    PyObject *result;
12053
12054    s = PyUnicode_FromObject(s);
12055    if (s == NULL)
12056        return NULL;
12057    if (sep != NULL) {
12058        sep = PyUnicode_FromObject(sep);
12059        if (sep == NULL) {
12060            Py_DECREF(s);
12061            return NULL;
12062        }
12063    }
12064
12065    result = split(s, sep, maxsplit);
12066
12067    Py_DECREF(s);
12068    Py_XDECREF(sep);
12069    return result;
12070}
12071
12072PyDoc_STRVAR(split__doc__,
12073             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12074\n\
12075Return a list of the words in S, using sep as the\n\
12076delimiter string.  If maxsplit is given, at most maxsplit\n\
12077splits are done. If sep is not specified or is None, any\n\
12078whitespace string is a separator and empty strings are\n\
12079removed from the result.");
12080
12081static PyObject*
12082unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12083{
12084    static char *kwlist[] = {"sep", "maxsplit", 0};
12085    PyObject *substring = Py_None;
12086    Py_ssize_t maxcount = -1;
12087
12088    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12089                                     kwlist, &substring, &maxcount))
12090        return NULL;
12091
12092    if (substring == Py_None)
12093        return split(self, NULL, maxcount);
12094    else if (PyUnicode_Check(substring))
12095        return split(self, substring, maxcount);
12096    else
12097        return PyUnicode_Split(self, substring, maxcount);
12098}
12099
12100PyObject *
12101PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12102{
12103    PyObject* str_obj;
12104    PyObject* sep_obj;
12105    PyObject* out;
12106    int kind1, kind2, kind;
12107    void *buf1 = NULL, *buf2 = NULL;
12108    Py_ssize_t len1, len2;
12109
12110    str_obj = PyUnicode_FromObject(str_in);
12111    if (!str_obj)
12112        return NULL;
12113    sep_obj = PyUnicode_FromObject(sep_in);
12114    if (!sep_obj) {
12115        Py_DECREF(str_obj);
12116        return NULL;
12117    }
12118    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12119        Py_DECREF(sep_obj);
12120        Py_DECREF(str_obj);
12121        return NULL;
12122    }
12123
12124    kind1 = PyUnicode_KIND(str_obj);
12125    kind2 = PyUnicode_KIND(sep_obj);
12126    kind = Py_MAX(kind1, kind2);
12127    buf1 = PyUnicode_DATA(str_obj);
12128    if (kind1 != kind)
12129        buf1 = _PyUnicode_AsKind(str_obj, kind);
12130    if (!buf1)
12131        goto onError;
12132    buf2 = PyUnicode_DATA(sep_obj);
12133    if (kind2 != kind)
12134        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12135    if (!buf2)
12136        goto onError;
12137    len1 = PyUnicode_GET_LENGTH(str_obj);
12138    len2 = PyUnicode_GET_LENGTH(sep_obj);
12139
12140    switch (PyUnicode_KIND(str_obj)) {
12141    case PyUnicode_1BYTE_KIND:
12142        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12143            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12144        else
12145            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12146        break;
12147    case PyUnicode_2BYTE_KIND:
12148        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12149        break;
12150    case PyUnicode_4BYTE_KIND:
12151        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12152        break;
12153    default:
12154        assert(0);
12155        out = 0;
12156    }
12157
12158    Py_DECREF(sep_obj);
12159    Py_DECREF(str_obj);
12160    if (kind1 != kind)
12161        PyMem_Free(buf1);
12162    if (kind2 != kind)
12163        PyMem_Free(buf2);
12164
12165    return out;
12166  onError:
12167    Py_DECREF(sep_obj);
12168    Py_DECREF(str_obj);
12169    if (kind1 != kind && buf1)
12170        PyMem_Free(buf1);
12171    if (kind2 != kind && buf2)
12172        PyMem_Free(buf2);
12173    return NULL;
12174}
12175
12176
12177PyObject *
12178PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12179{
12180    PyObject* str_obj;
12181    PyObject* sep_obj;
12182    PyObject* out;
12183    int kind1, kind2, kind;
12184    void *buf1 = NULL, *buf2 = NULL;
12185    Py_ssize_t len1, len2;
12186
12187    str_obj = PyUnicode_FromObject(str_in);
12188    if (!str_obj)
12189        return NULL;
12190    sep_obj = PyUnicode_FromObject(sep_in);
12191    if (!sep_obj) {
12192        Py_DECREF(str_obj);
12193        return NULL;
12194    }
12195
12196    kind1 = PyUnicode_KIND(str_in);
12197    kind2 = PyUnicode_KIND(sep_obj);
12198    kind = Py_MAX(kind1, kind2);
12199    buf1 = PyUnicode_DATA(str_in);
12200    if (kind1 != kind)
12201        buf1 = _PyUnicode_AsKind(str_in, kind);
12202    if (!buf1)
12203        goto onError;
12204    buf2 = PyUnicode_DATA(sep_obj);
12205    if (kind2 != kind)
12206        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12207    if (!buf2)
12208        goto onError;
12209    len1 = PyUnicode_GET_LENGTH(str_obj);
12210    len2 = PyUnicode_GET_LENGTH(sep_obj);
12211
12212    switch (PyUnicode_KIND(str_in)) {
12213    case PyUnicode_1BYTE_KIND:
12214        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12215            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12216        else
12217            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12218        break;
12219    case PyUnicode_2BYTE_KIND:
12220        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12221        break;
12222    case PyUnicode_4BYTE_KIND:
12223        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12224        break;
12225    default:
12226        assert(0);
12227        out = 0;
12228    }
12229
12230    Py_DECREF(sep_obj);
12231    Py_DECREF(str_obj);
12232    if (kind1 != kind)
12233        PyMem_Free(buf1);
12234    if (kind2 != kind)
12235        PyMem_Free(buf2);
12236
12237    return out;
12238  onError:
12239    Py_DECREF(sep_obj);
12240    Py_DECREF(str_obj);
12241    if (kind1 != kind && buf1)
12242        PyMem_Free(buf1);
12243    if (kind2 != kind && buf2)
12244        PyMem_Free(buf2);
12245    return NULL;
12246}
12247
12248PyDoc_STRVAR(partition__doc__,
12249             "S.partition(sep) -> (head, sep, tail)\n\
12250\n\
12251Search for the separator sep in S, and return the part before it,\n\
12252the separator itself, and the part after it.  If the separator is not\n\
12253found, return S and two empty strings.");
12254
12255static PyObject*
12256unicode_partition(PyObject *self, PyObject *separator)
12257{
12258    return PyUnicode_Partition(self, separator);
12259}
12260
12261PyDoc_STRVAR(rpartition__doc__,
12262             "S.rpartition(sep) -> (head, sep, tail)\n\
12263\n\
12264Search for the separator sep in S, starting at the end of S, and return\n\
12265the part before it, the separator itself, and the part after it.  If the\n\
12266separator is not found, return two empty strings and S.");
12267
12268static PyObject*
12269unicode_rpartition(PyObject *self, PyObject *separator)
12270{
12271    return PyUnicode_RPartition(self, separator);
12272}
12273
12274PyObject *
12275PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12276{
12277    PyObject *result;
12278
12279    s = PyUnicode_FromObject(s);
12280    if (s == NULL)
12281        return NULL;
12282    if (sep != NULL) {
12283        sep = PyUnicode_FromObject(sep);
12284        if (sep == NULL) {
12285            Py_DECREF(s);
12286            return NULL;
12287        }
12288    }
12289
12290    result = rsplit(s, sep, maxsplit);
12291
12292    Py_DECREF(s);
12293    Py_XDECREF(sep);
12294    return result;
12295}
12296
12297PyDoc_STRVAR(rsplit__doc__,
12298             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12299\n\
12300Return a list of the words in S, using sep as the\n\
12301delimiter string, starting at the end of the string and\n\
12302working to the front.  If maxsplit is given, at most maxsplit\n\
12303splits are done. If sep is not specified, any whitespace string\n\
12304is a separator.");
12305
12306static PyObject*
12307unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12308{
12309    static char *kwlist[] = {"sep", "maxsplit", 0};
12310    PyObject *substring = Py_None;
12311    Py_ssize_t maxcount = -1;
12312
12313    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12314                                     kwlist, &substring, &maxcount))
12315        return NULL;
12316
12317    if (substring == Py_None)
12318        return rsplit(self, NULL, maxcount);
12319    else if (PyUnicode_Check(substring))
12320        return rsplit(self, substring, maxcount);
12321    else
12322        return PyUnicode_RSplit(self, substring, maxcount);
12323}
12324
12325PyDoc_STRVAR(splitlines__doc__,
12326             "S.splitlines([keepends]) -> list of strings\n\
12327\n\
12328Return a list of the lines in S, breaking at line boundaries.\n\
12329Line breaks are not included in the resulting list unless keepends\n\
12330is given and true.");
12331
12332static PyObject*
12333unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12334{
12335    static char *kwlist[] = {"keepends", 0};
12336    int keepends = 0;
12337
12338    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12339                                     kwlist, &keepends))
12340        return NULL;
12341
12342    return PyUnicode_Splitlines(self, keepends);
12343}
12344
12345static
12346PyObject *unicode_str(PyObject *self)
12347{
12348    return unicode_result_unchanged(self);
12349}
12350
12351PyDoc_STRVAR(swapcase__doc__,
12352             "S.swapcase() -> str\n\
12353\n\
12354Return a copy of S with uppercase characters converted to lowercase\n\
12355and vice versa.");
12356
12357static PyObject*
12358unicode_swapcase(PyObject *self)
12359{
12360    if (PyUnicode_READY(self) == -1)
12361        return NULL;
12362    return case_operation(self, do_swapcase);
12363}
12364
12365PyDoc_STRVAR(maketrans__doc__,
12366             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12367\n\
12368Return a translation table usable for str.translate().\n\
12369If there is only one argument, it must be a dictionary mapping Unicode\n\
12370ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12371Character keys will be then converted to ordinals.\n\
12372If there are two arguments, they must be strings of equal length, and\n\
12373in the resulting dictionary, each character in x will be mapped to the\n\
12374character at the same position in y. If there is a third argument, it\n\
12375must be a string, whose characters will be mapped to None in the result.");
12376
12377static PyObject*
12378unicode_maketrans(PyObject *null, PyObject *args)
12379{
12380    PyObject *x, *y = NULL, *z = NULL;
12381    PyObject *new = NULL, *key, *value;
12382    Py_ssize_t i = 0;
12383    int res;
12384
12385    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12386        return NULL;
12387    new = PyDict_New();
12388    if (!new)
12389        return NULL;
12390    if (y != NULL) {
12391        int x_kind, y_kind, z_kind;
12392        void *x_data, *y_data, *z_data;
12393
12394        /* x must be a string too, of equal length */
12395        if (!PyUnicode_Check(x)) {
12396            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12397                            "be a string if there is a second argument");
12398            goto err;
12399        }
12400        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12401            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12402                            "arguments must have equal length");
12403            goto err;
12404        }
12405        /* create entries for translating chars in x to those in y */
12406        x_kind = PyUnicode_KIND(x);
12407        y_kind = PyUnicode_KIND(y);
12408        x_data = PyUnicode_DATA(x);
12409        y_data = PyUnicode_DATA(y);
12410        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12411            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12412            if (!key)
12413                goto err;
12414            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12415            if (!value) {
12416                Py_DECREF(key);
12417                goto err;
12418            }
12419            res = PyDict_SetItem(new, key, value);
12420            Py_DECREF(key);
12421            Py_DECREF(value);
12422            if (res < 0)
12423                goto err;
12424        }
12425        /* create entries for deleting chars in z */
12426        if (z != NULL) {
12427            z_kind = PyUnicode_KIND(z);
12428            z_data = PyUnicode_DATA(z);
12429            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12430                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12431                if (!key)
12432                    goto err;
12433                res = PyDict_SetItem(new, key, Py_None);
12434                Py_DECREF(key);
12435                if (res < 0)
12436                    goto err;
12437            }
12438        }
12439    } else {
12440        int kind;
12441        void *data;
12442
12443        /* x must be a dict */
12444        if (!PyDict_CheckExact(x)) {
12445            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12446                            "to maketrans it must be a dict");
12447            goto err;
12448        }
12449        /* copy entries into the new dict, converting string keys to int keys */
12450        while (PyDict_Next(x, &i, &key, &value)) {
12451            if (PyUnicode_Check(key)) {
12452                /* convert string keys to integer keys */
12453                PyObject *newkey;
12454                if (PyUnicode_GET_LENGTH(key) != 1) {
12455                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12456                                    "table must be of length 1");
12457                    goto err;
12458                }
12459                kind = PyUnicode_KIND(key);
12460                data = PyUnicode_DATA(key);
12461                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12462                if (!newkey)
12463                    goto err;
12464                res = PyDict_SetItem(new, newkey, value);
12465                Py_DECREF(newkey);
12466                if (res < 0)
12467                    goto err;
12468            } else if (PyLong_Check(key)) {
12469                /* just keep integer keys */
12470                if (PyDict_SetItem(new, key, value) < 0)
12471                    goto err;
12472            } else {
12473                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12474                                "be strings or integers");
12475                goto err;
12476            }
12477        }
12478    }
12479    return new;
12480  err:
12481    Py_DECREF(new);
12482    return NULL;
12483}
12484
12485PyDoc_STRVAR(translate__doc__,
12486             "S.translate(table) -> str\n\
12487\n\
12488Return a copy of the string S, where all characters have been mapped\n\
12489through the given translation table, which must be a mapping of\n\
12490Unicode ordinals to Unicode ordinals, strings, or None.\n\
12491Unmapped characters are left untouched. Characters mapped to None\n\
12492are deleted.");
12493
12494static PyObject*
12495unicode_translate(PyObject *self, PyObject *table)
12496{
12497    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12498}
12499
12500PyDoc_STRVAR(upper__doc__,
12501             "S.upper() -> str\n\
12502\n\
12503Return a copy of S converted to uppercase.");
12504
12505static PyObject*
12506unicode_upper(PyObject *self)
12507{
12508    if (PyUnicode_READY(self) == -1)
12509        return NULL;
12510    if (PyUnicode_IS_ASCII(self))
12511        return ascii_upper_or_lower(self, 0);
12512    return case_operation(self, do_upper);
12513}
12514
12515PyDoc_STRVAR(zfill__doc__,
12516             "S.zfill(width) -> str\n\
12517\n\
12518Pad a numeric string S with zeros on the left, to fill a field\n\
12519of the specified width. The string S is never truncated.");
12520
12521static PyObject *
12522unicode_zfill(PyObject *self, PyObject *args)
12523{
12524    Py_ssize_t fill;
12525    PyObject *u;
12526    Py_ssize_t width;
12527    int kind;
12528    void *data;
12529    Py_UCS4 chr;
12530
12531    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12532        return NULL;
12533
12534    if (PyUnicode_READY(self) == -1)
12535        return NULL;
12536
12537    if (PyUnicode_GET_LENGTH(self) >= width)
12538        return unicode_result_unchanged(self);
12539
12540    fill = width - PyUnicode_GET_LENGTH(self);
12541
12542    u = pad(self, fill, 0, '0');
12543
12544    if (u == NULL)
12545        return NULL;
12546
12547    kind = PyUnicode_KIND(u);
12548    data = PyUnicode_DATA(u);
12549    chr = PyUnicode_READ(kind, data, fill);
12550
12551    if (chr == '+' || chr == '-') {
12552        /* move sign to beginning of string */
12553        PyUnicode_WRITE(kind, data, 0, chr);
12554        PyUnicode_WRITE(kind, data, fill, '0');
12555    }
12556
12557    assert(_PyUnicode_CheckConsistency(u, 1));
12558    return u;
12559}
12560
#if 0
/* Compiled out.  Debugging-only wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(); its only user is the
   "_decimal2ascii" entry of unicode_methods, which is disabled too. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12568
12569PyDoc_STRVAR(startswith__doc__,
12570             "S.startswith(prefix[, start[, end]]) -> bool\n\
12571\n\
12572Return True if S starts with the specified prefix, False otherwise.\n\
12573With optional start, test S beginning at that position.\n\
12574With optional end, stop comparing S at that position.\n\
12575prefix can also be a tuple of strings to try.");
12576
12577static PyObject *
12578unicode_startswith(PyObject *self,
12579                   PyObject *args)
12580{
12581    PyObject *subobj;
12582    PyObject *substring;
12583    Py_ssize_t start = 0;
12584    Py_ssize_t end = PY_SSIZE_T_MAX;
12585    int result;
12586
12587    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12588        return NULL;
12589    if (PyTuple_Check(subobj)) {
12590        Py_ssize_t i;
12591        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12592            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12593            if (substring == NULL)
12594                return NULL;
12595            result = tailmatch(self, substring, start, end, -1);
12596            Py_DECREF(substring);
12597            if (result) {
12598                Py_RETURN_TRUE;
12599            }
12600        }
12601        /* nothing matched */
12602        Py_RETURN_FALSE;
12603    }
12604    substring = PyUnicode_FromObject(subobj);
12605    if (substring == NULL) {
12606        if (PyErr_ExceptionMatches(PyExc_TypeError))
12607            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12608                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12609        return NULL;
12610    }
12611    result = tailmatch(self, substring, start, end, -1);
12612    Py_DECREF(substring);
12613    return PyBool_FromLong(result);
12614}
12615
12616
12617PyDoc_STRVAR(endswith__doc__,
12618             "S.endswith(suffix[, start[, end]]) -> bool\n\
12619\n\
12620Return True if S ends with the specified suffix, False otherwise.\n\
12621With optional start, test S beginning at that position.\n\
12622With optional end, stop comparing S at that position.\n\
12623suffix can also be a tuple of strings to try.");
12624
12625static PyObject *
12626unicode_endswith(PyObject *self,
12627                 PyObject *args)
12628{
12629    PyObject *subobj;
12630    PyObject *substring;
12631    Py_ssize_t start = 0;
12632    Py_ssize_t end = PY_SSIZE_T_MAX;
12633    int result;
12634
12635    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12636        return NULL;
12637    if (PyTuple_Check(subobj)) {
12638        Py_ssize_t i;
12639        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12640            substring = PyUnicode_FromObject(
12641                PyTuple_GET_ITEM(subobj, i));
12642            if (substring == NULL)
12643                return NULL;
12644            result = tailmatch(self, substring, start, end, +1);
12645            Py_DECREF(substring);
12646            if (result) {
12647                Py_RETURN_TRUE;
12648            }
12649        }
12650        Py_RETURN_FALSE;
12651    }
12652    substring = PyUnicode_FromObject(subobj);
12653    if (substring == NULL) {
12654        if (PyErr_ExceptionMatches(PyExc_TypeError))
12655            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12656                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12657        return NULL;
12658    }
12659    result = tailmatch(self, substring, start, end, +1);
12660    Py_DECREF(substring);
12661    return PyBool_FromLong(result);
12662}
12663
12664Py_LOCAL_INLINE(void)
12665_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
12666{
12667    writer->size = PyUnicode_GET_LENGTH(writer->buffer);
12668    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12669    writer->data = PyUnicode_DATA(writer->buffer);
12670    writer->kind = PyUnicode_KIND(writer->buffer);
12671}
12672
12673void
12674_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
12675{
12676    memset(writer, 0, sizeof(*writer));
12677#ifdef Py_DEBUG
12678    writer->kind = 5;    /* invalid kind */
12679#endif
12680    writer->min_length = Py_MAX(min_length, 100);
12681    writer->overallocate = (min_length > 0);
12682}
12683
/* Make room in WRITER for LENGTH more characters (LENGTH must be > 0) whose
   code points do not exceed MAXCHAR.

   The buffer is allocated on first use; afterwards it is enlarged and/or
   replaced by a wider string as needed, with the first writer->pos
   characters carried over.  When writer->overallocate is set, a size
   request is padded by 25% and raised to at least writer->min_length.

   Return 0 on success and -1 on failure (PyErr_NoMemory() is raised for the
   overflow case; the other failures are PyUnicode_New() or resize_compact()
   returning NULL). */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(length > 0);

    /* Refuse requests whose end position would not fit in a Py_ssize_t. */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* First allocation: no existing content to preserve. */
    if (writer->buffer == NULL) {
        if (writer->overallocate) {
            /* overallocate 25% to limit the number of resize */
            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
                newlen += newlen / 4;
            if (newlen < writer->min_length)
                newlen = writer->min_length;
        }
        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
        _PyUnicodeWriter_Update(writer);
        return 0;
    }

    if (newlen > writer->size) {
        /* The buffer is too short for the requested end position. */
        if (writer->overallocate) {
            /* overallocate 25% to limit the number of resize */
            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
                newlen += newlen / 4;
            if (newlen < writer->min_length)
                newlen = writer->min_length;
        }

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            /* Also taken for a read-only buffer: it must not be modified,
               so it is replaced by a fresh string holding a copy. */
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* Same character width: resize the existing object. */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
        _PyUnicodeWriter_Update(writer);
    }
    else if (maxchar > writer->maxchar) {
        /* Long enough already, but the characters need a wider string:
           copy into a new string of the same size. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_DECREF(writer->buffer);
        writer->buffer = newbuffer;
        _PyUnicodeWriter_Update(writer);
    }
    return 0;
}
12754
/* Append the string object STR to WRITER.

   Return 0 on success, -1 on failure (STR could not be readied, or
   _PyUnicodeWriter_PrepareInternal() failed). */
int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
{
    Py_UCS4 maxchar;
    Py_ssize_t len;

    if (PyUnicode_READY(str) == -1)
        return -1;
    len = PyUnicode_GET_LENGTH(str);
    /* Nothing to append for an empty string. */
    if (len == 0)
        return 0;
    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
    /* Slow path: STR needs a wider buffer or more space than is left. */
    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
        if (writer->buffer == NULL && !writer->overallocate) {
            /* Nothing allocated yet and no overallocation requested: use
               STR itself as the buffer instead of copying it.  The buffer
               is flagged read-only and size is forced to 0, so any later
               write takes the slow path, where
               _PyUnicodeWriter_PrepareInternal() replaces a read-only
               buffer with a fresh copy. */
            Py_INCREF(str);
            writer->buffer = str;
            _PyUnicodeWriter_Update(writer);
            writer->readonly = 1;
            writer->size = 0;
            writer->pos += len;
            return 0;
        }
        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
            return -1;
    }
    /* Enough compatible space is available: copy and advance. */
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                  str, 0, len);
    writer->pos += len;
    return 0;
}
12785
12786int
12787_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12788{
12789    Py_UCS4 maxchar;
12790
12791    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12792    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12793        return -1;
12794    unicode_write_cstr(writer->buffer, writer->pos, str, len);
12795    writer->pos += len;
12796    return 0;
12797}
12798
12799PyObject *
12800_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
12801{
12802    if (writer->pos == 0) {
12803        Py_XDECREF(writer->buffer);
12804        Py_INCREF(unicode_empty);
12805        return unicode_empty;
12806    }
12807    if (writer->readonly) {
12808        assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12809        return writer->buffer;
12810    }
12811    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12812        PyObject *newbuffer;
12813        newbuffer = resize_compact(writer->buffer, writer->pos);
12814        if (newbuffer == NULL) {
12815            Py_DECREF(writer->buffer);
12816            return NULL;
12817        }
12818        writer->buffer = newbuffer;
12819    }
12820    assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
12821    return writer->buffer;
12822}
12823
12824void
12825_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
12826{
12827    Py_CLEAR(writer->buffer);
12828}
12829
12830#include "stringlib/unicode_format.h"
12831
/* Docstrings for the "format" and "format_map" entries of unicode_methods;
   the functions those entries point to are not defined in this part of the
   file. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
12843
12844static PyObject *
12845unicode__format__(PyObject* self, PyObject* args)
12846{
12847    PyObject *format_spec;
12848    _PyUnicodeWriter writer;
12849    int ret;
12850
12851    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12852        return NULL;
12853
12854    if (PyUnicode_READY(self) == -1)
12855        return NULL;
12856    _PyUnicodeWriter_Init(&writer, 0);
12857    ret = _PyUnicode_FormatAdvancedWriter(&writer,
12858                                          self, format_spec, 0,
12859                                          PyUnicode_GET_LENGTH(format_spec));
12860    if (ret == -1) {
12861        _PyUnicodeWriter_Dealloc(&writer);
12862        return NULL;
12863    }
12864    return _PyUnicodeWriter_Finish(&writer);
12865}
12866
12867PyDoc_STRVAR(p_format__doc__,
12868             "S.__format__(format_spec) -> str\n\
12869\n\
12870Return a formatted version of S as described by format_spec.");
12871
12872static PyObject *
12873unicode__sizeof__(PyObject *v)
12874{
12875    Py_ssize_t size;
12876
12877    /* If it's a compact object, account for base structure +
12878       character data. */
12879    if (PyUnicode_IS_COMPACT_ASCII(v))
12880        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12881    else if (PyUnicode_IS_COMPACT(v))
12882        size = sizeof(PyCompactUnicodeObject) +
12883            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
12884    else {
12885        /* If it is a two-block object, account for base object, and
12886           for character block if present. */
12887        size = sizeof(PyUnicodeObject);
12888        if (_PyUnicode_DATA_ANY(v))
12889            size += (PyUnicode_GET_LENGTH(v) + 1) *
12890                PyUnicode_KIND(v);
12891    }
12892    /* If the wstr pointer is present, account for it unless it is shared
12893       with the data pointer. Check if the data is not shared. */
12894    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12895        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12896    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12897        size += PyUnicode_UTF8_LENGTH(v) + 1;
12898
12899    return PyLong_FromSsize_t(size);
12900}
12901
12902PyDoc_STRVAR(sizeof__doc__,
12903             "S.__sizeof__() -> size of S in memory, in bytes");
12904
12905static PyObject *
12906unicode_getnewargs(PyObject *v)
12907{
12908    PyObject *copy = _PyUnicode_Copy(v);
12909    if (!copy)
12910        return NULL;
12911    return Py_BuildValue("(N)", copy);
12912}
12913
/* Method table for the unicode (str) object.  Each entry gives the
   Python-level method name, the C function implementing it, the calling
   convention flags and, for most entries, the docstring.  The table ends
   with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry flagged METH_STATIC. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
12970
12971static PyObject *
12972unicode_mod(PyObject *v, PyObject *w)
12973{
12974    if (!PyUnicode_Check(v))
12975        Py_RETURN_NOTIMPLEMENTED;
12976    return PyUnicode_Format(v, w);
12977}
12978
/* Number protocol slots.  Only nb_remainder is filled in (with
   unicode_mod); the slots listed before it are 0 and the ones after it are
   left out of the initializer. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
12985
/* Sequence protocol slots: length, concatenation, repetition, item access
   and containment are provided; the slice and item-assignment slots are
   0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
12996
12997static PyObject*
12998unicode_subscript(PyObject* self, PyObject* item)
12999{
13000    if (PyUnicode_READY(self) == -1)
13001        return NULL;
13002
13003    if (PyIndex_Check(item)) {
13004        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13005        if (i == -1 && PyErr_Occurred())
13006            return NULL;
13007        if (i < 0)
13008            i += PyUnicode_GET_LENGTH(self);
13009        return unicode_getitem(self, i);
13010    } else if (PySlice_Check(item)) {
13011        Py_ssize_t start, stop, step, slicelength, cur, i;
13012        PyObject *result;
13013        void *src_data, *dest_data;
13014        int src_kind, dest_kind;
13015        Py_UCS4 ch, max_char, kind_limit;
13016
13017        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13018                                 &start, &stop, &step, &slicelength) < 0) {
13019            return NULL;
13020        }
13021
13022        if (slicelength <= 0) {
13023            Py_INCREF(unicode_empty);
13024            return unicode_empty;
13025        } else if (start == 0 && step == 1 &&
13026                   slicelength == PyUnicode_GET_LENGTH(self)) {
13027            return unicode_result_unchanged(self);
13028        } else if (step == 1) {
13029            return PyUnicode_Substring(self,
13030                                       start, start + slicelength);
13031        }
13032        /* General case */
13033        src_kind = PyUnicode_KIND(self);
13034        src_data = PyUnicode_DATA(self);
13035        if (!PyUnicode_IS_ASCII(self)) {
13036            kind_limit = kind_maxchar_limit(src_kind);
13037            max_char = 0;
13038            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13039                ch = PyUnicode_READ(src_kind, src_data, cur);
13040                if (ch > max_char) {
13041                    max_char = ch;
13042                    if (max_char >= kind_limit)
13043                        break;
13044                }
13045            }
13046        }
13047        else
13048            max_char = 127;
13049        result = PyUnicode_New(slicelength, max_char);
13050        if (result == NULL)
13051            return NULL;
13052        dest_kind = PyUnicode_KIND(result);
13053        dest_data = PyUnicode_DATA(result);
13054
13055        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13056            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13057            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13058        }
13059        assert(_PyUnicode_CheckConsistency(result, 1));
13060        return result;
13061    } else {
13062        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13063        return NULL;
13064    }
13065}
13066
/* Mapping protocol slots: length and subscripting are provided; the
   assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13072
13073
13074/* Helpers for PyUnicode_Format() */
13075
/* Working state shared by the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Argument source.  unicode_format_getnextarg() indexes it as a tuple
       when arglen >= 0 and returns it as-is when arglen is negative. */
    PyObject *args;
    /* NOTE(review): presumably non-zero when this struct holds its own
       reference to args -- confirm against the code that sets it. */
    int args_owned;
    /* arglen bounds the arguments that can be handed out; argidx is the
       index of the next one and is advanced on each successful fetch. */
    Py_ssize_t arglen, argidx;
    /* NOTE(review): presumably the mapping used for named lookups --
       confirm; not used in the code visible here. */
    PyObject *dict;

    /* NOTE(review): the fmt* fields look like the kind, remaining count,
       current position, raw data and owning object of the format string --
       confirm against PyUnicode_Format(). */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Writer embedded in the formatter state; presumably where the output
       is accumulated -- confirm. */
    _PyUnicodeWriter writer;
};
13089
/* Description of one format item, as passed to the formatting helpers. */
struct unicode_format_arg_t {
    /* Conversion character; formatfloat() passes it to
       PyOS_double_to_string() as the format code. */
    Py_UCS4 ch;
    /* Bitmask of format flags; formatfloat() tests F_ALT in it. */
    int flags;
    /* NOTE(review): presumably the requested field width -- confirm; not
       used in the code visible here. */
    Py_ssize_t width;
    /* Precision; formatfloat() substitutes 6 for a negative value. */
    int prec;
    /* NOTE(review): sign handling is not visible in this part of the file
       -- confirm the meaning before relying on it. */
    int sign;
};
13097
13098static PyObject *
13099unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13100{
13101    Py_ssize_t argidx = ctx->argidx;
13102
13103    if (argidx < ctx->arglen) {
13104        ctx->argidx++;
13105        if (ctx->arglen < 0)
13106            return ctx->args;
13107        else
13108            return PyTuple_GetItem(ctx->args, argidx);
13109    }
13110    PyErr_SetString(PyExc_TypeError,
13111                    "not enough arguments for format string");
13112    return NULL;
13113}
13114
13115/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13116
13117/* Format a float into the writer if the writer is not NULL, or into *p_output
13118   otherwise.
13119
13120   Return 0 on success, raise an exception and return -1 on error. */
13121static int
13122formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13123            PyObject **p_output,
13124            _PyUnicodeWriter *writer)
13125{
13126    char *p;
13127    double x;
13128    Py_ssize_t len;
13129    int prec;
13130    int dtoa_flags;
13131
13132    x = PyFloat_AsDouble(v);
13133    if (x == -1.0 && PyErr_Occurred())
13134        return -1;
13135
13136    prec = arg->prec;
13137    if (prec < 0)
13138        prec = 6;
13139
13140    if (arg->flags & F_ALT)
13141        dtoa_flags = Py_DTSF_ALT;
13142    else
13143        dtoa_flags = 0;
13144    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13145    if (p == NULL)
13146        return -1;
13147    len = strlen(p);
13148    if (writer) {
13149        if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13150            PyMem_Free(p);
13151            return -1;
13152        }
13153        unicode_write_cstr(writer->buffer, writer->pos, p, len);
13154        writer->pos += len;
13155    }
13156    else
13157        *p_output = _PyUnicode_FromASCII(p, len);
13158    PyMem_Free(p);
13159    return 0;
13160}
13161
13162/* formatlong() emulates the format codes d, u, o, x and X, and
13163 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13164 * Python's regular ints.
13165 * Return value:  a new PyUnicodeObject*, or NULL if error.
13166 *     The output string is of the form
13167 *         "-"? ("0x" | "0X")? digit+
13168 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13169 *         set in flags.  The case of hex digits will be correct,
13170 *     There will be at least prec digits, zero-filled on the left if
13171 *         necessary to get that many.
13172 * val          object to be converted
13173 * flags        bitmask of format flags; only F_ALT is looked at
13174 * prec         minimum number of digits; 0-fill on left if needed
13175 * type         a character in [duoxX]; u acts the same as d
13176 *
13177 * CAUTION:  o, x and X conversions on regular ints can never
13178 * produce a '-' sign, but can for Python's unbounded ints.
13179 */
13180static PyObject*
13181formatlong(PyObject *val, struct unicode_format_arg_t *arg)
13182{
13183    PyObject *result = NULL;
13184    char *buf;
13185    Py_ssize_t i;
13186    int sign;           /* 1 if '-', else 0 */
13187    int len;            /* number of characters */
13188    Py_ssize_t llen;
13189    int numdigits;      /* len == numnondigits + numdigits */
13190    int numnondigits = 0;
13191    int prec = arg->prec;
13192    int type = arg->ch;
13193
13194    /* Avoid exceeding SSIZE_T_MAX */
13195    if (prec > INT_MAX-3) {
13196        PyErr_SetString(PyExc_OverflowError,
13197                        "precision too large");
13198        return NULL;
13199    }
13200
13201    assert(PyLong_Check(val));
13202
13203    switch (type) {
13204    default:
13205        assert(!"'type' not in [diuoxX]");
13206    case 'd':
13207    case 'i':
13208    case 'u':
13209        /* Special-case boolean: we want 0/1 */
13210        if (PyBool_Check(val))
13211            result = PyNumber_ToBase(val, 10);
13212        else
13213            result = Py_TYPE(val)->tp_str(val);
13214        break;
13215    case 'o':
13216        numnondigits = 2;
13217        result = PyNumber_ToBase(val, 8);
13218        break;
13219    case 'x':
13220    case 'X':
13221        numnondigits = 2;
13222        result = PyNumber_ToBase(val, 16);
13223        break;
13224    }
13225    if (!result)
13226        return NULL;
13227
13228    assert(unicode_modifiable(result));
13229    assert(PyUnicode_IS_READY(result));
13230    assert(PyUnicode_IS_ASCII(result));
13231
13232    /* To modify the string in-place, there can only be one reference. */
13233    if (Py_REFCNT(result) != 1) {
13234        PyErr_BadInternalCall();
13235        return NULL;
13236    }
13237    buf = PyUnicode_DATA(result);
13238    llen = PyUnicode_GET_LENGTH(result);
13239    if (llen > INT_MAX) {
13240        PyErr_SetString(PyExc_ValueError,
13241                        "string too large in _PyBytes_FormatLong");
13242        return NULL;
13243    }
13244    len = (int)llen;
13245    sign = buf[0] == '-';
13246    numnondigits += sign;
13247    numdigits = len - numnondigits;
13248    assert(numdigits > 0);
13249
13250    /* Get rid of base marker unless F_ALT */
13251    if (((arg->flags & F_ALT) == 0 &&
13252        (type == 'o' || type == 'x' || type == 'X'))) {
13253        assert(buf[sign] == '0');
13254        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13255               buf[sign+1] == 'o');
13256        numnondigits -= 2;
13257        buf += 2;
13258        len -= 2;
13259        if (sign)
13260            buf[0] = '-';
13261        assert(len == numnondigits + numdigits);
13262        assert(numdigits > 0);
13263    }
13264
13265    /* Fill with leading zeroes to meet minimum width. */
13266    if (prec > numdigits) {
13267        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13268                                numnondigits + prec);
13269        char *b1;
13270        if (!r1) {
13271            Py_DECREF(result);
13272            return NULL;
13273        }
13274        b1 = PyBytes_AS_STRING(r1);
13275        for (i = 0; i < numnondigits; ++i)
13276            *b1++ = *buf++;
13277        for (i = 0; i < prec - numdigits; i++)
13278            *b1++ = '0';
13279        for (i = 0; i < numdigits; i++)
13280            *b1++ = *buf++;
13281        *b1 = '\0';
13282        Py_DECREF(result);
13283        result = r1;
13284        buf = PyBytes_AS_STRING(result);
13285        len = numnondigits + prec;
13286    }
13287
13288    /* Fix up case for hex conversions. */
13289    if (type == 'X') {
13290        /* Need to convert all lower case letters to upper case.
13291           and need to convert 0x to 0X (and -0x to -0X). */
13292        for (i = 0; i < len; i++)
13293            if (buf[i] >= 'a' && buf[i] <= 'x')
13294                buf[i] -= 'a'-'A';
13295    }
13296    if (!PyUnicode_Check(result)
13297        || buf != PyUnicode_DATA(result)) {
13298        PyObject *unicode;
13299        unicode = _PyUnicode_FromASCII(buf, len);
13300        Py_DECREF(result);
13301        result = unicode;
13302    }
13303    else if (len != PyUnicode_GET_LENGTH(result)) {
13304        if (PyUnicode_Resize(&result, len) < 0)
13305            Py_CLEAR(result);
13306    }
13307    return result;
13308}
13309
13310/* Format an integer.
13311 * Return 1 if the number has been formatted into the writer,
13312 *        0 if the number has been formatted into *p_output
13313 *       -1 and raise an exception on error */
13314static int
13315mainformatlong(PyObject *v,
13316               struct unicode_format_arg_t *arg,
13317               PyObject **p_output,
13318               _PyUnicodeWriter *writer)
13319{
13320    PyObject *iobj, *res;
13321    char type = (char)arg->ch;
13322
13323    if (!PyNumber_Check(v))
13324        goto wrongtype;
13325
13326    if (!PyLong_Check(v)) {
13327        iobj = PyNumber_Long(v);
13328        if (iobj == NULL) {
13329            if (PyErr_ExceptionMatches(PyExc_TypeError))
13330                goto wrongtype;
13331            return -1;
13332        }
13333        assert(PyLong_Check(iobj));
13334    }
13335    else {
13336        iobj = v;
13337        Py_INCREF(iobj);
13338    }
13339
13340    if (PyLong_CheckExact(v)
13341        && arg->width == -1 && arg->prec == -1
13342        && !(arg->flags & (F_SIGN | F_BLANK))
13343        && type != 'X')
13344    {
13345        /* Fast path */
13346        int alternate = arg->flags & F_ALT;
13347        int base;
13348
13349        switch(type)
13350        {
13351            default:
13352                assert(0 && "'type' not in [diuoxX]");
13353            case 'd':
13354            case 'i':
13355            case 'u':
13356                base = 10;
13357                break;
13358            case 'o':
13359                base = 8;
13360                break;
13361            case 'x':
13362            case 'X':
13363                base = 16;
13364                break;
13365        }
13366
13367        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13368            Py_DECREF(iobj);
13369            return -1;
13370        }
13371        Py_DECREF(iobj);
13372        return 1;
13373    }
13374
13375    res = formatlong(iobj, arg);
13376    Py_DECREF(iobj);
13377    if (res == NULL)
13378        return -1;
13379    *p_output = res;
13380    return 0;
13381
13382wrongtype:
13383    PyErr_Format(PyExc_TypeError,
13384            "%%%c format: a number is required, "
13385            "not %.200s",
13386            type, Py_TYPE(v)->tp_name);
13387    return -1;
13388}
13389
13390static Py_UCS4
13391formatchar(PyObject *v)
13392{
13393    /* presume that the buffer is at least 3 characters long */
13394    if (PyUnicode_Check(v)) {
13395        if (PyUnicode_GET_LENGTH(v) == 1) {
13396            return PyUnicode_READ_CHAR(v, 0);
13397        }
13398        goto onError;
13399    }
13400    else {
13401        /* Integer input truncated to a character */
13402        long x;
13403        x = PyLong_AsLong(v);
13404        if (x == -1 && PyErr_Occurred())
13405            goto onError;
13406
13407        if (x < 0 || x > MAX_UNICODE) {
13408            PyErr_SetString(PyExc_OverflowError,
13409                            "%c arg not in range(0x110000)");
13410            return (Py_UCS4) -1;
13411        }
13412
13413        return (Py_UCS4) x;
13414    }
13415
13416  onError:
13417    PyErr_SetString(PyExc_TypeError,
13418                    "%c requires int or char");
13419    return (Py_UCS4) -1;
13420}
13421
13422/* Parse options of an argument: flags, width, precision.
13423   Handle also "%(name)" syntax.
13424
13425   Return 0 if the argument has been formatted into arg->str.
13426   Return 1 if the argument has been written into ctx->writer,
13427   Raise an exception and return -1 on error. */
13428static int
13429unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13430                         struct unicode_format_arg_t *arg)
13431{
13432#define FORMAT_READ(ctx) \
13433        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13434
13435    PyObject *v;
13436
13437    arg->ch = FORMAT_READ(ctx);
13438    if (arg->ch == '(') {
13439        /* Get argument value from a dictionary. Example: "%(name)s". */
13440        Py_ssize_t keystart;
13441        Py_ssize_t keylen;
13442        PyObject *key;
13443        int pcount = 1;
13444
13445        if (ctx->dict == NULL) {
13446            PyErr_SetString(PyExc_TypeError,
13447                            "format requires a mapping");
13448            return -1;
13449        }
13450        ++ctx->fmtpos;
13451        --ctx->fmtcnt;
13452        keystart = ctx->fmtpos;
13453        /* Skip over balanced parentheses */
13454        while (pcount > 0 && --ctx->fmtcnt >= 0) {
13455            arg->ch = FORMAT_READ(ctx);
13456            if (arg->ch == ')')
13457                --pcount;
13458            else if (arg->ch == '(')
13459                ++pcount;
13460            ctx->fmtpos++;
13461        }
13462        keylen = ctx->fmtpos - keystart - 1;
13463        if (ctx->fmtcnt < 0 || pcount > 0) {
13464            PyErr_SetString(PyExc_ValueError,
13465                            "incomplete format key");
13466            return -1;
13467        }
13468        key = PyUnicode_Substring(ctx->fmtstr,
13469                                  keystart, keystart + keylen);
13470        if (key == NULL)
13471            return -1;
13472        if (ctx->args_owned) {
13473            Py_DECREF(ctx->args);
13474            ctx->args_owned = 0;
13475        }
13476        ctx->args = PyObject_GetItem(ctx->dict, key);
13477        Py_DECREF(key);
13478        if (ctx->args == NULL)
13479            return -1;
13480        ctx->args_owned = 1;
13481        ctx->arglen = -1;
13482        ctx->argidx = -2;
13483    }
13484
13485    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13486    arg->flags = 0;
13487    while (--ctx->fmtcnt >= 0) {
13488        arg->ch = FORMAT_READ(ctx);
13489        ctx->fmtpos++;
13490        switch (arg->ch) {
13491        case '-': arg->flags |= F_LJUST; continue;
13492        case '+': arg->flags |= F_SIGN; continue;
13493        case ' ': arg->flags |= F_BLANK; continue;
13494        case '#': arg->flags |= F_ALT; continue;
13495        case '0': arg->flags |= F_ZERO; continue;
13496        }
13497        break;
13498    }
13499
13500    /* Parse width. Example: "%10s" => width=10 */
13501    arg->width = -1;
13502    if (arg->ch == '*') {
13503        v = unicode_format_getnextarg(ctx);
13504        if (v == NULL)
13505            return -1;
13506        if (!PyLong_Check(v)) {
13507            PyErr_SetString(PyExc_TypeError,
13508                            "* wants int");
13509            return -1;
13510        }
13511        arg->width = PyLong_AsLong(v);
13512        if (arg->width == -1 && PyErr_Occurred())
13513            return -1;
13514        if (arg->width < 0) {
13515            arg->flags |= F_LJUST;
13516            arg->width = -arg->width;
13517        }
13518        if (--ctx->fmtcnt >= 0) {
13519            arg->ch = FORMAT_READ(ctx);
13520            ctx->fmtpos++;
13521        }
13522    }
13523    else if (arg->ch >= '0' && arg->ch <= '9') {
13524        arg->width = arg->ch - '0';
13525        while (--ctx->fmtcnt >= 0) {
13526            arg->ch = FORMAT_READ(ctx);
13527            ctx->fmtpos++;
13528            if (arg->ch < '0' || arg->ch > '9')
13529                break;
13530            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13531               mixing signed and unsigned comparison. Since arg->ch is between
13532               '0' and '9', casting to int is safe. */
13533            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13534                PyErr_SetString(PyExc_ValueError,
13535                                "width too big");
13536                return -1;
13537            }
13538            arg->width = arg->width*10 + (arg->ch - '0');
13539        }
13540    }
13541
13542    /* Parse precision. Example: "%.3f" => prec=3 */
13543    arg->prec = -1;
13544    if (arg->ch == '.') {
13545        arg->prec = 0;
13546        if (--ctx->fmtcnt >= 0) {
13547            arg->ch = FORMAT_READ(ctx);
13548            ctx->fmtpos++;
13549        }
13550        if (arg->ch == '*') {
13551            v = unicode_format_getnextarg(ctx);
13552            if (v == NULL)
13553                return -1;
13554            if (!PyLong_Check(v)) {
13555                PyErr_SetString(PyExc_TypeError,
13556                                "* wants int");
13557                return -1;
13558            }
13559            arg->prec = PyLong_AsLong(v);
13560            if (arg->prec == -1 && PyErr_Occurred())
13561                return -1;
13562            if (arg->prec < 0)
13563                arg->prec = 0;
13564            if (--ctx->fmtcnt >= 0) {
13565                arg->ch = FORMAT_READ(ctx);
13566                ctx->fmtpos++;
13567            }
13568        }
13569        else if (arg->ch >= '0' && arg->ch <= '9') {
13570            arg->prec = arg->ch - '0';
13571            while (--ctx->fmtcnt >= 0) {
13572                arg->ch = FORMAT_READ(ctx);
13573                ctx->fmtpos++;
13574                if (arg->ch < '0' || arg->ch > '9')
13575                    break;
13576                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13577                    PyErr_SetString(PyExc_ValueError,
13578                                    "precision too big");
13579                    return -1;
13580                }
13581                arg->prec = arg->prec*10 + (arg->ch - '0');
13582            }
13583        }
13584    }
13585
13586    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13587    if (ctx->fmtcnt >= 0) {
13588        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13589            if (--ctx->fmtcnt >= 0) {
13590                arg->ch = FORMAT_READ(ctx);
13591                ctx->fmtpos++;
13592            }
13593        }
13594    }
13595    if (ctx->fmtcnt < 0) {
13596        PyErr_SetString(PyExc_ValueError,
13597                        "incomplete format");
13598        return -1;
13599    }
13600    return 0;
13601
13602#undef FORMAT_READ
13603}
13604
13605/* Format one argument. Supported conversion specifiers:
13606
13607   - "s", "r", "a": any type
13608   - "i", "d", "u", "o", "x", "X": int
13609   - "e", "E", "f", "F", "g", "G": float
13610   - "c": int or str (1 character)
13611
13612   Return 0 if the argument has been formatted into *p_str,
13613          1 if the argument has been written into ctx->writer,
13614          -1 on error. */
13615static int
13616unicode_format_arg_format(struct unicode_formatter_t *ctx,
13617                          struct unicode_format_arg_t *arg,
13618                          PyObject **p_str)
13619{
13620    PyObject *v;
13621    _PyUnicodeWriter *writer = &ctx->writer;
13622
13623    if (ctx->fmtcnt == 0)
13624        ctx->writer.overallocate = 0;
13625
13626    if (arg->ch == '%') {
13627        if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13628            return -1;
13629        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13630        writer->pos += 1;
13631        return 1;
13632    }
13633
13634    v = unicode_format_getnextarg(ctx);
13635    if (v == NULL)
13636        return -1;
13637
13638    arg->sign = 0;
13639
13640    switch (arg->ch) {
13641
13642    case 's':
13643    case 'r':
13644    case 'a':
13645        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13646            /* Fast path */
13647            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13648                return -1;
13649            return 1;
13650        }
13651
13652        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13653            *p_str = v;
13654            Py_INCREF(*p_str);
13655        }
13656        else {
13657            if (arg->ch == 's')
13658                *p_str = PyObject_Str(v);
13659            else if (arg->ch == 'r')
13660                *p_str = PyObject_Repr(v);
13661            else
13662                *p_str = PyObject_ASCII(v);
13663        }
13664        break;
13665
13666    case 'i':
13667    case 'd':
13668    case 'u':
13669    case 'o':
13670    case 'x':
13671    case 'X':
13672    {
13673        int ret = mainformatlong(v, arg, p_str, writer);
13674        if (ret != 0)
13675            return ret;
13676        arg->sign = 1;
13677        break;
13678    }
13679
13680    case 'e':
13681    case 'E':
13682    case 'f':
13683    case 'F':
13684    case 'g':
13685    case 'G':
13686        if (arg->width == -1 && arg->prec == -1
13687            && !(arg->flags & (F_SIGN | F_BLANK)))
13688        {
13689            /* Fast path */
13690            if (formatfloat(v, arg, NULL, writer) == -1)
13691                return -1;
13692            return 1;
13693        }
13694
13695        arg->sign = 1;
13696        if (formatfloat(v, arg, p_str, NULL) == -1)
13697            return -1;
13698        break;
13699
13700    case 'c':
13701    {
13702        Py_UCS4 ch = formatchar(v);
13703        if (ch == (Py_UCS4) -1)
13704            return -1;
13705        if (arg->width == -1 && arg->prec == -1) {
13706            /* Fast path */
13707            if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13708                return -1;
13709            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13710            writer->pos += 1;
13711            return 1;
13712        }
13713        *p_str = PyUnicode_FromOrdinal(ch);
13714        break;
13715    }
13716
13717    default:
13718        PyErr_Format(PyExc_ValueError,
13719                     "unsupported format character '%c' (0x%x) "
13720                     "at index %zd",
13721                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13722                     (int)arg->ch,
13723                     ctx->fmtpos - 1);
13724        return -1;
13725    }
13726    if (*p_str == NULL)
13727        return -1;
13728    assert (PyUnicode_Check(*p_str));
13729    return 0;
13730}
13731
13732static int
13733unicode_format_arg_output(struct unicode_formatter_t *ctx,
13734                          struct unicode_format_arg_t *arg,
13735                          PyObject *str)
13736{
13737    Py_ssize_t len;
13738    enum PyUnicode_Kind kind;
13739    void *pbuf;
13740    Py_ssize_t pindex;
13741    Py_UCS4 signchar;
13742    Py_ssize_t buflen;
13743    Py_UCS4 maxchar, bufmaxchar;
13744    Py_ssize_t sublen;
13745    _PyUnicodeWriter *writer = &ctx->writer;
13746    Py_UCS4 fill;
13747
13748    fill = ' ';
13749    if (arg->sign && arg->flags & F_ZERO)
13750        fill = '0';
13751
13752    if (PyUnicode_READY(str) == -1)
13753        return -1;
13754
13755    len = PyUnicode_GET_LENGTH(str);
13756    if ((arg->width == -1 || arg->width <= len)
13757        && (arg->prec == -1 || arg->prec >= len)
13758        && !(arg->flags & (F_SIGN | F_BLANK)))
13759    {
13760        /* Fast path */
13761        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13762            return -1;
13763        return 0;
13764    }
13765
13766    /* Truncate the string for "s", "r" and "a" formats
13767       if the precision is set */
13768    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13769        if (arg->prec >= 0 && len > arg->prec)
13770            len = arg->prec;
13771    }
13772
13773    /* Adjust sign and width */
13774    kind = PyUnicode_KIND(str);
13775    pbuf = PyUnicode_DATA(str);
13776    pindex = 0;
13777    signchar = '\0';
13778    if (arg->sign) {
13779        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13780        if (ch == '-' || ch == '+') {
13781            signchar = ch;
13782            len--;
13783            pindex++;
13784        }
13785        else if (arg->flags & F_SIGN)
13786            signchar = '+';
13787        else if (arg->flags & F_BLANK)
13788            signchar = ' ';
13789        else
13790            arg->sign = 0;
13791    }
13792    if (arg->width < len)
13793        arg->width = len;
13794
13795    /* Prepare the writer */
13796    bufmaxchar = 127;
13797    if (!(arg->flags & F_LJUST)) {
13798        if (arg->sign) {
13799            if ((arg->width-1) > len)
13800                bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13801        }
13802        else {
13803            if (arg->width > len)
13804                bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13805        }
13806    }
13807    maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13808    bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13809    buflen = arg->width;
13810    if (arg->sign && len == arg->width)
13811        buflen++;
13812    if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13813        return -1;
13814
13815    /* Write the sign if needed */
13816    if (arg->sign) {
13817        if (fill != ' ') {
13818            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13819            writer->pos += 1;
13820        }
13821        if (arg->width > len)
13822            arg->width--;
13823    }
13824
13825    /* Write the numeric prefix for "x", "X" and "o" formats
13826       if the alternate form is used.
13827       For example, write "0x" for the "%#x" format. */
13828    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13829        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13830        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13831        if (fill != ' ') {
13832            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13833            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13834            writer->pos += 2;
13835            pindex += 2;
13836        }
13837        arg->width -= 2;
13838        if (arg->width < 0)
13839            arg->width = 0;
13840        len -= 2;
13841    }
13842
13843    /* Pad left with the fill character if needed */
13844    if (arg->width > len && !(arg->flags & F_LJUST)) {
13845        sublen = arg->width - len;
13846        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13847        writer->pos += sublen;
13848        arg->width = len;
13849    }
13850
13851    /* If padding with spaces: write sign if needed and/or numeric prefix if
13852       the alternate form is used */
13853    if (fill == ' ') {
13854        if (arg->sign) {
13855            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13856            writer->pos += 1;
13857        }
13858        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13859            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13860            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13861            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13862            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13863            writer->pos += 2;
13864            pindex += 2;
13865        }
13866    }
13867
13868    /* Write characters */
13869    if (len) {
13870        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13871                                      str, pindex, len);
13872        writer->pos += len;
13873    }
13874
13875    /* Pad right with the fill character if needed */
13876    if (arg->width > len) {
13877        sublen = arg->width - len;
13878        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13879        writer->pos += sublen;
13880    }
13881    return 0;
13882}
13883
13884/* Helper of PyUnicode_Format(): format one arg.
13885   Return 0 on success, raise an exception and return -1 on error. */
13886static int
13887unicode_format_arg(struct unicode_formatter_t *ctx)
13888{
13889    struct unicode_format_arg_t arg;
13890    PyObject *str;
13891    int ret;
13892
13893    ret = unicode_format_arg_parse(ctx, &arg);
13894    if (ret == -1)
13895        return -1;
13896
13897    ret = unicode_format_arg_format(ctx, &arg, &str);
13898    if (ret == -1)
13899        return -1;
13900
13901    if (ret != 1) {
13902        ret = unicode_format_arg_output(ctx, &arg, str);
13903        Py_DECREF(str);
13904        if (ret == -1)
13905            return -1;
13906    }
13907
13908    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13909        PyErr_SetString(PyExc_TypeError,
13910                        "not all arguments converted during string formatting");
13911        return -1;
13912    }
13913    return 0;
13914}
13915
13916PyObject *
13917PyUnicode_Format(PyObject *format, PyObject *args)
13918{
13919    struct unicode_formatter_t ctx;
13920
13921    if (format == NULL || args == NULL) {
13922        PyErr_BadInternalCall();
13923        return NULL;
13924    }
13925
13926    ctx.fmtstr = PyUnicode_FromObject(format);
13927    if (ctx.fmtstr == NULL)
13928        return NULL;
13929    if (PyUnicode_READY(ctx.fmtstr) == -1) {
13930        Py_DECREF(ctx.fmtstr);
13931        return NULL;
13932    }
13933    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13934    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13935    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13936    ctx.fmtpos = 0;
13937
13938    _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
13939
13940    if (PyTuple_Check(args)) {
13941        ctx.arglen = PyTuple_Size(args);
13942        ctx.argidx = 0;
13943    }
13944    else {
13945        ctx.arglen = -1;
13946        ctx.argidx = -2;
13947    }
13948    ctx.args_owned = 0;
13949    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
13950        ctx.dict = args;
13951    else
13952        ctx.dict = NULL;
13953    ctx.args = args;
13954
13955    while (--ctx.fmtcnt >= 0) {
13956        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13957            Py_ssize_t nonfmtpos, sublen;
13958            Py_UCS4 maxchar;
13959
13960            nonfmtpos = ctx.fmtpos++;
13961            while (ctx.fmtcnt >= 0 &&
13962                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13963                ctx.fmtpos++;
13964                ctx.fmtcnt--;
13965            }
13966            if (ctx.fmtcnt < 0) {
13967                ctx.fmtpos--;
13968                ctx.writer.overallocate = 0;
13969            }
13970            sublen = ctx.fmtpos - nonfmtpos;
13971            maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
13972                                             nonfmtpos, nonfmtpos + sublen);
13973            if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
13974                goto onError;
13975
13976            _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
13977                                          ctx.fmtstr, nonfmtpos, sublen);
13978            ctx.writer.pos += sublen;
13979        }
13980        else {
13981            ctx.fmtpos++;
13982            if (unicode_format_arg(&ctx) == -1)
13983                goto onError;
13984        }
13985    }
13986
13987    if (ctx.argidx < ctx.arglen && !ctx.dict) {
13988        PyErr_SetString(PyExc_TypeError,
13989                        "not all arguments converted during string formatting");
13990        goto onError;
13991    }
13992
13993    if (ctx.args_owned) {
13994        Py_DECREF(ctx.args);
13995    }
13996    Py_DECREF(ctx.fmtstr);
13997    return _PyUnicodeWriter_Finish(&ctx.writer);
13998
13999  onError:
14000    Py_DECREF(ctx.fmtstr);
14001    _PyUnicodeWriter_Dealloc(&ctx.writer);
14002    if (ctx.args_owned) {
14003        Py_DECREF(ctx.args);
14004    }
14005    return NULL;
14006}
14007
14008static PyObject *
14009unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14010
14011static PyObject *
14012unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14013{
14014    PyObject *x = NULL;
14015    static char *kwlist[] = {"object", "encoding", "errors", 0};
14016    char *encoding = NULL;
14017    char *errors = NULL;
14018
14019    if (type != &PyUnicode_Type)
14020        return unicode_subtype_new(type, args, kwds);
14021    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14022                                     kwlist, &x, &encoding, &errors))
14023        return NULL;
14024    if (x == NULL) {
14025        Py_INCREF(unicode_empty);
14026        return unicode_empty;
14027    }
14028    if (encoding == NULL && errors == NULL)
14029        return PyObject_Str(x);
14030    else
14031        return PyUnicode_FromEncodedObject(x, encoding, errors);
14032}
14033
14034static PyObject *
14035unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14036{
14037    PyObject *unicode, *self;
14038    Py_ssize_t length, char_size;
14039    int share_wstr, share_utf8;
14040    unsigned int kind;
14041    void *data;
14042
14043    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14044
14045    unicode = unicode_new(&PyUnicode_Type, args, kwds);
14046    if (unicode == NULL)
14047        return NULL;
14048    assert(_PyUnicode_CHECK(unicode));
14049    if (PyUnicode_READY(unicode) == -1) {
14050        Py_DECREF(unicode);
14051        return NULL;
14052    }
14053
14054    self = type->tp_alloc(type, 0);
14055    if (self == NULL) {
14056        Py_DECREF(unicode);
14057        return NULL;
14058    }
14059    kind = PyUnicode_KIND(unicode);
14060    length = PyUnicode_GET_LENGTH(unicode);
14061
14062    _PyUnicode_LENGTH(self) = length;
14063#ifdef Py_DEBUG
14064    _PyUnicode_HASH(self) = -1;
14065#else
14066    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14067#endif
14068    _PyUnicode_STATE(self).interned = 0;
14069    _PyUnicode_STATE(self).kind = kind;
14070    _PyUnicode_STATE(self).compact = 0;
14071    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14072    _PyUnicode_STATE(self).ready = 1;
14073    _PyUnicode_WSTR(self) = NULL;
14074    _PyUnicode_UTF8_LENGTH(self) = 0;
14075    _PyUnicode_UTF8(self) = NULL;
14076    _PyUnicode_WSTR_LENGTH(self) = 0;
14077    _PyUnicode_DATA_ANY(self) = NULL;
14078
14079    share_utf8 = 0;
14080    share_wstr = 0;
14081    if (kind == PyUnicode_1BYTE_KIND) {
14082        char_size = 1;
14083        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14084            share_utf8 = 1;
14085    }
14086    else if (kind == PyUnicode_2BYTE_KIND) {
14087        char_size = 2;
14088        if (sizeof(wchar_t) == 2)
14089            share_wstr = 1;
14090    }
14091    else {
14092        assert(kind == PyUnicode_4BYTE_KIND);
14093        char_size = 4;
14094        if (sizeof(wchar_t) == 4)
14095            share_wstr = 1;
14096    }
14097
14098    /* Ensure we won't overflow the length. */
14099    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14100        PyErr_NoMemory();
14101        goto onError;
14102    }
14103    data = PyObject_MALLOC((length + 1) * char_size);
14104    if (data == NULL) {
14105        PyErr_NoMemory();
14106        goto onError;
14107    }
14108
14109    _PyUnicode_DATA_ANY(self) = data;
14110    if (share_utf8) {
14111        _PyUnicode_UTF8_LENGTH(self) = length;
14112        _PyUnicode_UTF8(self) = data;
14113    }
14114    if (share_wstr) {
14115        _PyUnicode_WSTR_LENGTH(self) = length;
14116        _PyUnicode_WSTR(self) = (wchar_t *)data;
14117    }
14118
14119    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14120              kind * (length + 1));
14121    assert(_PyUnicode_CheckConsistency(self, 1));
14122#ifdef Py_DEBUG
14123    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14124#endif
14125    Py_DECREF(unicode);
14126    return self;
14127
14128onError:
14129    Py_DECREF(unicode);
14130    Py_DECREF(self);
14131    return NULL;
14132}
14133
/* Docstring of the str type; installed as PyUnicode_Type.tp_doc. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
14145
/* Forward declaration: unicode_iter() is defined after the iterator type
   further down, but is needed for the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of the built-in "str" type.

   The initializer is positional; the comment on each entry names the
   slot it fills.  The type can be subclassed (Py_TPFLAGS_BASETYPE),
   creates instances through unicode_new and frees them with
   PyObject_Del. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14191
14192/* Initialize the Unicode implementation */
14193
14194int _PyUnicode_Init(void)
14195{
14196    int i;
14197
14198    /* XXX - move this array to unicodectype.c ? */
14199    Py_UCS2 linebreak[] = {
14200        0x000A, /* LINE FEED */
14201        0x000D, /* CARRIAGE RETURN */
14202        0x001C, /* FILE SEPARATOR */
14203        0x001D, /* GROUP SEPARATOR */
14204        0x001E, /* RECORD SEPARATOR */
14205        0x0085, /* NEXT LINE */
14206        0x2028, /* LINE SEPARATOR */
14207        0x2029, /* PARAGRAPH SEPARATOR */
14208    };
14209
14210    /* Init the implementation */
14211    unicode_empty = PyUnicode_New(0, 0);
14212    if (!unicode_empty)
14213        Py_FatalError("Can't create empty string");
14214    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
14215
14216    for (i = 0; i < 256; i++)
14217        unicode_latin1[i] = NULL;
14218    if (PyType_Ready(&PyUnicode_Type) < 0)
14219        Py_FatalError("Can't initialize 'unicode'");
14220
14221    /* initialize the linebreak bloom filter */
14222    bloom_linebreak = make_bloom_mask(
14223        PyUnicode_2BYTE_KIND, linebreak,
14224        Py_ARRAY_LENGTH(linebreak));
14225
14226    PyType_Ready(&EncodingMapType);
14227
14228    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14229        Py_FatalError("Can't initialize field name iterator type");
14230
14231    if (PyType_Ready(&PyFormatterIter_Type) < 0)
14232        Py_FatalError("Can't initialize formatter iter type");
14233
14234#ifdef HAVE_MBCS
14235    winver.dwOSVersionInfoSize = sizeof(winver);
14236    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14237        PyErr_SetFromWindowsErr(0);
14238        return -1;
14239    }
14240#endif
14241    return 0;
14242}
14243
14244/* Finalize the Unicode implementation */
14245
/* Free-list clearing entry point for str objects.  The body releases
   nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
14251
14252void
14253_PyUnicode_Fini(void)
14254{
14255    int i;
14256
14257    Py_XDECREF(unicode_empty);
14258    unicode_empty = NULL;
14259
14260    for (i = 0; i < 256; i++) {
14261        if (unicode_latin1[i]) {
14262            Py_DECREF(unicode_latin1[i]);
14263            unicode_latin1[i] = NULL;
14264        }
14265    }
14266    _PyUnicode_ClearStaticStrings();
14267    (void)PyUnicode_ClearFreeList();
14268}
14269
14270void
14271PyUnicode_InternInPlace(PyObject **p)
14272{
14273    register PyObject *s = *p;
14274    PyObject *t;
14275#ifdef Py_DEBUG
14276    assert(s != NULL);
14277    assert(_PyUnicode_CHECK(s));
14278#else
14279    if (s == NULL || !PyUnicode_Check(s))
14280        return;
14281#endif
14282    /* If it's a subclass, we don't really know what putting
14283       it in the interned dict might do. */
14284    if (!PyUnicode_CheckExact(s))
14285        return;
14286    if (PyUnicode_CHECK_INTERNED(s))
14287        return;
14288    if (interned == NULL) {
14289        interned = PyDict_New();
14290        if (interned == NULL) {
14291            PyErr_Clear(); /* Don't leave an exception */
14292            return;
14293        }
14294    }
14295    /* It might be that the GetItem call fails even
14296       though the key is present in the dictionary,
14297       namely when this happens during a stack overflow. */
14298    Py_ALLOW_RECURSION
14299    t = PyDict_GetItem(interned, s);
14300    Py_END_ALLOW_RECURSION
14301
14302        if (t) {
14303            Py_INCREF(t);
14304            Py_DECREF(*p);
14305            *p = t;
14306            return;
14307        }
14308
14309    PyThreadState_GET()->recursion_critical = 1;
14310    if (PyDict_SetItem(interned, s, s) < 0) {
14311        PyErr_Clear();
14312        PyThreadState_GET()->recursion_critical = 0;
14313        return;
14314    }
14315    PyThreadState_GET()->recursion_critical = 0;
14316    /* The two references in interned are not counted by refcnt.
14317       The deallocator will take care of this */
14318    Py_REFCNT(s) -= 2;
14319    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
14320}
14321
14322void
14323PyUnicode_InternImmortal(PyObject **p)
14324{
14325    PyUnicode_InternInPlace(p);
14326    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
14327        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
14328        Py_INCREF(*p);
14329    }
14330}
14331
14332PyObject *
14333PyUnicode_InternFromString(const char *cp)
14334{
14335    PyObject *s = PyUnicode_FromString(cp);
14336    if (s == NULL)
14337        return NULL;
14338    PyUnicode_InternInPlace(&s);
14339    return s;
14340}
14341
14342void
14343_Py_ReleaseInternedUnicodeStrings(void)
14344{
14345    PyObject *keys;
14346    PyObject *s;
14347    Py_ssize_t i, n;
14348    Py_ssize_t immortal_size = 0, mortal_size = 0;
14349
14350    if (interned == NULL || !PyDict_Check(interned))
14351        return;
14352    keys = PyDict_Keys(interned);
14353    if (keys == NULL || !PyList_Check(keys)) {
14354        PyErr_Clear();
14355        return;
14356    }
14357
14358    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14359       detector, interned unicode strings are not forcibly deallocated;
14360       rather, we give them their stolen references back, and then clear
14361       and DECREF the interned dict. */
14362
14363    n = PyList_GET_SIZE(keys);
14364    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
14365            n);
14366    for (i = 0; i < n; i++) {
14367        s = PyList_GET_ITEM(keys, i);
14368        if (PyUnicode_READY(s) == -1) {
14369            assert(0 && "could not ready string");
14370            fprintf(stderr, "could not ready string\n");
14371        }
14372        switch (PyUnicode_CHECK_INTERNED(s)) {
14373        case SSTATE_NOT_INTERNED:
14374            /* XXX Shouldn't happen */
14375            break;
14376        case SSTATE_INTERNED_IMMORTAL:
14377            Py_REFCNT(s) += 1;
14378            immortal_size += PyUnicode_GET_LENGTH(s);
14379            break;
14380        case SSTATE_INTERNED_MORTAL:
14381            Py_REFCNT(s) += 2;
14382            mortal_size += PyUnicode_GET_LENGTH(s);
14383            break;
14384        default:
14385            Py_FatalError("Inconsistent interned string state.");
14386        }
14387        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
14388    }
14389    fprintf(stderr, "total size of all interned strings: "
14390            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14391            "mortal/immortal\n", mortal_size, immortal_size);
14392    Py_DECREF(keys);
14393    PyDict_Clear(interned);
14394    Py_DECREF(interned);
14395    interned = NULL;
14396}
14397
14398
14399/********************* Unicode Iterator **************************/
14400
/* Instance layout of the str iterator type (PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14406
14407static void
14408unicodeiter_dealloc(unicodeiterobject *it)
14409{
14410    _PyObject_GC_UNTRACK(it);
14411    Py_XDECREF(it->it_seq);
14412    PyObject_GC_Del(it);
14413}
14414
14415static int
14416unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14417{
14418    Py_VISIT(it->it_seq);
14419    return 0;
14420}
14421
14422static PyObject *
14423unicodeiter_next(unicodeiterobject *it)
14424{
14425    PyObject *seq, *item;
14426
14427    assert(it != NULL);
14428    seq = it->it_seq;
14429    if (seq == NULL)
14430        return NULL;
14431    assert(_PyUnicode_CHECK(seq));
14432
14433    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14434        int kind = PyUnicode_KIND(seq);
14435        void *data = PyUnicode_DATA(seq);
14436        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14437        item = PyUnicode_FromOrdinal(chr);
14438        if (item != NULL)
14439            ++it->it_index;
14440        return item;
14441    }
14442
14443    Py_DECREF(seq);
14444    it->it_seq = NULL;
14445    return NULL;
14446}
14447
14448static PyObject *
14449unicodeiter_len(unicodeiterobject *it)
14450{
14451    Py_ssize_t len = 0;
14452    if (it->it_seq)
14453        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14454    return PyLong_FromSsize_t(len);
14455}
14456
14457PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14458
14459static PyObject *
14460unicodeiter_reduce(unicodeiterobject *it)
14461{
14462    if (it->it_seq != NULL) {
14463        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
14464                             it->it_seq, it->it_index);
14465    } else {
14466        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14467        if (u == NULL)
14468            return NULL;
14469        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
14470    }
14471}
14472
14473PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14474
14475static PyObject *
14476unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14477{
14478    Py_ssize_t index = PyLong_AsSsize_t(state);
14479    if (index == -1 && PyErr_Occurred())
14480        return NULL;
14481    if (index < 0)
14482        index = 0;
14483    it->it_index = index;
14484    Py_RETURN_NONE;
14485}
14486
14487PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14488
/* Method table of str iterators (used as PyUnicodeIter_Type.tp_methods).
   Each entry gives the Python-level name, the C implementation, the
   calling convention and the docstring. */
static PyMethodDef unicodeiter_methods[] = {
    /* Takes no arguments. */
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    /* Takes no arguments. */
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    /* Takes exactly one argument, the state object. */
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
14498
/* Type object of the iterator returned by unicode_iter() ("str_iterator").

   The initializer is positional; the comment on each entry names the
   slot it fills.  Instances take part in garbage collection
   (Py_TPFLAGS_HAVE_GC together with unicodeiter_traverse). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
14531
14532static PyObject *
14533unicode_iter(PyObject *seq)
14534{
14535    unicodeiterobject *it;
14536
14537    if (!PyUnicode_Check(seq)) {
14538        PyErr_BadInternalCall();
14539        return NULL;
14540    }
14541    if (PyUnicode_READY(seq) == -1)
14542        return NULL;
14543    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14544    if (it == NULL)
14545        return NULL;
14546    it->it_index = 0;
14547    Py_INCREF(seq);
14548    it->it_seq = seq;
14549    _PyObject_GC_TRACK(it);
14550    return (PyObject *)it;
14551}
14552
14553
14554size_t
14555Py_UNICODE_strlen(const Py_UNICODE *u)
14556{
14557    int res = 0;
14558    while(*u++)
14559        res++;
14560    return res;
14561}
14562
14563Py_UNICODE*
14564Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14565{
14566    Py_UNICODE *u = s1;
14567    while ((*u++ = *s2++));
14568    return s1;
14569}
14570
14571Py_UNICODE*
14572Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14573{
14574    Py_UNICODE *u = s1;
14575    while ((*u++ = *s2++))
14576        if (n-- == 0)
14577            break;
14578    return s1;
14579}
14580
14581Py_UNICODE*
14582Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14583{
14584    Py_UNICODE *u1 = s1;
14585    u1 += Py_UNICODE_strlen(u1);
14586    Py_UNICODE_strcpy(u1, s2);
14587    return s1;
14588}
14589
14590int
14591Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14592{
14593    while (*s1 && *s2 && *s1 == *s2)
14594        s1++, s2++;
14595    if (*s1 && *s2)
14596        return (*s1 < *s2) ? -1 : +1;
14597    if (*s1)
14598        return 1;
14599    if (*s2)
14600        return -1;
14601    return 0;
14602}
14603
14604int
14605Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14606{
14607    register Py_UNICODE u1, u2;
14608    for (; n != 0; n--) {
14609        u1 = *s1;
14610        u2 = *s2;
14611        if (u1 != u2)
14612            return (u1 < u2) ? -1 : +1;
14613        if (u1 == '\0')
14614            return 0;
14615        s1++;
14616        s2++;
14617    }
14618    return 0;
14619}
14620
14621Py_UNICODE*
14622Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14623{
14624    const Py_UNICODE *p;
14625    for (p = s; *p; p++)
14626        if (*p == c)
14627            return (Py_UNICODE*)p;
14628    return NULL;
14629}
14630
14631Py_UNICODE*
14632Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14633{
14634    const Py_UNICODE *p;
14635    p = s + Py_UNICODE_strlen(s);
14636    while (p != s) {
14637        p--;
14638        if (*p == c)
14639            return (Py_UNICODE*)p;
14640    }
14641    return NULL;
14642}
14643
14644Py_UNICODE*
14645PyUnicode_AsUnicodeCopy(PyObject *unicode)
14646{
14647    Py_UNICODE *u, *copy;
14648    Py_ssize_t len, size;
14649
14650    if (!PyUnicode_Check(unicode)) {
14651        PyErr_BadArgument();
14652        return NULL;
14653    }
14654    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14655    if (u == NULL)
14656        return NULL;
14657    /* Ensure we won't overflow the size. */
14658    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14659        PyErr_NoMemory();
14660        return NULL;
14661    }
14662    size = len + 1; /* copy the null character */
14663    size *= sizeof(Py_UNICODE);
14664    copy = PyMem_Malloc(size);
14665    if (copy == NULL) {
14666        PyErr_NoMemory();
14667        return NULL;
14668    }
14669    memcpy(copy, u, size);
14670    return copy;
14671}
14672
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions of the _string module.  Both entries use METH_O, i.e. they
   take exactly one argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
14683
/* Definition of the _string module, passed to PyModule_Create() by
   PyInit__string().  Apart from the name, the docstring and the method
   table, all remaining entries are 0/NULL. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,
    _string_methods,                    /* module-level functions */
    NULL,
    NULL,
    NULL,
    NULL
};
14695
14696PyMODINIT_FUNC
14697PyInit__string(void)
14698{
14699    return PyModule_Create(&_string_module);
14700}
14701
14702
14703#ifdef __cplusplus
14704}
14705#endif
14706