unicodeobject.c revision c6cf1ba29ea75d924fc4644e4f4383a71e146f22
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/* --- Globals ------------------------------------------------------------
51
52   The globals are initialized by the _PyUnicode_Init() API and should
53   not be used before calling that API.
54
55*/
56
57
58#ifdef __cplusplus
59extern "C" {
60#endif
61
62/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
63#define MAX_UNICODE 0x10ffff
64
/* Object validation used inside assert() by the macros below: debug builds
   run the structural consistency check (check_content == 0, so the
   characters themselves are not scanned), other builds only check the
   object type. */
65#ifdef Py_DEBUG
66#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
67#else
68#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
69#endif
70
/* Field accessors.  The variants starting with an underscore are plain
   member accesses through a cast (no validation; most of them are usable
   as lvalues).  The other variants first assert that the object passes
   _PyUnicode_CHECK(). */

/* utf8 member of the PyCompactUnicodeObject header. */
71#define _PyUnicode_UTF8(op)                             \
72    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 pointer of a ready string: for a compact ASCII string this is the
   memory located directly after the PyASCIIObject header, otherwise the
   utf8 member. */
73#define PyUnicode_UTF8(op)                              \
74    (assert(_PyUnicode_CHECK(op)),                      \
75     assert(PyUnicode_IS_READY(op)),                    \
76     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
77         ((char*)((PyASCIIObject*)(op) + 1)) :          \
78         _PyUnicode_UTF8(op))
/* utf8_length member of the PyCompactUnicodeObject header. */
79#define _PyUnicode_UTF8_LENGTH(op)                      \
80    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for a compact ASCII string this is the
   length member, otherwise the utf8_length member. */
81#define PyUnicode_UTF8_LENGTH(op)                       \
82    (assert(_PyUnicode_CHECK(op)),                      \
83     assert(PyUnicode_IS_READY(op)),                    \
84     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
85         ((PyASCIIObject*)(op))->length :               \
86         _PyUnicode_UTF8_LENGTH(op))
/* wstr member (PyASCIIObject) and wstr_length member
   (PyCompactUnicodeObject). */
87#define _PyUnicode_WSTR(op)                             \
88    (((PyASCIIObject*)(op))->wstr)
89#define _PyUnicode_WSTR_LENGTH(op)                      \
90    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length, state and hash members of the PyASCIIObject header. */
91#define _PyUnicode_LENGTH(op)                           \
92    (((PyASCIIObject *)(op))->length)
93#define _PyUnicode_STATE(op)                            \
94    (((PyASCIIObject *)(op))->state)
95#define _PyUnicode_HASH(op)                             \
96    (((PyASCIIObject *)(op))->hash)
/* state.kind and length, read after asserting _PyUnicode_CHECK(op); unlike
   PyUnicode_UTF8() these do not assert that the string is ready. */
97#define _PyUnicode_KIND(op)                             \
98    (assert(_PyUnicode_CHECK(op)),                      \
99     ((PyASCIIObject *)(op))->state.kind)
100#define _PyUnicode_GET_LENGTH(op)                       \
101    (assert(_PyUnicode_CHECK(op)),                      \
102     ((PyASCIIObject *)(op))->length)
/* data.any member of the full PyUnicodeObject structure. */
103#define _PyUnicode_DATA_ANY(op)                         \
104    (((PyUnicodeObject*)(op))->data.any)
105
106/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().

   The result is the bitwise OR of both values, which is not the exact
   maximum: it is always >= both arguments and its highest set bit is the
   highest set bit of the larger argument. */
108#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
109    ((maxchar1) | (maxchar2))
110
/* File-local replacement for PyUnicode_READY(): the previous definition is
   discarded and this one validates the object with _PyUnicode_CHECK()
   inside an assert.  Evaluates to 0 if the string is already ready,
   otherwise to the result of _PyUnicode_Ready(op). */
111#undef PyUnicode_READY
112#define PyUnicode_READY(op)                             \
113    (assert(_PyUnicode_CHECK(op)),                      \
114     (PyUnicode_IS_READY(op) ?                          \
115      0 :                                               \
116      _PyUnicode_Ready(op)))
117
/* True if the utf8 pointer of the object is the same pointer as its
   character data (no separate UTF-8 buffer).  Must not be used on compact
   ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the object is the same pointer as its
   character data (no separate wchar_t buffer).  Every use of the object
   goes through the macro parameter, so the macro works whatever the
   caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
125
126/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data).  Always false for compact ASCII strings;
   otherwise the utf8 pointer must be non-NULL and different from the
   character data pointer. */
128#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
129    (assert(_PyUnicode_CHECK(op)),                      \
130     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
131      && _PyUnicode_UTF8(op)                            \
132      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
133
134/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data).  A non-NULL wstr of a string that is not
   ready always counts as allocated; for a ready string it counts only when
   it differs from the character data pointer. */
136#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
137    (assert(_PyUnicode_CHECK(op)),                      \
138     (_PyUnicode_WSTR(op) &&                            \
139      (!PyUnicode_IS_READY(op) ||                       \
140       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each argument is evaluated exactly once.  "to" is parenthesized before
   being cast, so a compound expression such as "buf + offset" is converted
   as a whole instead of having the cast bind to "buf" only.  The main loop
   copies four characters per iteration; the trailing loop handles the
   remaining 0..3 characters.  All internal names start with an underscore
   to keep them apart from the caller's identifiers. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
165
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
174static PyObject *interned;
175
176/* The empty Unicode object is shared to improve performance. */
177static PyObject *unicode_empty;
178
179/* List of static strings. */
180static _Py_Identifier *static_strings;
181
182/* Single character Unicode strings in the Latin-1 range are being
183   shared as well. */
184static PyObject *unicode_latin1[256];
185
/* Fast detection of the most frequent whitespace characters: one entry per
   ASCII code point (0x00..0x7F), non-zero when the code point is treated
   as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
216
217/* forward */
218static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
219static PyObject* get_latin1_char(unsigned char ch);
220static int unicode_modifiable(PyObject *unicode);
221
222
223static PyObject *
224_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
225static PyObject *
226_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
227static PyObject *
228_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
229
230static PyObject *
231unicode_encode_call_errorhandler(const char *errors,
232       PyObject **errorHandler,const char *encoding, const char *reason,
233       PyObject *unicode, PyObject **exceptionObject,
234       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
235
236static void
237raise_encode_exception(PyObject **exceptionObject,
238                       const char *encoding,
239                       PyObject *unicode,
240                       Py_ssize_t startpos, Py_ssize_t endpos,
241                       const char *reason);
242
/* Fast detection of line breaks among the ASCII characters: one entry per
   ASCII code point (0x00..0x7F), non-zero when the code point is a line
   break.  The table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
270
271/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
272   This function is kept for backward compatibility with the old API. */
273Py_UNICODE
274PyUnicode_GetMax(void)
275{
276#ifdef Py_UNICODE_WIDE
277    return 0x10FFFF;
278#else
279    /* This is actually an illegal character, so it should
280       not be passed to unichr. */
281    return 0xFFFF;
282#endif
283}
284
285#ifdef Py_DEBUG
/* Debug-only validator of the internal invariants of a unicode object.

   Every invariant is verified with assert(), so a violation aborts; when
   all checks pass the function returns 1, which allows callers to write
   assert(_PyUnicode_CheckConsistency(op, flag)).

   op:            object to check; must pass PyUnicode_Check().
   check_content: if non-zero (and the string is not in the wchar_t-only
                  state), the characters are scanned as well to verify
                  that the narrowest possible kind is used and that the
                  buffer ends with a null character. */
286int
287_PyUnicode_CheckConsistency(PyObject *op, int check_content)
288{
289    PyASCIIObject *ascii;
290    unsigned int kind;
291
292    assert(PyUnicode_Check(op));
293
294    ascii = (PyASCIIObject *)op;
295    kind = ascii->state.kind;
296
    /* Compact ASCII string: must use the 1-byte kind and be ready. */
297    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
298        assert(kind == PyUnicode_1BYTE_KIND);
299        assert(ascii->state.ready == 1);
300    }
301    else {
        /* Every other layout is accessed through the
           PyCompactUnicodeObject fields (utf8, utf8_length, wstr_length). */
302        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
303        void *data;
304
305        if (ascii->state.compact == 1) {
            /* Compact non-ASCII string: the characters start right after
               the PyCompactUnicodeObject header, and the UTF-8 pointer
               never aliases them. */
306            data = compact + 1;
307            assert(kind == PyUnicode_1BYTE_KIND
308                   || kind == PyUnicode_2BYTE_KIND
309                   || kind == PyUnicode_4BYTE_KIND);
310            assert(ascii->state.ascii == 0);
311            assert(ascii->state.ready == 1);
312            assert (compact->utf8 != data);
313        }
314        else {
            /* Non-compact string: the characters are reached through
               data.any. */
315            PyUnicodeObject *unicode = (PyUnicodeObject *)op;
316
317            data = unicode->data.any;
318            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer exists, every other
                   field still has its initial value. */
319                assert(ascii->length == 0);
320                assert(ascii->hash == -1);
321                assert(ascii->state.compact == 0);
322                assert(ascii->state.ascii == 0);
323                assert(ascii->state.ready == 0);
324                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
325                assert(ascii->wstr != NULL);
326                assert(data == NULL);
327                assert(compact->utf8 == NULL);
328            }
329            else {
                /* Ready non-compact string: a data buffer exists; the UTF-8
                   pointer aliases it exactly when the string is ASCII. */
330                assert(kind == PyUnicode_1BYTE_KIND
331                       || kind == PyUnicode_2BYTE_KIND
332                       || kind == PyUnicode_4BYTE_KIND);
333                assert(ascii->state.compact == 0);
334                assert(ascii->state.ready == 1);
335                assert(data != NULL);
336                if (ascii->state.ascii) {
337                    assert (compact->utf8 == data);
338                    assert (compact->utf8_length == ascii->length);
339                }
340                else
341                    assert (compact->utf8 != data);
342            }
343        }
        /* wstr aliases the character data exactly when the kind has the
           same character size as wchar_t. */
344        if (kind != PyUnicode_WCHAR_KIND) {
345            if (
346#if SIZEOF_WCHAR_T == 2
347                kind == PyUnicode_2BYTE_KIND
348#else
349                kind == PyUnicode_4BYTE_KIND
350#endif
351               )
352            {
353                assert(ascii->wstr == data);
354                assert(compact->wstr_length == ascii->length);
355            } else
356                assert(ascii->wstr != data);
357        }
358
        /* A missing cache must have a zero length. */
359        if (compact->utf8 == NULL)
360            assert(compact->utf8_length == 0);
361        if (ascii->wstr == NULL)
362            assert(compact->wstr_length == 0);
363    }
364    /* check that the best kind is used */
365    if (check_content && kind != PyUnicode_WCHAR_KIND)
366    {
367        Py_ssize_t i;
368        Py_UCS4 maxchar = 0;
369        void *data;
370        Py_UCS4 ch;
371
372        data = PyUnicode_DATA(ascii);
        /* Find the largest character actually stored. */
373        for (i=0; i < ascii->length; i++)
374        {
375            ch = PyUnicode_READ(kind, data, i);
376            if (ch > maxchar)
377                maxchar = ch;
378        }
        /* The largest character must fall in the range reserved for the
           kind (and for the ascii flag in the 1-byte case). */
379        if (kind == PyUnicode_1BYTE_KIND) {
380            if (ascii->state.ascii == 0) {
381                assert(maxchar >= 128);
382                assert(maxchar <= 255);
383            }
384            else
385                assert(maxchar < 128);
386        }
387        else if (kind == PyUnicode_2BYTE_KIND) {
388            assert(maxchar >= 0x100);
389            assert(maxchar <= 0xFFFF);
390        }
391        else {
392            assert(maxchar >= 0x10000);
393            assert(maxchar <= MAX_UNICODE);
394        }
        /* The character following the last one must be a null character. */
395        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
396    }
397    return 1;
398}
399#endif
400
401static PyObject*
402unicode_result_wchar(PyObject *unicode)
403{
404#ifndef Py_DEBUG
405    Py_ssize_t len;
406
407    len = _PyUnicode_WSTR_LENGTH(unicode);
408    if (len == 0) {
409        Py_INCREF(unicode_empty);
410        Py_DECREF(unicode);
411        return unicode_empty;
412    }
413
414    if (len == 1) {
415        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
416        if (ch < 256) {
417            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
418            Py_DECREF(unicode);
419            return latin1_char;
420        }
421    }
422
423    if (_PyUnicode_Ready(unicode) < 0) {
424        Py_DECREF(unicode);
425        return NULL;
426    }
427#else
428    assert(Py_REFCNT(unicode) == 1);
429
430    /* don't make the result ready in debug mode to ensure that the caller
431       makes the string ready before using it */
432    assert(_PyUnicode_CheckConsistency(unicode, 1));
433#endif
434    return unicode;
435}
436
437static PyObject*
438unicode_result_ready(PyObject *unicode)
439{
440    Py_ssize_t length;
441
442    length = PyUnicode_GET_LENGTH(unicode);
443    if (length == 0) {
444        if (unicode != unicode_empty) {
445            Py_INCREF(unicode_empty);
446            Py_DECREF(unicode);
447        }
448        return unicode_empty;
449    }
450
451    if (length == 1) {
452        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
453        if (ch < 256) {
454            PyObject *latin1_char = unicode_latin1[ch];
455            if (latin1_char != NULL) {
456                if (unicode != latin1_char) {
457                    Py_INCREF(latin1_char);
458                    Py_DECREF(unicode);
459                }
460                return latin1_char;
461            }
462            else {
463                assert(_PyUnicode_CheckConsistency(unicode, 1));
464                Py_INCREF(unicode);
465                unicode_latin1[ch] = unicode;
466                return unicode;
467            }
468        }
469    }
470
471    assert(_PyUnicode_CheckConsistency(unicode, 1));
472    return unicode;
473}
474
475static PyObject*
476unicode_result(PyObject *unicode)
477{
478    assert(_PyUnicode_CHECK(unicode));
479    if (PyUnicode_IS_READY(unicode))
480        return unicode_result_ready(unicode);
481    else
482        return unicode_result_wchar(unicode);
483}
484
485static PyObject*
486unicode_result_unchanged(PyObject *unicode)
487{
488    if (PyUnicode_CheckExact(unicode)) {
489        if (PyUnicode_READY(unicode) == -1)
490            return NULL;
491        Py_INCREF(unicode);
492        return unicode;
493    }
494    else
495        /* Subtype -- return genuine unicode string with the same value. */
496        return _PyUnicode_Copy(unicode);
497}
498
499#ifdef HAVE_MBCS
500static OSVERSIONINFOEX winver;
501#endif
502
503/* --- Bloom Filters ----------------------------------------------------- */
504
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each unicode character (masked with BLOOM_WIDTH - 1, i.e. the
   lowest 5, 6 or 7 bits) as the bit index. */
508
509/* the linebreak mask is set up by Unicode_Init below */
510
/* Number of bits used by a bloom mask: the largest of 128, 64 and 32 that
   does not exceed LONG_BIT.  The build is rejected when LONG_BIT is
   smaller than 32. */
511#if LONG_BIT >= 128
512#define BLOOM_WIDTH 128
513#elif LONG_BIT >= 64
514#define BLOOM_WIDTH 64
515#elif LONG_BIT >= 32
516#define BLOOM_WIDTH 32
517#else
518#error "LONG_BIT is smaller than 32"
519#endif
520
/* Integer type used to store a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK() for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by the low bits of ch in the
   lvalue mask.
   BLOOM(mask, ch): non-zero if that bit is set in mask, i.e. if ch *may*
   be one of the characters added to the mask; zero means it definitely
   is not.
   Both arguments are parenthesized so that compound expressions can be
   passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* True if ch is a line break: ASCII characters are looked up in the
   ascii_linebreak table, other characters are first filtered through the
   bloom mask and then confirmed with Py_UNICODE_ISLINEBREAK().
   ch is evaluated several times. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
531
532Py_LOCAL_INLINE(BLOOM_MASK)
533make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
534{
535    /* calculate simple bloom-style bitmask for a given unicode string */
536
537    BLOOM_MASK mask;
538    Py_ssize_t i;
539
540    mask = 0;
541    for (i = 0; i < len; i++)
542        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
543
544    return mask;
545}
546
/* True if the character chr occurs in the string str.  The bloom mask is
   tested first, so PyUnicode_FindChar() is only called when the mask does
   not already rule the character out.  chr and str are evaluated more
   than once. */
547#define BLOOM_MEMBER(mask, chr, str) \
548    (BLOOM(mask, chr) \
549     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
550
551/* Compilation of templated routines */
552
553#include "stringlib/asciilib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs1lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs2lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
583#include "stringlib/ucs4lib.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/partition.h"
586#include "stringlib/split.h"
587#include "stringlib/count.h"
588#include "stringlib/find.h"
589#include "stringlib/find_max_char.h"
590#include "stringlib/localeutil.h"
591#include "stringlib/undef.h"
592
593#include "stringlib/unicodedefs.h"
594#include "stringlib/fastsearch.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/undef.h"
598
599/* --- Unicode Object ----------------------------------------------------- */
600
601static PyObject *
602fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
603
604Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
605                                     Py_ssize_t size, Py_UCS4 ch,
606                                     int direction)
607{
608    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
609
610    switch (kind) {
611    case PyUnicode_1BYTE_KIND:
612        {
613            Py_UCS1 ch1 = (Py_UCS1) ch;
614            if (ch1 == ch)
615                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
616            else
617                return -1;
618        }
619    case PyUnicode_2BYTE_KIND:
620        {
621            Py_UCS2 ch2 = (Py_UCS2) ch;
622            if (ch2 == ch)
623                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
624            else
625                return -1;
626        }
627    case PyUnicode_4BYTE_KIND:
628        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
629    default:
630        assert(0);
631        return -1;
632    }
633}
634
#ifdef Py_DEBUG
/* Overwrite the characters added by a resize (indexes old_length up to,
   but not including, the current length) with 0xff bytes, in order to
   detect the use of uninitialized characters earlier.  Nothing is done
   when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF
   is an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
653
654static PyObject*
655resize_compact(PyObject *unicode, Py_ssize_t length)
656{
657    Py_ssize_t char_size;
658    Py_ssize_t struct_size;
659    Py_ssize_t new_size;
660    int share_wstr;
661    PyObject *new_unicode;
662#ifdef Py_DEBUG
663    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
664#endif
665
666    assert(unicode_modifiable(unicode));
667    assert(PyUnicode_IS_READY(unicode));
668    assert(PyUnicode_IS_COMPACT(unicode));
669
670    char_size = PyUnicode_KIND(unicode);
671    if (PyUnicode_IS_ASCII(unicode))
672        struct_size = sizeof(PyASCIIObject);
673    else
674        struct_size = sizeof(PyCompactUnicodeObject);
675    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
676
677    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
678        PyErr_NoMemory();
679        return NULL;
680    }
681    new_size = (struct_size + (length + 1) * char_size);
682
683    _Py_DEC_REFTOTAL;
684    _Py_ForgetReference(unicode);
685
686    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
687    if (new_unicode == NULL) {
688        _Py_NewReference(unicode);
689        PyErr_NoMemory();
690        return NULL;
691    }
692    unicode = new_unicode;
693    _Py_NewReference(unicode);
694
695    _PyUnicode_LENGTH(unicode) = length;
696    if (share_wstr) {
697        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
698        if (!PyUnicode_IS_ASCII(unicode))
699            _PyUnicode_WSTR_LENGTH(unicode) = length;
700    }
701#ifdef Py_DEBUG
702    unicode_fill_invalid(unicode, old_length);
703#endif
704    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705                    length, 0);
706    assert(_PyUnicode_CheckConsistency(unicode, 0));
707    return unicode;
708}
709
710static int
711resize_inplace(PyObject *unicode, Py_ssize_t length)
712{
713    wchar_t *wstr;
714    Py_ssize_t new_size;
715    assert(!PyUnicode_IS_COMPACT(unicode));
716    assert(Py_REFCNT(unicode) == 1);
717
718    if (PyUnicode_IS_READY(unicode)) {
719        Py_ssize_t char_size;
720        int share_wstr, share_utf8;
721        void *data;
722#ifdef Py_DEBUG
723        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
724#endif
725
726        data = _PyUnicode_DATA_ANY(unicode);
727        char_size = PyUnicode_KIND(unicode);
728        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
729        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
730
731        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
732            PyErr_NoMemory();
733            return -1;
734        }
735        new_size = (length + 1) * char_size;
736
737        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
738        {
739            PyObject_DEL(_PyUnicode_UTF8(unicode));
740            _PyUnicode_UTF8(unicode) = NULL;
741            _PyUnicode_UTF8_LENGTH(unicode) = 0;
742        }
743
744        data = (PyObject *)PyObject_REALLOC(data, new_size);
745        if (data == NULL) {
746            PyErr_NoMemory();
747            return -1;
748        }
749        _PyUnicode_DATA_ANY(unicode) = data;
750        if (share_wstr) {
751            _PyUnicode_WSTR(unicode) = data;
752            _PyUnicode_WSTR_LENGTH(unicode) = length;
753        }
754        if (share_utf8) {
755            _PyUnicode_UTF8(unicode) = data;
756            _PyUnicode_UTF8_LENGTH(unicode) = length;
757        }
758        _PyUnicode_LENGTH(unicode) = length;
759        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
760#ifdef Py_DEBUG
761        unicode_fill_invalid(unicode, old_length);
762#endif
763        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
764            assert(_PyUnicode_CheckConsistency(unicode, 0));
765            return 0;
766        }
767    }
768    assert(_PyUnicode_WSTR(unicode) != NULL);
769
770    /* check for integer overflow */
771    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
772        PyErr_NoMemory();
773        return -1;
774    }
775    new_size = sizeof(wchar_t) * (length + 1);
776    wstr =  _PyUnicode_WSTR(unicode);
777    wstr = PyObject_REALLOC(wstr, new_size);
778    if (!wstr) {
779        PyErr_NoMemory();
780        return -1;
781    }
782    _PyUnicode_WSTR(unicode) = wstr;
783    _PyUnicode_WSTR(unicode)[length] = 0;
784    _PyUnicode_WSTR_LENGTH(unicode) = length;
785    assert(_PyUnicode_CheckConsistency(unicode, 0));
786    return 0;
787}
788
789static PyObject*
790resize_copy(PyObject *unicode, Py_ssize_t length)
791{
792    Py_ssize_t copy_length;
793    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
794        PyObject *copy;
795
796        if (PyUnicode_READY(unicode) == -1)
797            return NULL;
798
799        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
800        if (copy == NULL)
801            return NULL;
802
803        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
804        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
805        return copy;
806    }
807    else {
808        PyObject *w;
809
810        w = (PyObject*)_PyUnicode_New(length);
811        if (w == NULL)
812            return NULL;
813        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
814        copy_length = Py_MIN(copy_length, length);
815        Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
816                  copy_length * sizeof(wchar_t));
817        return w;
818    }
819}
820
821/* We allocate one more byte to make sure the string is
822   Ux0000 terminated; some code (e.g. new_identifier)
823   relies on that.
824
825   XXX This allocator could further be enhanced by assuring that the
826   free list never reduces its size below 1.
827
828*/
829
830static PyUnicodeObject *
831_PyUnicode_New(Py_ssize_t length)
832{
833    register PyUnicodeObject *unicode;
834    size_t new_size;
835
836    /* Optimization for empty strings */
837    if (length == 0 && unicode_empty != NULL) {
838        Py_INCREF(unicode_empty);
839        return (PyUnicodeObject*)unicode_empty;
840    }
841
842    /* Ensure we won't overflow the size. */
843    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
844        return (PyUnicodeObject *)PyErr_NoMemory();
845    }
846    if (length < 0) {
847        PyErr_SetString(PyExc_SystemError,
848                        "Negative size passed to _PyUnicode_New");
849        return NULL;
850    }
851
852    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
853    if (unicode == NULL)
854        return NULL;
855    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
856    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
857    if (!_PyUnicode_WSTR(unicode)) {
858        Py_DECREF(unicode);
859        PyErr_NoMemory();
860        return NULL;
861    }
862
863    /* Initialize the first element to guard against cases where
864     * the caller fails before initializing str -- unicode_resize()
865     * reads str[0], and the Keep-Alive optimization can keep memory
866     * allocated for str alive across a call to unicode_dealloc(unicode).
867     * We don't want unicode_resize to read uninitialized memory in
868     * that case.
869     */
870    _PyUnicode_WSTR(unicode)[0] = 0;
871    _PyUnicode_WSTR(unicode)[length] = 0;
872    _PyUnicode_WSTR_LENGTH(unicode) = length;
873    _PyUnicode_HASH(unicode) = -1;
874    _PyUnicode_STATE(unicode).interned = 0;
875    _PyUnicode_STATE(unicode).kind = 0;
876    _PyUnicode_STATE(unicode).compact = 0;
877    _PyUnicode_STATE(unicode).ready = 0;
878    _PyUnicode_STATE(unicode).ascii = 0;
879    _PyUnicode_DATA_ANY(unicode) = NULL;
880    _PyUnicode_LENGTH(unicode) = 0;
881    _PyUnicode_UTF8(unicode) = NULL;
882    _PyUnicode_UTF8_LENGTH(unicode) = 0;
883    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
884    return unicode;
885}
886
887static const char*
888unicode_kind_name(PyObject *unicode)
889{
890    /* don't check consistency: unicode_kind_name() is called from
891       _PyUnicode_Dump() */
892    if (!PyUnicode_IS_COMPACT(unicode))
893    {
894        if (!PyUnicode_IS_READY(unicode))
895            return "wstr";
896        switch (PyUnicode_KIND(unicode))
897        {
898        case PyUnicode_1BYTE_KIND:
899            if (PyUnicode_IS_ASCII(unicode))
900                return "legacy ascii";
901            else
902                return "legacy latin1";
903        case PyUnicode_2BYTE_KIND:
904            return "legacy UCS2";
905        case PyUnicode_4BYTE_KIND:
906            return "legacy UCS4";
907        default:
908            return "<legacy invalid kind>";
909        }
910    }
911    assert(PyUnicode_IS_READY(unicode));
912    switch (PyUnicode_KIND(unicode)) {
913    case PyUnicode_1BYTE_KIND:
914        if (PyUnicode_IS_ASCII(unicode))
915            return "ascii";
916        else
917            return "latin1";
918    case PyUnicode_2BYTE_KIND:
919        return "UCS2";
920    case PyUnicode_4BYTE_KIND:
921        return "UCS4";
922    default:
923        return "<invalid compact kind>";
924    }
925}
926
927#ifdef Py_DEBUG
928/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
932
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
936void *_PyUnicode_data(void *unicode){
937    printf("obj %p\n", unicode);
938    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
939    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
940    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
941    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
942    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
943    return PyUnicode_DATA(unicode);
944}
945
946void
947_PyUnicode_Dump(PyObject *op)
948{
949    PyASCIIObject *ascii = (PyASCIIObject *)op;
950    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
951    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
952    void *data;
953
954    if (ascii->state.compact)
955    {
956        if (ascii->state.ascii)
957            data = (ascii + 1);
958        else
959            data = (compact + 1);
960    }
961    else
962        data = unicode->data.any;
963    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
964
965    if (ascii->wstr == data)
966        printf("shared ");
967    printf("wstr=%p", ascii->wstr);
968
969    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
970        printf(" (%zu), ", compact->wstr_length);
971        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
972            printf("shared ");
973        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
974    }
975    printf(", data=%p\n", data);
976}
977#endif
978
979PyObject *
980PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
981{
982    PyObject *obj;
983    PyCompactUnicodeObject *unicode;
984    void *data;
985    enum PyUnicode_Kind kind;
986    int is_sharing, is_ascii;
987    Py_ssize_t char_size;
988    Py_ssize_t struct_size;
989
990    /* Optimization for empty strings */
991    if (size == 0 && unicode_empty != NULL) {
992        Py_INCREF(unicode_empty);
993        return unicode_empty;
994    }
995
996    is_ascii = 0;
997    is_sharing = 0;
998    struct_size = sizeof(PyCompactUnicodeObject);
999    if (maxchar < 128) {
1000        kind = PyUnicode_1BYTE_KIND;
1001        char_size = 1;
1002        is_ascii = 1;
1003        struct_size = sizeof(PyASCIIObject);
1004    }
1005    else if (maxchar < 256) {
1006        kind = PyUnicode_1BYTE_KIND;
1007        char_size = 1;
1008    }
1009    else if (maxchar < 65536) {
1010        kind = PyUnicode_2BYTE_KIND;
1011        char_size = 2;
1012        if (sizeof(wchar_t) == 2)
1013            is_sharing = 1;
1014    }
1015    else {
1016        if (maxchar > MAX_UNICODE) {
1017            PyErr_SetString(PyExc_SystemError,
1018                            "invalid maximum character passed to PyUnicode_New");
1019            return NULL;
1020        }
1021        kind = PyUnicode_4BYTE_KIND;
1022        char_size = 4;
1023        if (sizeof(wchar_t) == 4)
1024            is_sharing = 1;
1025    }
1026
1027    /* Ensure we won't overflow the size. */
1028    if (size < 0) {
1029        PyErr_SetString(PyExc_SystemError,
1030                        "Negative size passed to PyUnicode_New");
1031        return NULL;
1032    }
1033    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1034        return PyErr_NoMemory();
1035
1036    /* Duplicated allocation code from _PyObject_New() instead of a call to
1037     * PyObject_New() so we are able to allocate space for the object and
1038     * it's data buffer.
1039     */
1040    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1041    if (obj == NULL)
1042        return PyErr_NoMemory();
1043    obj = PyObject_INIT(obj, &PyUnicode_Type);
1044    if (obj == NULL)
1045        return NULL;
1046
1047    unicode = (PyCompactUnicodeObject *)obj;
1048    if (is_ascii)
1049        data = ((PyASCIIObject*)obj) + 1;
1050    else
1051        data = unicode + 1;
1052    _PyUnicode_LENGTH(unicode) = size;
1053    _PyUnicode_HASH(unicode) = -1;
1054    _PyUnicode_STATE(unicode).interned = 0;
1055    _PyUnicode_STATE(unicode).kind = kind;
1056    _PyUnicode_STATE(unicode).compact = 1;
1057    _PyUnicode_STATE(unicode).ready = 1;
1058    _PyUnicode_STATE(unicode).ascii = is_ascii;
1059    if (is_ascii) {
1060        ((char*)data)[size] = 0;
1061        _PyUnicode_WSTR(unicode) = NULL;
1062    }
1063    else if (kind == PyUnicode_1BYTE_KIND) {
1064        ((char*)data)[size] = 0;
1065        _PyUnicode_WSTR(unicode) = NULL;
1066        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1067        unicode->utf8 = NULL;
1068        unicode->utf8_length = 0;
1069    }
1070    else {
1071        unicode->utf8 = NULL;
1072        unicode->utf8_length = 0;
1073        if (kind == PyUnicode_2BYTE_KIND)
1074            ((Py_UCS2*)data)[size] = 0;
1075        else /* kind == PyUnicode_4BYTE_KIND */
1076            ((Py_UCS4*)data)[size] = 0;
1077        if (is_sharing) {
1078            _PyUnicode_WSTR_LENGTH(unicode) = size;
1079            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1080        }
1081        else {
1082            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1083            _PyUnicode_WSTR(unicode) = NULL;
1084        }
1085    }
1086#ifdef Py_DEBUG
1087    unicode_fill_invalid((PyObject*)unicode, 0);
1088#endif
1089    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1090    return obj;
1091}
1092
1093#if SIZEOF_WCHAR_T == 2
1094/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1095   will decode surrogate pairs, the other conversions are implemented as macros
1096   for efficiency.
1097
1098   This function assumes that unicode can hold one more code point than wstr
1099   characters for a terminating null character. */
1100static void
1101unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1102                              PyObject *unicode)
1103{
1104    const wchar_t *iter;
1105    Py_UCS4 *ucs4_out;
1106
1107    assert(unicode != NULL);
1108    assert(_PyUnicode_CHECK(unicode));
1109    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1110    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1111
1112    for (iter = begin; iter < end; ) {
1113        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1114                           _PyUnicode_GET_LENGTH(unicode)));
1115        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1116            && (iter+1) < end
1117            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1118        {
1119            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1120            iter += 2;
1121        }
1122        else {
1123            *ucs4_out++ = *iter;
1124            iter++;
1125        }
1126    }
1127    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1128                        _PyUnicode_GET_LENGTH(unicode)));
1129
1130}
1131#endif
1132
1133static int
1134unicode_check_modifiable(PyObject *unicode)
1135{
1136    if (!unicode_modifiable(unicode)) {
1137        PyErr_SetString(PyExc_SystemError,
1138                        "Cannot modify a string currently used");
1139        return -1;
1140    }
1141    return 0;
1142}
1143
1144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146                 PyObject *from, Py_ssize_t from_start,
1147                 Py_ssize_t how_many, int check_maxchar)
1148{
1149    unsigned int from_kind, to_kind;
1150    void *from_data, *to_data;
1151
1152    assert(0 <= how_many);
1153    assert(0 <= from_start);
1154    assert(0 <= to_start);
1155    assert(PyUnicode_Check(from));
1156    assert(PyUnicode_IS_READY(from));
1157    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1158
1159    assert(PyUnicode_Check(to));
1160    assert(PyUnicode_IS_READY(to));
1161    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1162
1163    if (how_many == 0)
1164        return 0;
1165
1166    from_kind = PyUnicode_KIND(from);
1167    from_data = PyUnicode_DATA(from);
1168    to_kind = PyUnicode_KIND(to);
1169    to_data = PyUnicode_DATA(to);
1170
1171#ifdef Py_DEBUG
1172    if (!check_maxchar
1173        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1174    {
1175        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1176        Py_UCS4 ch;
1177        Py_ssize_t i;
1178        for (i=0; i < how_many; i++) {
1179            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1180            assert(ch <= to_maxchar);
1181        }
1182    }
1183#endif
1184
1185    if (from_kind == to_kind) {
1186        if (check_maxchar
1187            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1188        {
1189            /* Writing Latin-1 characters into an ASCII string requires to
1190               check that all written characters are pure ASCII */
1191            Py_UCS4 max_char;
1192            max_char = ucs1lib_find_max_char(from_data,
1193                                             (Py_UCS1*)from_data + how_many);
1194            if (max_char >= 128)
1195                return -1;
1196        }
1197        Py_MEMCPY((char*)to_data + to_kind * to_start,
1198                  (char*)from_data + from_kind * from_start,
1199                  to_kind * how_many);
1200    }
1201    else if (from_kind == PyUnicode_1BYTE_KIND
1202             && to_kind == PyUnicode_2BYTE_KIND)
1203    {
1204        _PyUnicode_CONVERT_BYTES(
1205            Py_UCS1, Py_UCS2,
1206            PyUnicode_1BYTE_DATA(from) + from_start,
1207            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1208            PyUnicode_2BYTE_DATA(to) + to_start
1209            );
1210    }
1211    else if (from_kind == PyUnicode_1BYTE_KIND
1212             && to_kind == PyUnicode_4BYTE_KIND)
1213    {
1214        _PyUnicode_CONVERT_BYTES(
1215            Py_UCS1, Py_UCS4,
1216            PyUnicode_1BYTE_DATA(from) + from_start,
1217            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1218            PyUnicode_4BYTE_DATA(to) + to_start
1219            );
1220    }
1221    else if (from_kind == PyUnicode_2BYTE_KIND
1222             && to_kind == PyUnicode_4BYTE_KIND)
1223    {
1224        _PyUnicode_CONVERT_BYTES(
1225            Py_UCS2, Py_UCS4,
1226            PyUnicode_2BYTE_DATA(from) + from_start,
1227            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1228            PyUnicode_4BYTE_DATA(to) + to_start
1229            );
1230    }
1231    else {
1232        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1233
1234        if (!check_maxchar) {
1235            if (from_kind == PyUnicode_2BYTE_KIND
1236                && to_kind == PyUnicode_1BYTE_KIND)
1237            {
1238                _PyUnicode_CONVERT_BYTES(
1239                    Py_UCS2, Py_UCS1,
1240                    PyUnicode_2BYTE_DATA(from) + from_start,
1241                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1242                    PyUnicode_1BYTE_DATA(to) + to_start
1243                    );
1244            }
1245            else if (from_kind == PyUnicode_4BYTE_KIND
1246                     && to_kind == PyUnicode_1BYTE_KIND)
1247            {
1248                _PyUnicode_CONVERT_BYTES(
1249                    Py_UCS4, Py_UCS1,
1250                    PyUnicode_4BYTE_DATA(from) + from_start,
1251                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1252                    PyUnicode_1BYTE_DATA(to) + to_start
1253                    );
1254            }
1255            else if (from_kind == PyUnicode_4BYTE_KIND
1256                     && to_kind == PyUnicode_2BYTE_KIND)
1257            {
1258                _PyUnicode_CONVERT_BYTES(
1259                    Py_UCS4, Py_UCS2,
1260                    PyUnicode_4BYTE_DATA(from) + from_start,
1261                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1262                    PyUnicode_2BYTE_DATA(to) + to_start
1263                    );
1264            }
1265            else {
1266                assert(0);
1267                return -1;
1268            }
1269        }
1270        else {
1271            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1272            Py_UCS4 ch;
1273            Py_ssize_t i;
1274
1275            for (i=0; i < how_many; i++) {
1276                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1277                if (ch > to_maxchar)
1278                    return -1;
1279                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1280            }
1281        }
1282    }
1283    return 0;
1284}
1285
1286void
1287_PyUnicode_FastCopyCharacters(
1288    PyObject *to, Py_ssize_t to_start,
1289    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1290{
1291    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1292}
1293
1294Py_ssize_t
1295PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1296                         PyObject *from, Py_ssize_t from_start,
1297                         Py_ssize_t how_many)
1298{
1299    int err;
1300
1301    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1302        PyErr_BadInternalCall();
1303        return -1;
1304    }
1305
1306    if (PyUnicode_READY(from) == -1)
1307        return -1;
1308    if (PyUnicode_READY(to) == -1)
1309        return -1;
1310
1311    if (from_start < 0) {
1312        PyErr_SetString(PyExc_IndexError, "string index out of range");
1313        return -1;
1314    }
1315    if (to_start < 0) {
1316        PyErr_SetString(PyExc_IndexError, "string index out of range");
1317        return -1;
1318    }
1319    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1320    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1321        PyErr_Format(PyExc_SystemError,
1322                     "Cannot write %zi characters at %zi "
1323                     "in a string of %zi characters",
1324                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1325        return -1;
1326    }
1327
1328    if (how_many == 0)
1329        return 0;
1330
1331    if (unicode_check_modifiable(to))
1332        return -1;
1333
1334    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1335    if (err) {
1336        PyErr_Format(PyExc_SystemError,
1337                     "Cannot copy %s characters "
1338                     "into a string of %s characters",
1339                     unicode_kind_name(from),
1340                     unicode_kind_name(to));
1341        return -1;
1342    }
1343    return how_many;
1344}
1345
1346/* Find the maximum code point and count the number of surrogate pairs so a
1347   correct string length can be computed before converting a string to UCS4.
1348   This function counts single surrogates as a character and not as a pair.
1349
1350   Return 0 on success, or -1 on error. */
1351static int
1352find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1353                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1354{
1355    const wchar_t *iter;
1356    Py_UCS4 ch;
1357
1358    assert(num_surrogates != NULL && maxchar != NULL);
1359    *num_surrogates = 0;
1360    *maxchar = 0;
1361
1362    for (iter = begin; iter < end; ) {
1363#if SIZEOF_WCHAR_T == 2
1364        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1365            && (iter+1) < end
1366            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1367        {
1368            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1369            ++(*num_surrogates);
1370            iter += 2;
1371        }
1372        else
1373#endif
1374        {
1375            ch = *iter;
1376            iter++;
1377        }
1378        if (ch > *maxchar) {
1379            *maxchar = ch;
1380            if (*maxchar > MAX_UNICODE) {
1381                PyErr_Format(PyExc_ValueError,
1382                             "character U+%x is not in range [U+0000; U+10ffff]",
1383                             ch);
1384                return -1;
1385            }
1386        }
1387    }
1388    return 0;
1389}
1390
1391int
1392_PyUnicode_Ready(PyObject *unicode)
1393{
1394    wchar_t *end;
1395    Py_UCS4 maxchar = 0;
1396    Py_ssize_t num_surrogates;
1397#if SIZEOF_WCHAR_T == 2
1398    Py_ssize_t length_wo_surrogates;
1399#endif
1400
1401    /* _PyUnicode_Ready() is only intended for old-style API usage where
1402       strings were created using _PyObject_New() and where no canonical
1403       representation (the str field) has been set yet aka strings
1404       which are not yet ready. */
1405    assert(_PyUnicode_CHECK(unicode));
1406    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1407    assert(_PyUnicode_WSTR(unicode) != NULL);
1408    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1409    assert(_PyUnicode_UTF8(unicode) == NULL);
1410    /* Actually, it should neither be interned nor be anything else: */
1411    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1412
1413    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1414    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1415                                &maxchar, &num_surrogates) == -1)
1416        return -1;
1417
1418    if (maxchar < 256) {
1419        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1420        if (!_PyUnicode_DATA_ANY(unicode)) {
1421            PyErr_NoMemory();
1422            return -1;
1423        }
1424        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1425                                _PyUnicode_WSTR(unicode), end,
1426                                PyUnicode_1BYTE_DATA(unicode));
1427        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1428        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1429        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1430        if (maxchar < 128) {
1431            _PyUnicode_STATE(unicode).ascii = 1;
1432            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1433            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1434        }
1435        else {
1436            _PyUnicode_STATE(unicode).ascii = 0;
1437            _PyUnicode_UTF8(unicode) = NULL;
1438            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1439        }
1440        PyObject_FREE(_PyUnicode_WSTR(unicode));
1441        _PyUnicode_WSTR(unicode) = NULL;
1442        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1443    }
1444    /* In this case we might have to convert down from 4-byte native
1445       wchar_t to 2-byte unicode. */
1446    else if (maxchar < 65536) {
1447        assert(num_surrogates == 0 &&
1448               "FindMaxCharAndNumSurrogatePairs() messed up");
1449
1450#if SIZEOF_WCHAR_T == 2
1451        /* We can share representations and are done. */
1452        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1453        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1456        _PyUnicode_UTF8(unicode) = NULL;
1457        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1458#else
1459        /* sizeof(wchar_t) == 4 */
1460        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1461            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1462        if (!_PyUnicode_DATA_ANY(unicode)) {
1463            PyErr_NoMemory();
1464            return -1;
1465        }
1466        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1467                                _PyUnicode_WSTR(unicode), end,
1468                                PyUnicode_2BYTE_DATA(unicode));
1469        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1470        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1471        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1472        _PyUnicode_UTF8(unicode) = NULL;
1473        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1474        PyObject_FREE(_PyUnicode_WSTR(unicode));
1475        _PyUnicode_WSTR(unicode) = NULL;
1476        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1477#endif
1478    }
1479    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1480    else {
1481#if SIZEOF_WCHAR_T == 2
1482        /* in case the native representation is 2-bytes, we need to allocate a
1483           new normalized 4-byte version. */
1484        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1485        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1486        if (!_PyUnicode_DATA_ANY(unicode)) {
1487            PyErr_NoMemory();
1488            return -1;
1489        }
1490        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1491        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1492        _PyUnicode_UTF8(unicode) = NULL;
1493        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1494        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1495        _PyUnicode_STATE(unicode).ready = 1;
1496        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1497        PyObject_FREE(_PyUnicode_WSTR(unicode));
1498        _PyUnicode_WSTR(unicode) = NULL;
1499        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500#else
1501        assert(num_surrogates == 0);
1502
1503        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1504        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1505        _PyUnicode_UTF8(unicode) = NULL;
1506        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1507        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1508#endif
1509        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1510    }
1511    _PyUnicode_STATE(unicode).ready = 1;
1512    assert(_PyUnicode_CheckConsistency(unicode, 1));
1513    return 0;
1514}
1515
1516static void
1517unicode_dealloc(register PyObject *unicode)
1518{
1519    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1520    case SSTATE_NOT_INTERNED:
1521        break;
1522
1523    case SSTATE_INTERNED_MORTAL:
1524        /* revive dead object temporarily for DelItem */
1525        Py_REFCNT(unicode) = 3;
1526        if (PyDict_DelItem(interned, unicode) != 0)
1527            Py_FatalError(
1528                "deletion of interned string failed");
1529        break;
1530
1531    case SSTATE_INTERNED_IMMORTAL:
1532        Py_FatalError("Immortal interned string died.");
1533
1534    default:
1535        Py_FatalError("Inconsistent interned string state.");
1536    }
1537
1538    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1539        PyObject_DEL(_PyUnicode_WSTR(unicode));
1540    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1541        PyObject_DEL(_PyUnicode_UTF8(unicode));
1542    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1543        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1544
1545    Py_TYPE(unicode)->tp_free(unicode);
1546}
1547
1548#ifdef Py_DEBUG
1549static int
1550unicode_is_singleton(PyObject *unicode)
1551{
1552    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1553    if (unicode == unicode_empty)
1554        return 1;
1555    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1556    {
1557        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1558        if (ch < 256 && unicode_latin1[ch] == unicode)
1559            return 1;
1560    }
1561    return 0;
1562}
1563#endif
1564
1565static int
1566unicode_modifiable(PyObject *unicode)
1567{
1568    assert(_PyUnicode_CHECK(unicode));
1569    if (Py_REFCNT(unicode) != 1)
1570        return 0;
1571    if (_PyUnicode_HASH(unicode) != -1)
1572        return 0;
1573    if (PyUnicode_CHECK_INTERNED(unicode))
1574        return 0;
1575    if (!PyUnicode_CheckExact(unicode))
1576        return 0;
1577#ifdef Py_DEBUG
1578    /* singleton refcount is greater than 1 */
1579    assert(!unicode_is_singleton(unicode));
1580#endif
1581    return 1;
1582}
1583
1584static int
1585unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1586{
1587    PyObject *unicode;
1588    Py_ssize_t old_length;
1589
1590    assert(p_unicode != NULL);
1591    unicode = *p_unicode;
1592
1593    assert(unicode != NULL);
1594    assert(PyUnicode_Check(unicode));
1595    assert(0 <= length);
1596
1597    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1598        old_length = PyUnicode_WSTR_LENGTH(unicode);
1599    else
1600        old_length = PyUnicode_GET_LENGTH(unicode);
1601    if (old_length == length)
1602        return 0;
1603
1604    if (length == 0) {
1605        Py_DECREF(*p_unicode);
1606        *p_unicode = unicode_empty;
1607        Py_INCREF(*p_unicode);
1608        return 0;
1609    }
1610
1611    if (!unicode_modifiable(unicode)) {
1612        PyObject *copy = resize_copy(unicode, length);
1613        if (copy == NULL)
1614            return -1;
1615        Py_DECREF(*p_unicode);
1616        *p_unicode = copy;
1617        return 0;
1618    }
1619
1620    if (PyUnicode_IS_COMPACT(unicode)) {
1621        PyObject *new_unicode = resize_compact(unicode, length);
1622        if (new_unicode == NULL)
1623            return -1;
1624        *p_unicode = new_unicode;
1625        return 0;
1626    }
1627    return resize_inplace(unicode, length);
1628}
1629
1630int
1631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1632{
1633    PyObject *unicode;
1634    if (p_unicode == NULL) {
1635        PyErr_BadInternalCall();
1636        return -1;
1637    }
1638    unicode = *p_unicode;
1639    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1640    {
1641        PyErr_BadInternalCall();
1642        return -1;
1643    }
1644    return unicode_resize(p_unicode, length);
1645}
1646
1647static int
1648unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1649              unsigned int maxchar)
1650{
1651    PyObject *result;
1652    assert(PyUnicode_IS_READY(*p_unicode));
1653    assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
1654    if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1655        return 0;
1656    result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1657                           maxchar);
1658    if (result == NULL)
1659        return -1;
1660    _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
1661    Py_DECREF(*p_unicode);
1662    *p_unicode = result;
1663    return 0;
1664}
1665
1666static int
1667unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1668                Py_UCS4 ch)
1669{
1670    assert(ch <= MAX_UNICODE);
1671    if (unicode_widen(p_unicode, *pos, ch) < 0)
1672        return -1;
1673    PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1674                    PyUnicode_DATA(*p_unicode),
1675                    (*pos)++, ch);
1676    return 0;
1677}
1678
1679/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1680
1681   WARNING: The function doesn't copy the terminating null character and
1682   doesn't check the maximum character (may write a latin1 character in an
1683   ASCII string). */
1684static void
1685unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1686                   const char *str, Py_ssize_t len)
1687{
1688    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1689    void *data = PyUnicode_DATA(unicode);
1690    const char *end = str + len;
1691
1692    switch (kind) {
1693    case PyUnicode_1BYTE_KIND: {
1694        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1695#ifdef Py_DEBUG
1696        if (PyUnicode_IS_ASCII(unicode)) {
1697            Py_UCS4 maxchar = ucs1lib_find_max_char(
1698                (const Py_UCS1*)str,
1699                (const Py_UCS1*)str + len);
1700            assert(maxchar < 128);
1701        }
1702#endif
1703        memcpy((char *) data + index, str, len);
1704        break;
1705    }
1706    case PyUnicode_2BYTE_KIND: {
1707        Py_UCS2 *start = (Py_UCS2 *)data + index;
1708        Py_UCS2 *ucs2 = start;
1709        assert(index <= PyUnicode_GET_LENGTH(unicode));
1710
1711        for (; str < end; ++ucs2, ++str)
1712            *ucs2 = (Py_UCS2)*str;
1713
1714        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1715        break;
1716    }
1717    default: {
1718        Py_UCS4 *start = (Py_UCS4 *)data + index;
1719        Py_UCS4 *ucs4 = start;
1720        assert(kind == PyUnicode_4BYTE_KIND);
1721        assert(index <= PyUnicode_GET_LENGTH(unicode));
1722
1723        for (; str < end; ++ucs4, ++str)
1724            *ucs4 = (Py_UCS4)*str;
1725
1726        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1727    }
1728    }
1729}
1730
1731
1732static PyObject*
1733get_latin1_char(unsigned char ch)
1734{
1735    PyObject *unicode = unicode_latin1[ch];
1736    if (!unicode) {
1737        unicode = PyUnicode_New(1, ch);
1738        if (!unicode)
1739            return NULL;
1740        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1741        assert(_PyUnicode_CheckConsistency(unicode, 1));
1742        unicode_latin1[ch] = unicode;
1743    }
1744    Py_INCREF(unicode);
1745    return unicode;
1746}
1747
1748PyObject *
1749PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1750{
1751    PyObject *unicode;
1752    Py_UCS4 maxchar = 0;
1753    Py_ssize_t num_surrogates;
1754
1755    if (u == NULL)
1756        return (PyObject*)_PyUnicode_New(size);
1757
1758    /* If the Unicode data is known at construction time, we can apply
1759       some optimizations which share commonly used objects. */
1760
1761    /* Optimization for empty strings */
1762    if (size == 0 && unicode_empty != NULL) {
1763        Py_INCREF(unicode_empty);
1764        return unicode_empty;
1765    }
1766
1767    /* Single character Unicode objects in the Latin-1 range are
1768       shared when using this constructor */
1769    if (size == 1 && *u < 256)
1770        return get_latin1_char((unsigned char)*u);
1771
1772    /* If not empty and not single character, copy the Unicode data
1773       into the new object */
1774    if (find_maxchar_surrogates(u, u + size,
1775                                &maxchar, &num_surrogates) == -1)
1776        return NULL;
1777
1778    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1779    if (!unicode)
1780        return NULL;
1781
1782    switch (PyUnicode_KIND(unicode)) {
1783    case PyUnicode_1BYTE_KIND:
1784        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1785                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1786        break;
1787    case PyUnicode_2BYTE_KIND:
1788#if Py_UNICODE_SIZE == 2
1789        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1790#else
1791        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1792                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1793#endif
1794        break;
1795    case PyUnicode_4BYTE_KIND:
1796#if SIZEOF_WCHAR_T == 2
1797        /* This is the only case which has to process surrogates, thus
1798           a simple copy loop is not enough and we need a function. */
1799        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1800#else
1801        assert(num_surrogates == 0);
1802        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1803#endif
1804        break;
1805    default:
1806        assert(0 && "Impossible state");
1807    }
1808
1809    return unicode_result(unicode);
1810}
1811
1812PyObject *
1813PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1814{
1815    if (size < 0) {
1816        PyErr_SetString(PyExc_SystemError,
1817                        "Negative size passed to PyUnicode_FromStringAndSize");
1818        return NULL;
1819    }
1820    if (u != NULL)
1821        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1822    else
1823        return (PyObject *)_PyUnicode_New(size);
1824}
1825
1826PyObject *
1827PyUnicode_FromString(const char *u)
1828{
1829    size_t size = strlen(u);
1830    if (size > PY_SSIZE_T_MAX) {
1831        PyErr_SetString(PyExc_OverflowError, "input too long");
1832        return NULL;
1833    }
1834    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1835}
1836
1837PyObject *
1838_PyUnicode_FromId(_Py_Identifier *id)
1839{
1840    if (!id->object) {
1841        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1842                                                  strlen(id->string),
1843                                                  NULL, NULL);
1844        if (!id->object)
1845            return NULL;
1846        PyUnicode_InternInPlace(&id->object);
1847        assert(!id->next);
1848        id->next = static_strings;
1849        static_strings = id;
1850    }
1851    return id->object;
1852}
1853
1854void
1855_PyUnicode_ClearStaticStrings()
1856{
1857    _Py_Identifier *i;
1858    for (i = static_strings; i; i = i->next) {
1859        Py_DECREF(i->object);
1860        i->object = NULL;
1861        i->next = NULL;
1862    }
1863}
1864
1865/* Internal function, doesn't check maximum character */
1866
1867PyObject*
1868_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1869{
1870    const unsigned char *s = (const unsigned char *)buffer;
1871    PyObject *unicode;
1872    if (size == 1) {
1873#ifdef Py_DEBUG
1874        assert(s[0] < 128);
1875#endif
1876        return get_latin1_char(s[0]);
1877    }
1878    unicode = PyUnicode_New(size, 127);
1879    if (!unicode)
1880        return NULL;
1881    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1882    assert(_PyUnicode_CheckConsistency(unicode, 1));
1883    return unicode;
1884}
1885
1886static Py_UCS4
1887kind_maxchar_limit(unsigned int kind)
1888{
1889    switch (kind) {
1890    case PyUnicode_1BYTE_KIND:
1891        return 0x80;
1892    case PyUnicode_2BYTE_KIND:
1893        return 0x100;
1894    case PyUnicode_4BYTE_KIND:
1895        return 0x10000;
1896    default:
1897        assert(0 && "invalid kind");
1898        return MAX_UNICODE;
1899    }
1900}
1901
1902Py_LOCAL_INLINE(Py_UCS4)
1903align_maxchar(Py_UCS4 maxchar)
1904{
1905    if (maxchar <= 127)
1906        return 127;
1907    else if (maxchar <= 255)
1908        return 255;
1909    else if (maxchar <= 65535)
1910        return 65535;
1911    else
1912        return MAX_UNICODE;
1913}
1914
1915static PyObject*
1916_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1917{
1918    PyObject *res;
1919    unsigned char max_char;
1920
1921    if (size == 0) {
1922        Py_INCREF(unicode_empty);
1923        return unicode_empty;
1924    }
1925    assert(size > 0);
1926    if (size == 1)
1927        return get_latin1_char(u[0]);
1928
1929    max_char = ucs1lib_find_max_char(u, u + size);
1930    res = PyUnicode_New(size, max_char);
1931    if (!res)
1932        return NULL;
1933    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1934    assert(_PyUnicode_CheckConsistency(res, 1));
1935    return res;
1936}
1937
1938static PyObject*
1939_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1940{
1941    PyObject *res;
1942    Py_UCS2 max_char;
1943
1944    if (size == 0) {
1945        Py_INCREF(unicode_empty);
1946        return unicode_empty;
1947    }
1948    assert(size > 0);
1949    if (size == 1) {
1950        Py_UCS4 ch = u[0];
1951        if (ch < 256)
1952            return get_latin1_char((unsigned char)ch);
1953
1954        res = PyUnicode_New(1, ch);
1955        if (res == NULL)
1956            return NULL;
1957        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1958        assert(_PyUnicode_CheckConsistency(res, 1));
1959        return res;
1960    }
1961
1962    max_char = ucs2lib_find_max_char(u, u + size);
1963    res = PyUnicode_New(size, max_char);
1964    if (!res)
1965        return NULL;
1966    if (max_char >= 256)
1967        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1968    else {
1969        _PyUnicode_CONVERT_BYTES(
1970            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1971    }
1972    assert(_PyUnicode_CheckConsistency(res, 1));
1973    return res;
1974}
1975
1976static PyObject*
1977_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1978{
1979    PyObject *res;
1980    Py_UCS4 max_char;
1981
1982    if (size == 0) {
1983        Py_INCREF(unicode_empty);
1984        return unicode_empty;
1985    }
1986    assert(size > 0);
1987    if (size == 1) {
1988        Py_UCS4 ch = u[0];
1989        if (ch < 256)
1990            return get_latin1_char((unsigned char)ch);
1991
1992        res = PyUnicode_New(1, ch);
1993        if (res == NULL)
1994            return NULL;
1995        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1996        assert(_PyUnicode_CheckConsistency(res, 1));
1997        return res;
1998    }
1999
2000    max_char = ucs4lib_find_max_char(u, u + size);
2001    res = PyUnicode_New(size, max_char);
2002    if (!res)
2003        return NULL;
2004    if (max_char < 256)
2005        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2006                                 PyUnicode_1BYTE_DATA(res));
2007    else if (max_char < 0x10000)
2008        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2009                                 PyUnicode_2BYTE_DATA(res));
2010    else
2011        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2012    assert(_PyUnicode_CheckConsistency(res, 1));
2013    return res;
2014}
2015
2016PyObject*
2017PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2018{
2019    if (size < 0) {
2020        PyErr_SetString(PyExc_ValueError, "size must be positive");
2021        return NULL;
2022    }
2023    switch (kind) {
2024    case PyUnicode_1BYTE_KIND:
2025        return _PyUnicode_FromUCS1(buffer, size);
2026    case PyUnicode_2BYTE_KIND:
2027        return _PyUnicode_FromUCS2(buffer, size);
2028    case PyUnicode_4BYTE_KIND:
2029        return _PyUnicode_FromUCS4(buffer, size);
2030    default:
2031        PyErr_SetString(PyExc_SystemError, "invalid kind");
2032        return NULL;
2033    }
2034}
2035
2036Py_UCS4
2037_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2038{
2039    enum PyUnicode_Kind kind;
2040    void *startptr, *endptr;
2041
2042    assert(PyUnicode_IS_READY(unicode));
2043    assert(0 <= start);
2044    assert(end <= PyUnicode_GET_LENGTH(unicode));
2045    assert(start <= end);
2046
2047    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2048        return PyUnicode_MAX_CHAR_VALUE(unicode);
2049
2050    if (start == end)
2051        return 127;
2052
2053    if (PyUnicode_IS_ASCII(unicode))
2054        return 127;
2055
2056    kind = PyUnicode_KIND(unicode);
2057    startptr = PyUnicode_DATA(unicode);
2058    endptr = (char *)startptr + end * kind;
2059    startptr = (char *)startptr + start * kind;
2060    switch(kind) {
2061    case PyUnicode_1BYTE_KIND:
2062        return ucs1lib_find_max_char(startptr, endptr);
2063    case PyUnicode_2BYTE_KIND:
2064        return ucs2lib_find_max_char(startptr, endptr);
2065    case PyUnicode_4BYTE_KIND:
2066        return ucs4lib_find_max_char(startptr, endptr);
2067    default:
2068        assert(0);
2069        return 0;
2070    }
2071}
2072
2073/* Ensure that a string uses the most efficient storage, if it is not the
2074   case: create a new string with of the right kind. Write NULL into *p_unicode
2075   on error. */
2076static void
2077unicode_adjust_maxchar(PyObject **p_unicode)
2078{
2079    PyObject *unicode, *copy;
2080    Py_UCS4 max_char;
2081    Py_ssize_t len;
2082    unsigned int kind;
2083
2084    assert(p_unicode != NULL);
2085    unicode = *p_unicode;
2086    assert(PyUnicode_IS_READY(unicode));
2087    if (PyUnicode_IS_ASCII(unicode))
2088        return;
2089
2090    len = PyUnicode_GET_LENGTH(unicode);
2091    kind = PyUnicode_KIND(unicode);
2092    if (kind == PyUnicode_1BYTE_KIND) {
2093        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2094        max_char = ucs1lib_find_max_char(u, u + len);
2095        if (max_char >= 128)
2096            return;
2097    }
2098    else if (kind == PyUnicode_2BYTE_KIND) {
2099        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2100        max_char = ucs2lib_find_max_char(u, u + len);
2101        if (max_char >= 256)
2102            return;
2103    }
2104    else {
2105        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2106        assert(kind == PyUnicode_4BYTE_KIND);
2107        max_char = ucs4lib_find_max_char(u, u + len);
2108        if (max_char >= 0x10000)
2109            return;
2110    }
2111    copy = PyUnicode_New(len, max_char);
2112    if (copy != NULL)
2113        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2114    Py_DECREF(unicode);
2115    *p_unicode = copy;
2116}
2117
2118PyObject*
2119_PyUnicode_Copy(PyObject *unicode)
2120{
2121    Py_ssize_t length;
2122    PyObject *copy;
2123
2124    if (!PyUnicode_Check(unicode)) {
2125        PyErr_BadInternalCall();
2126        return NULL;
2127    }
2128    if (PyUnicode_READY(unicode) == -1)
2129        return NULL;
2130
2131    length = PyUnicode_GET_LENGTH(unicode);
2132    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2133    if (!copy)
2134        return NULL;
2135    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2136
2137    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2138              length * PyUnicode_KIND(unicode));
2139    assert(_PyUnicode_CheckConsistency(copy, 1));
2140    return copy;
2141}
2142
2143
2144/* Widen Unicode objects to larger buffers. Don't write terminating null
2145   character. Return NULL on error. */
2146
2147void*
2148_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2149{
2150    Py_ssize_t len;
2151    void *result;
2152    unsigned int skind;
2153
2154    if (PyUnicode_READY(s) == -1)
2155        return NULL;
2156
2157    len = PyUnicode_GET_LENGTH(s);
2158    skind = PyUnicode_KIND(s);
2159    if (skind >= kind) {
2160        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2161        return NULL;
2162    }
2163    switch (kind) {
2164    case PyUnicode_2BYTE_KIND:
2165        result = PyMem_Malloc(len * sizeof(Py_UCS2));
2166        if (!result)
2167            return PyErr_NoMemory();
2168        assert(skind == PyUnicode_1BYTE_KIND);
2169        _PyUnicode_CONVERT_BYTES(
2170            Py_UCS1, Py_UCS2,
2171            PyUnicode_1BYTE_DATA(s),
2172            PyUnicode_1BYTE_DATA(s) + len,
2173            result);
2174        return result;
2175    case PyUnicode_4BYTE_KIND:
2176        result = PyMem_Malloc(len * sizeof(Py_UCS4));
2177        if (!result)
2178            return PyErr_NoMemory();
2179        if (skind == PyUnicode_2BYTE_KIND) {
2180            _PyUnicode_CONVERT_BYTES(
2181                Py_UCS2, Py_UCS4,
2182                PyUnicode_2BYTE_DATA(s),
2183                PyUnicode_2BYTE_DATA(s) + len,
2184                result);
2185        }
2186        else {
2187            assert(skind == PyUnicode_1BYTE_KIND);
2188            _PyUnicode_CONVERT_BYTES(
2189                Py_UCS1, Py_UCS4,
2190                PyUnicode_1BYTE_DATA(s),
2191                PyUnicode_1BYTE_DATA(s) + len,
2192                result);
2193        }
2194        return result;
2195    default:
2196        break;
2197    }
2198    PyErr_SetString(PyExc_SystemError, "invalid kind");
2199    return NULL;
2200}
2201
2202static Py_UCS4*
2203as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2204        int copy_null)
2205{
2206    int kind;
2207    void *data;
2208    Py_ssize_t len, targetlen;
2209    if (PyUnicode_READY(string) == -1)
2210        return NULL;
2211    kind = PyUnicode_KIND(string);
2212    data = PyUnicode_DATA(string);
2213    len = PyUnicode_GET_LENGTH(string);
2214    targetlen = len;
2215    if (copy_null)
2216        targetlen++;
2217    if (!target) {
2218        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2219            PyErr_NoMemory();
2220            return NULL;
2221        }
2222        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2223        if (!target) {
2224            PyErr_NoMemory();
2225            return NULL;
2226        }
2227    }
2228    else {
2229        if (targetsize < targetlen) {
2230            PyErr_Format(PyExc_SystemError,
2231                         "string is longer than the buffer");
2232            if (copy_null && 0 < targetsize)
2233                target[0] = 0;
2234            return NULL;
2235        }
2236    }
2237    if (kind == PyUnicode_1BYTE_KIND) {
2238        Py_UCS1 *start = (Py_UCS1 *) data;
2239        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2240    }
2241    else if (kind == PyUnicode_2BYTE_KIND) {
2242        Py_UCS2 *start = (Py_UCS2 *) data;
2243        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2244    }
2245    else {
2246        assert(kind == PyUnicode_4BYTE_KIND);
2247        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2248    }
2249    if (copy_null)
2250        target[len] = 0;
2251    return target;
2252}
2253
2254Py_UCS4*
2255PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2256                 int copy_null)
2257{
2258    if (target == NULL || targetsize < 0) {
2259        PyErr_BadInternalCall();
2260        return NULL;
2261    }
2262    return as_ucs4(string, target, targetsize, copy_null);
2263}
2264
2265Py_UCS4*
2266PyUnicode_AsUCS4Copy(PyObject *string)
2267{
2268    return as_ucs4(string, NULL, 0, 1);
2269}
2270
2271#ifdef HAVE_WCHAR_H
2272
2273PyObject *
2274PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2275{
2276    if (w == NULL) {
2277        if (size == 0) {
2278            Py_INCREF(unicode_empty);
2279            return unicode_empty;
2280        }
2281        PyErr_BadInternalCall();
2282        return NULL;
2283    }
2284
2285    if (size == -1) {
2286        size = wcslen(w);
2287    }
2288
2289    return PyUnicode_FromUnicode(w, size);
2290}
2291
2292#endif /* HAVE_WCHAR_H */
2293
2294static void
2295makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2296        char c)
2297{
2298    *fmt++ = '%';
2299    if (longflag)
2300        *fmt++ = 'l';
2301    else if (longlongflag) {
2302        /* longlongflag should only ever be nonzero on machines with
2303           HAVE_LONG_LONG defined */
2304#ifdef HAVE_LONG_LONG
2305        char *f = PY_FORMAT_LONG_LONG;
2306        while (*f)
2307            *fmt++ = *f++;
2308#else
2309        /* we shouldn't ever get here */
2310        assert(0);
2311        *fmt++ = 'l';
2312#endif
2313    }
2314    else if (size_tflag) {
2315        char *f = PY_FORMAT_SIZE_T;
2316        while (*f)
2317            *fmt++ = *f++;
2318    }
2319    *fmt++ = c;
2320    *fmt = '\0';
2321}
2322
2323/* maximum number of characters required for output of %lld or %p.
2324   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2325   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2326#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2327
2328static const char*
2329unicode_fromformat_arg(_PyUnicodeWriter *writer,
2330                       const char *f, va_list *vargs)
2331{
2332    const char *p;
2333    Py_ssize_t len;
2334    int zeropad;
2335    int width;
2336    int precision;
2337    int longflag;
2338    int longlongflag;
2339    int size_tflag;
2340    int fill;
2341
2342    p = f;
2343    f++;
2344    zeropad = 0;
2345    if (*f == '0') {
2346        zeropad = 1;
2347        f++;
2348    }
2349
2350    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2351    width = 0;
2352    while (Py_ISDIGIT((unsigned)*f)) {
2353        if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2354            PyErr_SetString(PyExc_ValueError,
2355                            "width too big");
2356            return NULL;
2357        }
2358        width = (width*10) + (*f - '0');
2359        f++;
2360    }
2361    precision = 0;
2362    if (*f == '.') {
2363        f++;
2364        while (Py_ISDIGIT((unsigned)*f)) {
2365            if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2366                PyErr_SetString(PyExc_ValueError,
2367                                "precision too big");
2368                return NULL;
2369            }
2370            precision = (precision*10) + (*f - '0');
2371            f++;
2372        }
2373        if (*f == '%') {
2374            /* "%.3%s" => f points to "3" */
2375            f--;
2376        }
2377    }
2378    if (*f == '\0') {
2379        /* bogus format "%.123" => go backward, f points to "3" */
2380        f--;
2381    }
2382
2383    /* Handle %ld, %lu, %lld and %llu. */
2384    longflag = 0;
2385    longlongflag = 0;
2386    size_tflag = 0;
2387    if (*f == 'l') {
2388        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2389            longflag = 1;
2390            ++f;
2391        }
2392#ifdef HAVE_LONG_LONG
2393        else if (f[1] == 'l' &&
2394                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2395            longlongflag = 1;
2396            f += 2;
2397        }
2398#endif
2399    }
2400    /* handle the size_t flag. */
2401    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2402        size_tflag = 1;
2403        ++f;
2404    }
2405
2406    if (f[1] == '\0')
2407        writer->overallocate = 0;
2408
2409    switch (*f) {
2410    case 'c':
2411    {
2412        int ordinal = va_arg(*vargs, int);
2413        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2414            PyErr_SetString(PyExc_ValueError,
2415                            "character argument not in range(0x110000)");
2416            return NULL;
2417        }
2418        if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2419            return NULL;
2420        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2421        writer->pos++;
2422        break;
2423    }
2424
2425    case 'i':
2426    case 'd':
2427    case 'u':
2428    case 'x':
2429    {
2430        /* used by sprintf */
2431        char fmt[10]; /* should be enough for "%0lld\0" */
2432        char buffer[MAX_LONG_LONG_CHARS];
2433
2434        if (*f == 'u') {
2435            makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2436
2437            if (longflag)
2438                len = sprintf(buffer, fmt,
2439                        va_arg(*vargs, unsigned long));
2440#ifdef HAVE_LONG_LONG
2441            else if (longlongflag)
2442                len = sprintf(buffer, fmt,
2443                        va_arg(*vargs, unsigned PY_LONG_LONG));
2444#endif
2445            else if (size_tflag)
2446                len = sprintf(buffer, fmt,
2447                        va_arg(*vargs, size_t));
2448            else
2449                len = sprintf(buffer, fmt,
2450                        va_arg(*vargs, unsigned int));
2451        }
2452        else if (*f == 'x') {
2453            makefmt(fmt, 0, 0, 0, 'x');
2454            len = sprintf(buffer, fmt, va_arg(*vargs, int));
2455        }
2456        else {
2457            makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2458
2459            if (longflag)
2460                len = sprintf(buffer, fmt,
2461                        va_arg(*vargs, long));
2462#ifdef HAVE_LONG_LONG
2463            else if (longlongflag)
2464                len = sprintf(buffer, fmt,
2465                        va_arg(*vargs, PY_LONG_LONG));
2466#endif
2467            else if (size_tflag)
2468                len = sprintf(buffer, fmt,
2469                        va_arg(*vargs, Py_ssize_t));
2470            else
2471                len = sprintf(buffer, fmt,
2472                        va_arg(*vargs, int));
2473        }
2474        assert(len >= 0);
2475
2476        if (precision < len)
2477            precision = len;
2478        if (width > precision) {
2479            Py_UCS4 fillchar;
2480            fill = width - precision;
2481            fillchar = zeropad?'0':' ';
2482            if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2483                return NULL;
2484            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2485                return NULL;
2486            writer->pos += fill;
2487        }
2488        if (precision > len) {
2489            fill = precision - len;
2490            if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2491                return NULL;
2492            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2493                return NULL;
2494            writer->pos += fill;
2495        }
2496        if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
2497            return NULL;
2498        break;
2499    }
2500
2501    case 'p':
2502    {
2503        char number[MAX_LONG_LONG_CHARS];
2504
2505        len = sprintf(number, "%p", va_arg(*vargs, void*));
2506        assert(len >= 0);
2507
2508        /* %p is ill-defined:  ensure leading 0x. */
2509        if (number[1] == 'X')
2510            number[1] = 'x';
2511        else if (number[1] != 'x') {
2512            memmove(number + 2, number,
2513                    strlen(number) + 1);
2514            number[0] = '0';
2515            number[1] = 'x';
2516            len += 2;
2517        }
2518
2519        if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2520            return NULL;
2521        break;
2522    }
2523
2524    case 's':
2525    {
2526        /* UTF-8 */
2527        const char *s = va_arg(*vargs, const char*);
2528        PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2529        if (!str)
2530            return NULL;
2531        if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2532            Py_DECREF(str);
2533            return NULL;
2534        }
2535        Py_DECREF(str);
2536        break;
2537    }
2538
2539    case 'U':
2540    {
2541        PyObject *obj = va_arg(*vargs, PyObject *);
2542        assert(obj && _PyUnicode_CHECK(obj));
2543
2544        if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2545            return NULL;
2546        break;
2547    }
2548
2549    case 'V':
2550    {
2551        PyObject *obj = va_arg(*vargs, PyObject *);
2552        const char *str = va_arg(*vargs, const char *);
2553        PyObject *str_obj;
2554        assert(obj || str);
2555        if (obj) {
2556            assert(_PyUnicode_CHECK(obj));
2557            if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2558                return NULL;
2559        }
2560        else {
2561            str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2562            if (!str_obj)
2563                return NULL;
2564            if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2565                Py_DECREF(str_obj);
2566                return NULL;
2567            }
2568            Py_DECREF(str_obj);
2569        }
2570        break;
2571    }
2572
2573    case 'S':
2574    {
2575        PyObject *obj = va_arg(*vargs, PyObject *);
2576        PyObject *str;
2577        assert(obj);
2578        str = PyObject_Str(obj);
2579        if (!str)
2580            return NULL;
2581        if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2582            Py_DECREF(str);
2583            return NULL;
2584        }
2585        Py_DECREF(str);
2586        break;
2587    }
2588
2589    case 'R':
2590    {
2591        PyObject *obj = va_arg(*vargs, PyObject *);
2592        PyObject *repr;
2593        assert(obj);
2594        repr = PyObject_Repr(obj);
2595        if (!repr)
2596            return NULL;
2597        if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2598            Py_DECREF(repr);
2599            return NULL;
2600        }
2601        Py_DECREF(repr);
2602        break;
2603    }
2604
2605    case 'A':
2606    {
2607        PyObject *obj = va_arg(*vargs, PyObject *);
2608        PyObject *ascii;
2609        assert(obj);
2610        ascii = PyObject_ASCII(obj);
2611        if (!ascii)
2612            return NULL;
2613        if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2614            Py_DECREF(ascii);
2615            return NULL;
2616        }
2617        Py_DECREF(ascii);
2618        break;
2619    }
2620
2621    case '%':
2622        if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2623            return NULL;
2624        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2625        writer->pos++;
2626        break;
2627
2628    default:
2629        /* if we stumble upon an unknown formatting code, copy the rest
2630           of the format string to the output string. (we cannot just
2631           skip the code, since there's no way to know what's in the
2632           argument list) */
2633        len = strlen(p);
2634        if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2635            return NULL;
2636        f = p+len;
2637        return f;
2638    }
2639
2640    f++;
2641    return f;
2642}
2643
2644PyObject *
2645PyUnicode_FromFormatV(const char *format, va_list vargs)
2646{
2647    va_list vargs2;
2648    const char *f;
2649    _PyUnicodeWriter writer;
2650
2651    _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2652
2653    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2654       Copy it to be able to pass a reference to a subfunction. */
2655    Py_VA_COPY(vargs2, vargs);
2656
2657    for (f = format; *f; ) {
2658        if (*f == '%') {
2659            f = unicode_fromformat_arg(&writer, f, &vargs2);
2660            if (f == NULL)
2661                goto fail;
2662        }
2663        else {
2664            const char *p;
2665            Py_ssize_t len;
2666
2667            p = f;
2668            do
2669            {
2670                if ((unsigned char)*p > 127) {
2671                    PyErr_Format(PyExc_ValueError,
2672                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2673                        "string, got a non-ASCII byte: 0x%02x",
2674                        (unsigned char)*p);
2675                    return NULL;
2676                }
2677                p++;
2678            }
2679            while (*p != '\0' && *p != '%');
2680            len = p - f;
2681
2682            if (*p == '\0')
2683                writer.overallocate = 0;
2684            if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2685                goto fail;
2686            unicode_write_cstr(writer.buffer, writer.pos, f, len);
2687            writer.pos += len;
2688
2689            f = p;
2690        }
2691    }
2692    return _PyUnicodeWriter_Finish(&writer);
2693
2694  fail:
2695    _PyUnicodeWriter_Dealloc(&writer);
2696    return NULL;
2697}
2698
2699PyObject *
2700PyUnicode_FromFormat(const char *format, ...)
2701{
2702    PyObject* ret;
2703    va_list vargs;
2704
2705#ifdef HAVE_STDARG_PROTOTYPES
2706    va_start(vargs, format);
2707#else
2708    va_start(vargs);
2709#endif
2710    ret = PyUnicode_FromFormatV(format, vargs);
2711    va_end(vargs);
2712    return ret;
2713}
2714
2715#ifdef HAVE_WCHAR_H
2716
2717/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2718   convert a Unicode object to a wide character string.
2719
2720   - If w is NULL: return the number of wide characters (including the null
2721     character) required to convert the unicode object. Ignore size argument.
2722
2723   - Otherwise: return the number of wide characters (excluding the null
2724     character) written into w. Write at most size wide characters (including
2725     the null character). */
2726static Py_ssize_t
2727unicode_aswidechar(PyObject *unicode,
2728                   wchar_t *w,
2729                   Py_ssize_t size)
2730{
2731    Py_ssize_t res;
2732    const wchar_t *wstr;
2733
2734    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2735    if (wstr == NULL)
2736        return -1;
2737
2738    if (w != NULL) {
2739        if (size > res)
2740            size = res + 1;
2741        else
2742            res = size;
2743        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2744        return res;
2745    }
2746    else
2747        return res + 1;
2748}
2749
2750Py_ssize_t
2751PyUnicode_AsWideChar(PyObject *unicode,
2752                     wchar_t *w,
2753                     Py_ssize_t size)
2754{
2755    if (unicode == NULL) {
2756        PyErr_BadInternalCall();
2757        return -1;
2758    }
2759    return unicode_aswidechar(unicode, w, size);
2760}
2761
2762wchar_t*
2763PyUnicode_AsWideCharString(PyObject *unicode,
2764                           Py_ssize_t *size)
2765{
2766    wchar_t* buffer;
2767    Py_ssize_t buflen;
2768
2769    if (unicode == NULL) {
2770        PyErr_BadInternalCall();
2771        return NULL;
2772    }
2773
2774    buflen = unicode_aswidechar(unicode, NULL, 0);
2775    if (buflen == -1)
2776        return NULL;
2777    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2778        PyErr_NoMemory();
2779        return NULL;
2780    }
2781
2782    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2783    if (buffer == NULL) {
2784        PyErr_NoMemory();
2785        return NULL;
2786    }
2787    buflen = unicode_aswidechar(unicode, buffer, buflen);
2788    if (buflen == -1) {
2789        PyMem_FREE(buffer);
2790        return NULL;
2791    }
2792    if (size != NULL)
2793        *size = buflen;
2794    return buffer;
2795}
2796
2797#endif /* HAVE_WCHAR_H */
2798
2799PyObject *
2800PyUnicode_FromOrdinal(int ordinal)
2801{
2802    PyObject *v;
2803    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2804        PyErr_SetString(PyExc_ValueError,
2805                        "chr() arg not in range(0x110000)");
2806        return NULL;
2807    }
2808
2809    if (ordinal < 256)
2810        return get_latin1_char(ordinal);
2811
2812    v = PyUnicode_New(1, ordinal);
2813    if (v == NULL)
2814        return NULL;
2815    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2816    assert(_PyUnicode_CheckConsistency(v, 1));
2817    return v;
2818}
2819
2820PyObject *
2821PyUnicode_FromObject(register PyObject *obj)
2822{
2823    /* XXX Perhaps we should make this API an alias of
2824       PyObject_Str() instead ?! */
2825    if (PyUnicode_CheckExact(obj)) {
2826        if (PyUnicode_READY(obj) == -1)
2827            return NULL;
2828        Py_INCREF(obj);
2829        return obj;
2830    }
2831    if (PyUnicode_Check(obj)) {
2832        /* For a Unicode subtype that's not a Unicode object,
2833           return a true Unicode object with the same data. */
2834        return _PyUnicode_Copy(obj);
2835    }
2836    PyErr_Format(PyExc_TypeError,
2837                 "Can't convert '%.100s' object to str implicitly",
2838                 Py_TYPE(obj)->tp_name);
2839    return NULL;
2840}
2841
2842PyObject *
2843PyUnicode_FromEncodedObject(register PyObject *obj,
2844                            const char *encoding,
2845                            const char *errors)
2846{
2847    Py_buffer buffer;
2848    PyObject *v;
2849
2850    if (obj == NULL) {
2851        PyErr_BadInternalCall();
2852        return NULL;
2853    }
2854
2855    /* Decoding bytes objects is the most common case and should be fast */
2856    if (PyBytes_Check(obj)) {
2857        if (PyBytes_GET_SIZE(obj) == 0) {
2858            Py_INCREF(unicode_empty);
2859            v = unicode_empty;
2860        }
2861        else {
2862            v = PyUnicode_Decode(
2863                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2864                    encoding, errors);
2865        }
2866        return v;
2867    }
2868
2869    if (PyUnicode_Check(obj)) {
2870        PyErr_SetString(PyExc_TypeError,
2871                        "decoding str is not supported");
2872        return NULL;
2873    }
2874
2875    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2876    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2877        PyErr_Format(PyExc_TypeError,
2878                     "coercing to str: need bytes, bytearray "
2879                     "or buffer-like object, %.80s found",
2880                     Py_TYPE(obj)->tp_name);
2881        return NULL;
2882    }
2883
2884    if (buffer.len == 0) {
2885        Py_INCREF(unicode_empty);
2886        v = unicode_empty;
2887    }
2888    else
2889        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2890
2891    PyBuffer_Release(&buffer);
2892    return v;
2893}
2894
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower` receives the NUL-terminated result and must have room for
   `lower_len` bytes.  Return 0 on error (the result, including its
   terminating NUL, does not fit in lower_len bytes), 1 on success.
   On error the content of `lower` is unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* Honour lower_len here too instead of copying blindly. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    /* lower_len - 1 below would wrap around for an empty buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last slot is reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2931
2932PyObject *
2933PyUnicode_Decode(const char *s,
2934                 Py_ssize_t size,
2935                 const char *encoding,
2936                 const char *errors)
2937{
2938    PyObject *buffer = NULL, *unicode;
2939    Py_buffer info;
2940    char lower[11];  /* Enough for any encoding shortcut */
2941
2942    /* Shortcuts for common default encodings */
2943    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2944        if ((strcmp(lower, "utf-8") == 0) ||
2945            (strcmp(lower, "utf8") == 0))
2946            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2947        else if ((strcmp(lower, "latin-1") == 0) ||
2948                 (strcmp(lower, "latin1") == 0) ||
2949                 (strcmp(lower, "iso-8859-1") == 0))
2950            return PyUnicode_DecodeLatin1(s, size, errors);
2951#ifdef HAVE_MBCS
2952        else if (strcmp(lower, "mbcs") == 0)
2953            return PyUnicode_DecodeMBCS(s, size, errors);
2954#endif
2955        else if (strcmp(lower, "ascii") == 0)
2956            return PyUnicode_DecodeASCII(s, size, errors);
2957        else if (strcmp(lower, "utf-16") == 0)
2958            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2959        else if (strcmp(lower, "utf-32") == 0)
2960            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2961    }
2962
2963    /* Decode via the codec registry */
2964    buffer = NULL;
2965    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2966        goto onError;
2967    buffer = PyMemoryView_FromBuffer(&info);
2968    if (buffer == NULL)
2969        goto onError;
2970    unicode = PyCodec_Decode(buffer, encoding, errors);
2971    if (unicode == NULL)
2972        goto onError;
2973    if (!PyUnicode_Check(unicode)) {
2974        PyErr_Format(PyExc_TypeError,
2975                     "decoder did not return a str object (type=%.400s)",
2976                     Py_TYPE(unicode)->tp_name);
2977        Py_DECREF(unicode);
2978        goto onError;
2979    }
2980    Py_DECREF(buffer);
2981    return unicode_result(unicode);
2982
2983  onError:
2984    Py_XDECREF(buffer);
2985    return NULL;
2986}
2987
2988PyObject *
2989PyUnicode_AsDecodedObject(PyObject *unicode,
2990                          const char *encoding,
2991                          const char *errors)
2992{
2993    PyObject *v;
2994
2995    if (!PyUnicode_Check(unicode)) {
2996        PyErr_BadArgument();
2997        goto onError;
2998    }
2999
3000    if (encoding == NULL)
3001        encoding = PyUnicode_GetDefaultEncoding();
3002
3003    /* Decode via the codec registry */
3004    v = PyCodec_Decode(unicode, encoding, errors);
3005    if (v == NULL)
3006        goto onError;
3007    return unicode_result(v);
3008
3009  onError:
3010    return NULL;
3011}
3012
3013PyObject *
3014PyUnicode_AsDecodedUnicode(PyObject *unicode,
3015                           const char *encoding,
3016                           const char *errors)
3017{
3018    PyObject *v;
3019
3020    if (!PyUnicode_Check(unicode)) {
3021        PyErr_BadArgument();
3022        goto onError;
3023    }
3024
3025    if (encoding == NULL)
3026        encoding = PyUnicode_GetDefaultEncoding();
3027
3028    /* Decode via the codec registry */
3029    v = PyCodec_Decode(unicode, encoding, errors);
3030    if (v == NULL)
3031        goto onError;
3032    if (!PyUnicode_Check(v)) {
3033        PyErr_Format(PyExc_TypeError,
3034                     "decoder did not return a str object (type=%.400s)",
3035                     Py_TYPE(v)->tp_name);
3036        Py_DECREF(v);
3037        goto onError;
3038    }
3039    return unicode_result(v);
3040
3041  onError:
3042    return NULL;
3043}
3044
3045PyObject *
3046PyUnicode_Encode(const Py_UNICODE *s,
3047                 Py_ssize_t size,
3048                 const char *encoding,
3049                 const char *errors)
3050{
3051    PyObject *v, *unicode;
3052
3053    unicode = PyUnicode_FromUnicode(s, size);
3054    if (unicode == NULL)
3055        return NULL;
3056    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3057    Py_DECREF(unicode);
3058    return v;
3059}
3060
3061PyObject *
3062PyUnicode_AsEncodedObject(PyObject *unicode,
3063                          const char *encoding,
3064                          const char *errors)
3065{
3066    PyObject *v;
3067
3068    if (!PyUnicode_Check(unicode)) {
3069        PyErr_BadArgument();
3070        goto onError;
3071    }
3072
3073    if (encoding == NULL)
3074        encoding = PyUnicode_GetDefaultEncoding();
3075
3076    /* Encode via the codec registry */
3077    v = PyCodec_Encode(unicode, encoding, errors);
3078    if (v == NULL)
3079        goto onError;
3080    return v;
3081
3082  onError:
3083    return NULL;
3084}
3085
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert.

   The string is converted one character at a time (one surrogate pair
   at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first character for which wcstombs() reports
   (size_t)-1, or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* A single character (or surrogate pair) plus its terminating NUL. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;
    const wchar_t *current;
    size_t converted;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        current = cursor;
#if SIZEOF_WCHAR_T == 2
        unit[0] = cursor[0];
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together. */
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor;
        cursor += 1;
#endif
        converted = wcstombs(scratch, unit, sizeof(scratch));
        if (converted == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3132
3133static int
3134locale_error_handler(const char *errors, int *surrogateescape)
3135{
3136    if (errors == NULL) {
3137        *surrogateescape = 0;
3138        return 0;
3139    }
3140
3141    if (strcmp(errors, "strict") == 0) {
3142        *surrogateescape = 0;
3143        return 0;
3144    }
3145    if (strcmp(errors, "surrogateescape") == 0) {
3146        *surrogateescape = 1;
3147        return 0;
3148    }
3149    PyErr_Format(PyExc_ValueError,
3150                 "only 'strict' and 'surrogateescape' error handlers "
3151                 "are supported, not '%s'",
3152                 errors);
3153    return -1;
3154}
3155
3156PyObject *
3157PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3158{
3159    Py_ssize_t wlen, wlen2;
3160    wchar_t *wstr;
3161    PyObject *bytes = NULL;
3162    char *errmsg;
3163    PyObject *reason;
3164    PyObject *exc;
3165    size_t error_pos;
3166    int surrogateescape;
3167
3168    if (locale_error_handler(errors, &surrogateescape) < 0)
3169        return NULL;
3170
3171    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3172    if (wstr == NULL)
3173        return NULL;
3174
3175    wlen2 = wcslen(wstr);
3176    if (wlen2 != wlen) {
3177        PyMem_Free(wstr);
3178        PyErr_SetString(PyExc_TypeError, "embedded null character");
3179        return NULL;
3180    }
3181
3182    if (surrogateescape) {
3183        /* locale encoding with surrogateescape */
3184        char *str;
3185
3186        str = _Py_wchar2char(wstr, &error_pos);
3187        if (str == NULL) {
3188            if (error_pos == (size_t)-1) {
3189                PyErr_NoMemory();
3190                PyMem_Free(wstr);
3191                return NULL;
3192            }
3193            else {
3194                goto encode_error;
3195            }
3196        }
3197        PyMem_Free(wstr);
3198
3199        bytes = PyBytes_FromString(str);
3200        PyMem_Free(str);
3201    }
3202    else {
3203        size_t len, len2;
3204
3205        len = wcstombs(NULL, wstr, 0);
3206        if (len == (size_t)-1) {
3207            error_pos = (size_t)-1;
3208            goto encode_error;
3209        }
3210
3211        bytes = PyBytes_FromStringAndSize(NULL, len);
3212        if (bytes == NULL) {
3213            PyMem_Free(wstr);
3214            return NULL;
3215        }
3216
3217        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3218        if (len2 == (size_t)-1 || len2 > len) {
3219            error_pos = (size_t)-1;
3220            goto encode_error;
3221        }
3222        PyMem_Free(wstr);
3223    }
3224    return bytes;
3225
3226encode_error:
3227    errmsg = strerror(errno);
3228    assert(errmsg != NULL);
3229
3230    if (error_pos == (size_t)-1)
3231        error_pos = wcstombs_errorpos(wstr);
3232
3233    PyMem_Free(wstr);
3234    Py_XDECREF(bytes);
3235
3236    if (errmsg != NULL) {
3237        size_t errlen;
3238        wstr = _Py_char2wchar(errmsg, &errlen);
3239        if (wstr != NULL) {
3240            reason = PyUnicode_FromWideChar(wstr, errlen);
3241            PyMem_Free(wstr);
3242        } else
3243            errmsg = NULL;
3244    }
3245    if (errmsg == NULL)
3246        reason = PyUnicode_FromString(
3247            "wcstombs() encountered an unencodable "
3248            "wide character");
3249    if (reason == NULL)
3250        return NULL;
3251
3252    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3253                                "locale", unicode,
3254                                (Py_ssize_t)error_pos,
3255                                (Py_ssize_t)(error_pos+1),
3256                                reason);
3257    Py_DECREF(reason);
3258    if (exc != NULL) {
3259        PyCodec_StrictErrors(exc);
3260        Py_XDECREF(exc);
3261    }
3262    return NULL;
3263}
3264
3265PyObject *
3266PyUnicode_EncodeFSDefault(PyObject *unicode)
3267{
3268#ifdef HAVE_MBCS
3269    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3270#elif defined(__APPLE__)
3271    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3272#else
3273    PyInterpreterState *interp = PyThreadState_GET()->interp;
3274    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3275       cannot use it to encode and decode filenames before it is loaded. Load
3276       the Python codec requires to encode at least its own filename. Use the C
3277       version of the locale codec until the codec registry is initialized and
3278       the Python codec is loaded.
3279
3280       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3281       cannot only rely on it: check also interp->fscodec_initialized for
3282       subinterpreters. */
3283    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3284        return PyUnicode_AsEncodedString(unicode,
3285                                         Py_FileSystemDefaultEncoding,
3286                                         "surrogateescape");
3287    }
3288    else {
3289        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3290    }
3291#endif
3292}
3293
3294PyObject *
3295PyUnicode_AsEncodedString(PyObject *unicode,
3296                          const char *encoding,
3297                          const char *errors)
3298{
3299    PyObject *v;
3300    char lower[11];  /* Enough for any encoding shortcut */
3301
3302    if (!PyUnicode_Check(unicode)) {
3303        PyErr_BadArgument();
3304        return NULL;
3305    }
3306
3307    /* Shortcuts for common default encodings */
3308    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3309        if ((strcmp(lower, "utf-8") == 0) ||
3310            (strcmp(lower, "utf8") == 0))
3311        {
3312            if (errors == NULL || strcmp(errors, "strict") == 0)
3313                return _PyUnicode_AsUTF8String(unicode, NULL);
3314            else
3315                return _PyUnicode_AsUTF8String(unicode, errors);
3316        }
3317        else if ((strcmp(lower, "latin-1") == 0) ||
3318                 (strcmp(lower, "latin1") == 0) ||
3319                 (strcmp(lower, "iso-8859-1") == 0))
3320            return _PyUnicode_AsLatin1String(unicode, errors);
3321#ifdef HAVE_MBCS
3322        else if (strcmp(lower, "mbcs") == 0)
3323            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3324#endif
3325        else if (strcmp(lower, "ascii") == 0)
3326            return _PyUnicode_AsASCIIString(unicode, errors);
3327    }
3328
3329    /* Encode via the codec registry */
3330    v = PyCodec_Encode(unicode, encoding, errors);
3331    if (v == NULL)
3332        return NULL;
3333
3334    /* The normal path */
3335    if (PyBytes_Check(v))
3336        return v;
3337
3338    /* If the codec returns a buffer, raise a warning and convert to bytes */
3339    if (PyByteArray_Check(v)) {
3340        int error;
3341        PyObject *b;
3342
3343        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3344            "encoder %s returned bytearray instead of bytes",
3345            encoding);
3346        if (error) {
3347            Py_DECREF(v);
3348            return NULL;
3349        }
3350
3351        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3352        Py_DECREF(v);
3353        return b;
3354    }
3355
3356    PyErr_Format(PyExc_TypeError,
3357                 "encoder did not return a bytes object (type=%.400s)",
3358                 Py_TYPE(v)->tp_name);
3359    Py_DECREF(v);
3360    return NULL;
3361}
3362
3363PyObject *
3364PyUnicode_AsEncodedUnicode(PyObject *unicode,
3365                           const char *encoding,
3366                           const char *errors)
3367{
3368    PyObject *v;
3369
3370    if (!PyUnicode_Check(unicode)) {
3371        PyErr_BadArgument();
3372        goto onError;
3373    }
3374
3375    if (encoding == NULL)
3376        encoding = PyUnicode_GetDefaultEncoding();
3377
3378    /* Encode via the codec registry */
3379    v = PyCodec_Encode(unicode, encoding, errors);
3380    if (v == NULL)
3381        goto onError;
3382    if (!PyUnicode_Check(v)) {
3383        PyErr_Format(PyExc_TypeError,
3384                     "encoder did not return an str object (type=%.400s)",
3385                     Py_TYPE(v)->tp_name);
3386        Py_DECREF(v);
3387        goto onError;
3388    }
3389    return v;
3390
3391  onError:
3392    return NULL;
3393}
3394
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() cannot decode.

   Returns the byte offset at which mbrtowc() reports a conversion
   error or an incomplete character.  Returns 0 when no such position
   is found, and always when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *p = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;
    size_t n;

    memset(&state, 0, sizeof state);
    for (; remaining != 0; p += n, remaining -= n) {
        n = mbrtowc(&wc, p, remaining, &state);
        if (n == 0) {
            /* Reached end of string */
            break;
        }
        if (n == (size_t)-1 || n == (size_t)-2) {
            /* Conversion error or incomplete character */
            return p - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3425
3426PyObject*
3427PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3428                              const char *errors)
3429{
3430    wchar_t smallbuf[256];
3431    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3432    wchar_t *wstr;
3433    size_t wlen, wlen2;
3434    PyObject *unicode;
3435    int surrogateescape;
3436    size_t error_pos;
3437    char *errmsg;
3438    PyObject *reason, *exc;
3439
3440    if (locale_error_handler(errors, &surrogateescape) < 0)
3441        return NULL;
3442
3443    if (str[len] != '\0' || len != strlen(str)) {
3444        PyErr_SetString(PyExc_TypeError, "embedded null character");
3445        return NULL;
3446    }
3447
3448    if (surrogateescape)
3449    {
3450        wstr = _Py_char2wchar(str, &wlen);
3451        if (wstr == NULL) {
3452            if (wlen == (size_t)-1)
3453                PyErr_NoMemory();
3454            else
3455                PyErr_SetFromErrno(PyExc_OSError);
3456            return NULL;
3457        }
3458
3459        unicode = PyUnicode_FromWideChar(wstr, wlen);
3460        PyMem_Free(wstr);
3461    }
3462    else {
3463#ifndef HAVE_BROKEN_MBSTOWCS
3464        wlen = mbstowcs(NULL, str, 0);
3465#else
3466        wlen = len;
3467#endif
3468        if (wlen == (size_t)-1)
3469            goto decode_error;
3470        if (wlen+1 <= smallbuf_len) {
3471            wstr = smallbuf;
3472        }
3473        else {
3474            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3475                return PyErr_NoMemory();
3476
3477            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3478            if (!wstr)
3479                return PyErr_NoMemory();
3480        }
3481
3482        /* This shouldn't fail now */
3483        wlen2 = mbstowcs(wstr, str, wlen+1);
3484        if (wlen2 == (size_t)-1) {
3485            if (wstr != smallbuf)
3486                PyMem_Free(wstr);
3487            goto decode_error;
3488        }
3489#ifdef HAVE_BROKEN_MBSTOWCS
3490        assert(wlen2 == wlen);
3491#endif
3492        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3493        if (wstr != smallbuf)
3494            PyMem_Free(wstr);
3495    }
3496    return unicode;
3497
3498decode_error:
3499    errmsg = strerror(errno);
3500    assert(errmsg != NULL);
3501
3502    error_pos = mbstowcs_errorpos(str, len);
3503    if (errmsg != NULL) {
3504        size_t errlen;
3505        wstr = _Py_char2wchar(errmsg, &errlen);
3506        if (wstr != NULL) {
3507            reason = PyUnicode_FromWideChar(wstr, errlen);
3508            PyMem_Free(wstr);
3509        } else
3510            errmsg = NULL;
3511    }
3512    if (errmsg == NULL)
3513        reason = PyUnicode_FromString(
3514            "mbstowcs() encountered an invalid multibyte sequence");
3515    if (reason == NULL)
3516        return NULL;
3517
3518    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3519                                "locale", str, len,
3520                                (Py_ssize_t)error_pos,
3521                                (Py_ssize_t)(error_pos+1),
3522                                reason);
3523    Py_DECREF(reason);
3524    if (exc != NULL) {
3525        PyCodec_StrictErrors(exc);
3526        Py_XDECREF(exc);
3527    }
3528    return NULL;
3529}
3530
3531PyObject*
3532PyUnicode_DecodeLocale(const char *str, const char *errors)
3533{
3534    Py_ssize_t size = (Py_ssize_t)strlen(str);
3535    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3536}
3537
3538
3539PyObject*
3540PyUnicode_DecodeFSDefault(const char *s) {
3541    Py_ssize_t size = (Py_ssize_t)strlen(s);
3542    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3543}
3544
3545PyObject*
3546PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3547{
3548#ifdef HAVE_MBCS
3549    return PyUnicode_DecodeMBCS(s, size, NULL);
3550#elif defined(__APPLE__)
3551    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3552#else
3553    PyInterpreterState *interp = PyThreadState_GET()->interp;
3554    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3555       cannot use it to encode and decode filenames before it is loaded. Load
3556       the Python codec requires to encode at least its own filename. Use the C
3557       version of the locale codec until the codec registry is initialized and
3558       the Python codec is loaded.
3559
3560       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3561       cannot only rely on it: check also interp->fscodec_initialized for
3562       subinterpreters. */
3563    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3564        return PyUnicode_Decode(s, size,
3565                                Py_FileSystemDefaultEncoding,
3566                                "surrogateescape");
3567    }
3568    else {
3569        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3570    }
3571#endif
3572}
3573
3574
3575int
3576_PyUnicode_HasNULChars(PyObject* str)
3577{
3578    Py_ssize_t pos;
3579
3580    if (PyUnicode_READY(str) == -1)
3581        return -1;
3582    pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3583                   PyUnicode_GET_LENGTH(str), '\0', 1);
3584    if (pos == -1)
3585        return 0;
3586    else
3587        return 1;
3588}
3589
3590int
3591PyUnicode_FSConverter(PyObject* arg, void* addr)
3592{
3593    PyObject *output = NULL;
3594    Py_ssize_t size;
3595    void *data;
3596    if (arg == NULL) {
3597        Py_DECREF(*(PyObject**)addr);
3598        return 1;
3599    }
3600    if (PyBytes_Check(arg)) {
3601        output = arg;
3602        Py_INCREF(output);
3603    }
3604    else {
3605        arg = PyUnicode_FromObject(arg);
3606        if (!arg)
3607            return 0;
3608        output = PyUnicode_EncodeFSDefault(arg);
3609        Py_DECREF(arg);
3610        if (!output)
3611            return 0;
3612        if (!PyBytes_Check(output)) {
3613            Py_DECREF(output);
3614            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3615            return 0;
3616        }
3617    }
3618    size = PyBytes_GET_SIZE(output);
3619    data = PyBytes_AS_STRING(output);
3620    if (size != strlen(data)) {
3621        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3622        Py_DECREF(output);
3623        return 0;
3624    }
3625    *(PyObject**)addr = output;
3626    return Py_CLEANUP_SUPPORTED;
3627}
3628
3629
3630int
3631PyUnicode_FSDecoder(PyObject* arg, void* addr)
3632{
3633    PyObject *output = NULL;
3634    if (arg == NULL) {
3635        Py_DECREF(*(PyObject**)addr);
3636        return 1;
3637    }
3638    if (PyUnicode_Check(arg)) {
3639        if (PyUnicode_READY(arg) == -1)
3640            return 0;
3641        output = arg;
3642        Py_INCREF(output);
3643    }
3644    else {
3645        arg = PyBytes_FromObject(arg);
3646        if (!arg)
3647            return 0;
3648        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3649                                                  PyBytes_GET_SIZE(arg));
3650        Py_DECREF(arg);
3651        if (!output)
3652            return 0;
3653        if (!PyUnicode_Check(output)) {
3654            Py_DECREF(output);
3655            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3656            return 0;
3657        }
3658    }
3659    if (PyUnicode_READY(output) == -1) {
3660        Py_DECREF(output);
3661        return 0;
3662    }
3663    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3664                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3665        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3666        Py_DECREF(output);
3667        return 0;
3668    }
3669    *(PyObject**)addr = output;
3670    return Py_CLEANUP_SUPPORTED;
3671}
3672
3673
3674char*
3675PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3676{
3677    PyObject *bytes;
3678
3679    if (!PyUnicode_Check(unicode)) {
3680        PyErr_BadArgument();
3681        return NULL;
3682    }
3683    if (PyUnicode_READY(unicode) == -1)
3684        return NULL;
3685
3686    if (PyUnicode_UTF8(unicode) == NULL) {
3687        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3688        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3689        if (bytes == NULL)
3690            return NULL;
3691        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3692        if (_PyUnicode_UTF8(unicode) == NULL) {
3693            Py_DECREF(bytes);
3694            return NULL;
3695        }
3696        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3697        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3698                  PyBytes_AS_STRING(bytes),
3699                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3700        Py_DECREF(bytes);
3701    }
3702
3703    if (psize)
3704        *psize = PyUnicode_UTF8_LENGTH(unicode);
3705    return PyUnicode_UTF8(unicode);
3706}
3707
3708char*
3709PyUnicode_AsUTF8(PyObject *unicode)
3710{
3711    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3712}
3713
3714Py_UNICODE *
3715PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3716{
3717    const unsigned char *one_byte;
3718#if SIZEOF_WCHAR_T == 4
3719    const Py_UCS2 *two_bytes;
3720#else
3721    const Py_UCS4 *four_bytes;
3722    const Py_UCS4 *ucs4_end;
3723    Py_ssize_t num_surrogates;
3724#endif
3725    wchar_t *w;
3726    wchar_t *wchar_end;
3727
3728    if (!PyUnicode_Check(unicode)) {
3729        PyErr_BadArgument();
3730        return NULL;
3731    }
3732    if (_PyUnicode_WSTR(unicode) == NULL) {
3733        /* Non-ASCII compact unicode object */
3734        assert(_PyUnicode_KIND(unicode) != 0);
3735        assert(PyUnicode_IS_READY(unicode));
3736
3737        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3738#if SIZEOF_WCHAR_T == 2
3739            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3740            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3741            num_surrogates = 0;
3742
3743            for (; four_bytes < ucs4_end; ++four_bytes) {
3744                if (*four_bytes > 0xFFFF)
3745                    ++num_surrogates;
3746            }
3747
3748            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3749                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3750            if (!_PyUnicode_WSTR(unicode)) {
3751                PyErr_NoMemory();
3752                return NULL;
3753            }
3754            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3755
3756            w = _PyUnicode_WSTR(unicode);
3757            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3758            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3759            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3760                if (*four_bytes > 0xFFFF) {
3761                    assert(*four_bytes <= MAX_UNICODE);
3762                    /* encode surrogate pair in this case */
3763                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3764                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3765                }
3766                else
3767                    *w = *four_bytes;
3768
3769                if (w > wchar_end) {
3770                    assert(0 && "Miscalculated string end");
3771                }
3772            }
3773            *w = 0;
3774#else
3775            /* sizeof(wchar_t) == 4 */
3776            Py_FatalError("Impossible unicode object state, wstr and str "
3777                          "should share memory already.");
3778            return NULL;
3779#endif
3780        }
3781        else {
3782            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3783                                                  (_PyUnicode_LENGTH(unicode) + 1));
3784            if (!_PyUnicode_WSTR(unicode)) {
3785                PyErr_NoMemory();
3786                return NULL;
3787            }
3788            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3789                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3790            w = _PyUnicode_WSTR(unicode);
3791            wchar_end = w + _PyUnicode_LENGTH(unicode);
3792
3793            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3794                one_byte = PyUnicode_1BYTE_DATA(unicode);
3795                for (; w < wchar_end; ++one_byte, ++w)
3796                    *w = *one_byte;
3797                /* null-terminate the wstr */
3798                *w = 0;
3799            }
3800            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3801#if SIZEOF_WCHAR_T == 4
3802                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3803                for (; w < wchar_end; ++two_bytes, ++w)
3804                    *w = *two_bytes;
3805                /* null-terminate the wstr */
3806                *w = 0;
3807#else
3808                /* sizeof(wchar_t) == 2 */
3809                PyObject_FREE(_PyUnicode_WSTR(unicode));
3810                _PyUnicode_WSTR(unicode) = NULL;
3811                Py_FatalError("Impossible unicode object state, wstr "
3812                              "and str should share memory already.");
3813                return NULL;
3814#endif
3815            }
3816            else {
3817                assert(0 && "This should never happen.");
3818            }
3819        }
3820    }
3821    if (size != NULL)
3822        *size = PyUnicode_WSTR_LENGTH(unicode);
3823    return _PyUnicode_WSTR(unicode);
3824}
3825
3826Py_UNICODE *
3827PyUnicode_AsUnicode(PyObject *unicode)
3828{
3829    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3830}
3831
3832
3833Py_ssize_t
3834PyUnicode_GetSize(PyObject *unicode)
3835{
3836    if (!PyUnicode_Check(unicode)) {
3837        PyErr_BadArgument();
3838        goto onError;
3839    }
3840    return PyUnicode_GET_SIZE(unicode);
3841
3842  onError:
3843    return -1;
3844}
3845
3846Py_ssize_t
3847PyUnicode_GetLength(PyObject *unicode)
3848{
3849    if (!PyUnicode_Check(unicode)) {
3850        PyErr_BadArgument();
3851        return -1;
3852    }
3853    if (PyUnicode_READY(unicode) == -1)
3854        return -1;
3855    return PyUnicode_GET_LENGTH(unicode);
3856}
3857
3858Py_UCS4
3859PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3860{
3861    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3862        PyErr_BadArgument();
3863        return (Py_UCS4)-1;
3864    }
3865    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3866        PyErr_SetString(PyExc_IndexError, "string index out of range");
3867        return (Py_UCS4)-1;
3868    }
3869    return PyUnicode_READ_CHAR(unicode, index);
3870}
3871
3872int
3873PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3874{
3875    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3876        PyErr_BadArgument();
3877        return -1;
3878    }
3879    assert(PyUnicode_IS_READY(unicode));
3880    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3881        PyErr_SetString(PyExc_IndexError, "string index out of range");
3882        return -1;
3883    }
3884    if (unicode_check_modifiable(unicode))
3885        return -1;
3886    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3887        PyErr_SetString(PyExc_ValueError, "character out of range");
3888        return -1;
3889    }
3890    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3891                    index, ch);
3892    return 0;
3893}
3894
/* Return the name of the default encoding; it is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3900
3901/* create or adjust a UnicodeDecodeError */
3902static void
3903make_decode_exception(PyObject **exceptionObject,
3904                      const char *encoding,
3905                      const char *input, Py_ssize_t length,
3906                      Py_ssize_t startpos, Py_ssize_t endpos,
3907                      const char *reason)
3908{
3909    if (*exceptionObject == NULL) {
3910        *exceptionObject = PyUnicodeDecodeError_Create(
3911            encoding, input, length, startpos, endpos, reason);
3912    }
3913    else {
3914        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3915            goto onError;
3916        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3917            goto onError;
3918        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3919            goto onError;
3920    }
3921    return;
3922
3923onError:
3924    Py_DECREF(*exceptionObject);
3925    *exceptionObject = NULL;
3926}
3927
3928/* error handling callback helper:
3929   build arguments, call the callback and check the arguments,
3930   if no exception occurred, copy the replacement to the output
3931   and adjust various state variables.
3932   return 0 on success, -1 on error
3933*/
3934
3935static int
3936unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3937                                 const char *encoding, const char *reason,
3938                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3939                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3940                                 PyObject **output, Py_ssize_t *outpos)
3941{
3942    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3943
3944    PyObject *restuple = NULL;
3945    PyObject *repunicode = NULL;
3946    Py_ssize_t outsize;
3947    Py_ssize_t insize;
3948    Py_ssize_t requiredsize;
3949    Py_ssize_t newpos;
3950    PyObject *inputobj = NULL;
3951    int res = -1;
3952
3953    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3954        outsize = PyUnicode_GET_LENGTH(*output);
3955    else
3956        outsize = _PyUnicode_WSTR_LENGTH(*output);
3957
3958    if (*errorHandler == NULL) {
3959        *errorHandler = PyCodec_LookupError(errors);
3960        if (*errorHandler == NULL)
3961            goto onError;
3962    }
3963
3964    make_decode_exception(exceptionObject,
3965        encoding,
3966        *input, *inend - *input,
3967        *startinpos, *endinpos,
3968        reason);
3969    if (*exceptionObject == NULL)
3970        goto onError;
3971
3972    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3973    if (restuple == NULL)
3974        goto onError;
3975    if (!PyTuple_Check(restuple)) {
3976        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3977        goto onError;
3978    }
3979    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3980        goto onError;
3981    if (PyUnicode_READY(repunicode) == -1)
3982        goto onError;
3983
3984    /* Copy back the bytes variables, which might have been modified by the
3985       callback */
3986    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3987    if (!inputobj)
3988        goto onError;
3989    if (!PyBytes_Check(inputobj)) {
3990        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3991    }
3992    *input = PyBytes_AS_STRING(inputobj);
3993    insize = PyBytes_GET_SIZE(inputobj);
3994    *inend = *input + insize;
3995    /* we can DECREF safely, as the exception has another reference,
3996       so the object won't go away. */
3997    Py_DECREF(inputobj);
3998
3999    if (newpos<0)
4000        newpos = insize+newpos;
4001    if (newpos<0 || newpos>insize) {
4002        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4003        goto onError;
4004    }
4005
4006    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4007        /* need more space? (at least enough for what we
4008           have+the replacement+the rest of the string (starting
4009           at the new input position), so we won't have to check space
4010           when there are no errors in the rest of the string) */
4011        Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4012        requiredsize = *outpos + replen + insize-newpos;
4013        if (requiredsize > outsize) {
4014            if (requiredsize<2*outsize)
4015                requiredsize = 2*outsize;
4016            if (unicode_resize(output, requiredsize) < 0)
4017                goto onError;
4018        }
4019        if (unicode_widen(output, *outpos,
4020                          PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
4021            goto onError;
4022        _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
4023        *outpos += replen;
4024    }
4025    else {
4026        wchar_t *repwstr;
4027        Py_ssize_t repwlen;
4028        repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4029        if (repwstr == NULL)
4030            goto onError;
4031        /* need more space? (at least enough for what we
4032           have+the replacement+the rest of the string (starting
4033           at the new input position), so we won't have to check space
4034           when there are no errors in the rest of the string) */
4035        requiredsize = *outpos + repwlen + insize-newpos;
4036        if (requiredsize > outsize) {
4037            if (requiredsize < 2*outsize)
4038                requiredsize = 2*outsize;
4039            if (unicode_resize(output, requiredsize) < 0)
4040                goto onError;
4041        }
4042        wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4043        *outpos += repwlen;
4044    }
4045    *endinpos = newpos;
4046    *inptr = *input + newpos;
4047
4048    /* we made it! */
4049    res = 0;
4050
4051  onError:
4052    Py_XDECREF(restuple);
4053    return res;
4054}
4055
4056/* --- UTF-7 Codec -------------------------------------------------------- */
4057
4058/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4059
/* Three simple macros defining base-64.
 *
 * IS_BASE64, FROM_BASE64 and DECODE_DIRECT expand their argument several
 * times, so the argument must be an expression without side effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; every other
 * input (meant to be '/') yields 63, so callers must check IS_BASE64
 * first. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
 * Higher bits of n are masked off with 0x3f before the table lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Bytes above 127 are never decoded directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4090
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4124
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        character to classify; only values 1..127 can be direct
 *          (NUL and anything >= 128 are rejected before the table is
 *          consulted).  c is expanded several times, so it must have no
 *          side effects.
 * directO  non-zero if Set O characters (category 1) are written as-is
 * directWS non-zero if whitespace (category 2) is written as-is
 * Set D characters (category 0) are always direct. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4136
4137PyObject *
4138PyUnicode_DecodeUTF7(const char *s,
4139                     Py_ssize_t size,
4140                     const char *errors)
4141{
4142    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4143}
4144
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size   input bytes and their count
 * errors    error handler name, forwarded to
 *           unicode_decode_call_errorhandler()
 * consumed  if non-NULL, receives the number of input bytes decoded; an
 *           unfinished shift sequence at the end of the input is then
 *           not an error but is left unconsumed (the output is cut back
 *           to where the sequence began).  If NULL, an inconsistent
 *           trailing shift sequence is reported through the error
 *           handler.
 *
 * Returns a new str object, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyObject *unicode;
    const char *errmsg = "";
    int inShift = 0;                 /* non-zero inside a "+...-" section */
    Py_ssize_t shiftOutStart;        /* output position where the current
                                        shift sequence started */
    unsigned int base64bits = 0;     /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;  /* accumulated base-64 bits */
    Py_UCS4 surrogate = 0;           /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return unicode;
    }

    shiftOutStart = outpos = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used after the end-of-input error handler call
           below moved `s` back inside the input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as is and
                               handle outCh normally below */
                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* a high surrogate without its partner is emitted
                       unchanged */
                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts, for error reporting and
               for the stateful "back off" at the end */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (unicode_putchar(&unicode, &outpos, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = outpos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            s++;
        }
        else {
            /* byte above 127 outside a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* The handler may replace the input buffer (starts, e, s) and the
           output object, which is why they are all passed by address. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos))
                goto onError;
            /* the handler asked to resume before the end of the input:
               jump back into the decoding loop */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            outpos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* trim the output to the number of characters actually written */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
4335
4336
4337PyObject *
4338_PyUnicode_EncodeUTF7(PyObject *str,
4339                      int base64SetO,
4340                      int base64WhiteSpace,
4341                      const char *errors)
4342{
4343    int kind;
4344    void *data;
4345    Py_ssize_t len;
4346    PyObject *v;
4347    int inShift = 0;
4348    Py_ssize_t i;
4349    unsigned int base64bits = 0;
4350    unsigned long base64buffer = 0;
4351    char * out;
4352    char * start;
4353
4354    if (PyUnicode_READY(str) == -1)
4355        return NULL;
4356    kind = PyUnicode_KIND(str);
4357    data = PyUnicode_DATA(str);
4358    len = PyUnicode_GET_LENGTH(str);
4359
4360    if (len == 0)
4361        return PyBytes_FromStringAndSize(NULL, 0);
4362
4363    /* It might be possible to tighten this worst case */
4364    if (len > PY_SSIZE_T_MAX / 8)
4365        return PyErr_NoMemory();
4366    v = PyBytes_FromStringAndSize(NULL, len * 8);
4367    if (v == NULL)
4368        return NULL;
4369
4370    start = out = PyBytes_AS_STRING(v);
4371    for (i = 0; i < len; ++i) {
4372        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4373
4374        if (inShift) {
4375            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4376                /* shifting out */
4377                if (base64bits) { /* output remaining bits */
4378                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4379                    base64buffer = 0;
4380                    base64bits = 0;
4381                }
4382                inShift = 0;
4383                /* Characters not in the BASE64 set implicitly unshift the sequence
4384                   so no '-' is required, except if the character is itself a '-' */
4385                if (IS_BASE64(ch) || ch == '-') {
4386                    *out++ = '-';
4387                }
4388                *out++ = (char) ch;
4389            }
4390            else {
4391                goto encode_char;
4392            }
4393        }
4394        else { /* not in a shift sequence */
4395            if (ch == '+') {
4396                *out++ = '+';
4397                        *out++ = '-';
4398            }
4399            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4400                *out++ = (char) ch;
4401            }
4402            else {
4403                *out++ = '+';
4404                inShift = 1;
4405                goto encode_char;
4406            }
4407        }
4408        continue;
4409encode_char:
4410        if (ch >= 0x10000) {
4411            assert(ch <= MAX_UNICODE);
4412
4413            /* code first surrogate */
4414            base64bits += 16;
4415            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4416            while (base64bits >= 6) {
4417                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4418                base64bits -= 6;
4419            }
4420            /* prepare second surrogate */
4421            ch = Py_UNICODE_LOW_SURROGATE(ch);
4422        }
4423        base64bits += 16;
4424        base64buffer = (base64buffer << 16) | ch;
4425        while (base64bits >= 6) {
4426            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4427            base64bits -= 6;
4428        }
4429    }
4430    if (base64bits)
4431        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4432    if (inShift)
4433        *out++ = '-';
4434    if (_PyBytes_Resize(&v, out - start) < 0)
4435        return NULL;
4436    return v;
4437}
4438PyObject *
4439PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4440                     Py_ssize_t size,
4441                     int base64SetO,
4442                     int base64WhiteSpace,
4443                     const char *errors)
4444{
4445    PyObject *result;
4446    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4447    if (tmp == NULL)
4448        return NULL;
4449    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4450                                   base64WhiteSpace, errors);
4451    Py_DECREF(tmp);
4452    return result;
4453}
4454
4455#undef IS_BASE64
4456#undef FROM_BASE64
4457#undef TO_BASE64
4458#undef DECODE_DIRECT
4459#undef ENCODE_DIRECT
4460
4461/* --- UTF-8 Codec -------------------------------------------------------- */
4462
4463PyObject *
4464PyUnicode_DecodeUTF8(const char *s,
4465                     Py_ssize_t size,
4466                     const char *errors)
4467{
4468    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4469}
4470
4471#include "stringlib/asciilib.h"
4472#include "stringlib/codecs.h"
4473#include "stringlib/undef.h"
4474
4475#include "stringlib/ucs1lib.h"
4476#include "stringlib/codecs.h"
4477#include "stringlib/undef.h"
4478
4479#include "stringlib/ucs2lib.h"
4480#include "stringlib/codecs.h"
4481#include "stringlib/undef.h"
4482
4483#include "stringlib/ucs4lib.h"
4484#include "stringlib/codecs.h"
4485#include "stringlib/undef.h"
4486
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of
   every byte set, so `value & ASCII_CHAR_MASK` is non-zero as soon as
   one byte of the word is >= 0x80.  Only 4- and 8-byte longs are
   supported; anything else is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4496
4497static Py_ssize_t
4498ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4499{
4500    const char *p = start;
4501    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4502
4503#if SIZEOF_LONG <= SIZEOF_VOID_P
4504    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4505    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4506        /* Fast path, see in STRINGLIB(utf8_decode) for
4507           an explanation. */
4508        /* Help register allocation */
4509        register const char *_p = p;
4510        register Py_UCS1 * q = dest;
4511        while (_p < aligned_end) {
4512            unsigned long value = *(const unsigned long *) _p;
4513            if (value & ASCII_CHAR_MASK)
4514                break;
4515            *((unsigned long *)q) = value;
4516            _p += SIZEOF_LONG;
4517            q += SIZEOF_LONG;
4518        }
4519        p = _p;
4520        while (p < end) {
4521            if ((unsigned char)*p & 0x80)
4522                break;
4523            *q++ = *p++;
4524        }
4525        return p - start;
4526    }
4527#endif
4528    while (p < end) {
4529        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4530           for an explanation. */
4531        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4532            /* Help register allocation */
4533            register const char *_p = p;
4534            while (_p < aligned_end) {
4535                unsigned long value = *(unsigned long *) _p;
4536                if (value & ASCII_CHAR_MASK)
4537                    break;
4538                _p += SIZEOF_LONG;
4539            }
4540            p = _p;
4541            if (_p == end)
4542                break;
4543        }
4544        if ((unsigned char)*p & 0x80)
4545            break;
4546        ++p;
4547    }
4548    memcpy(dest, start, p - start);
4549    return p - start;
4550}
4551
/* Decode `size` bytes of UTF-8 data at `s` into a new str object.

   errors    error handler name, forwarded to
             unicode_decode_call_errorhandler()
   consumed  if non-NULL, receives the number of input bytes decoded; in
             that mode decoding stops quietly as soon as the per-kind
             decoder returns 0, instead of reporting
             "unexpected end of data"

   Returns a new reference, or NULL with an exception set on error. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    PyObject *unicode;
    const char *starts = s;
    const char *end = s + size;
    Py_ssize_t outpos;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    /* Start with an ASCII-only result of the maximum possible length; it
       is trimmed to `outpos` at End. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;

    /* Copy the leading pure-ASCII run in one step. */
    outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
    s += outpos;
    while (s < end) {
        Py_UCS4 ch;
        /* The kind is re-read on every iteration because `unicode` is
           passed by address to unicode_putchar() and the error handler
           below, which may replace or change it. */
        int kind = PyUnicode_KIND(unicode);
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(unicode))
                ch = asciilib_utf8_decode(&s, end,
                        PyUnicode_1BYTE_DATA(unicode), &outpos);
            else
                ch = ucs1lib_utf8_decode(&s, end,
                        PyUnicode_1BYTE_DATA(unicode), &outpos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end,
                    PyUnicode_2BYTE_DATA(unicode), &outpos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end,
                    PyUnicode_4BYTE_DATA(unicode), &outpos);
        }

        /* Return value of the per-kind decoder as handled here:
           0, 1 and 2 are status codes, anything else is a decoded
           character that is written with unicode_putchar(). */
        switch (ch) {
        case 0:
            /* all input used up, or stateful mode: stop without error */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            /* extend the error range over the following continuation
               bytes (10xxxxxx) */
            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
                endinpos++;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            /* extend the error range over the following continuation
               bytes (10xxxxxx) */
            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
                endinpos++;
            break;
        default:
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            continue;
        }

        /* Only reached for the error cases above.  The handler may
           replace the input buffer (starts, end, s) and the output
           object. */
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

End:
    /* trim the result to the number of characters actually written */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(unicode);
    return NULL;
}
4662
4663#ifdef __APPLE__
4664
4665/* Simplified UTF-8 decoder using surrogateescape error handler,
4666   used to decode the command line arguments on Mac OS X. */
4667
4668wchar_t*
4669_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4670{
4671    const char *e;
4672    wchar_t *unicode;
4673    Py_ssize_t outpos;
4674
4675    /* Note: size will always be longer than the resulting Unicode
4676       character count */
4677    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4678        PyErr_NoMemory();
4679        return NULL;
4680    }
4681    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4682    if (!unicode)
4683        return NULL;
4684
4685    /* Unpack UTF-8 encoded data */
4686    e = s + size;
4687    outpos = 0;
4688    while (s < e) {
4689        Py_UCS4 ch;
4690#if SIZEOF_WCHAR_T == 4
4691        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4692#else
4693        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4694#endif
4695        if (ch > 0xFF) {
4696#if SIZEOF_WCHAR_T == 4
4697            assert(0);
4698#else
4699            assert(Py_UNICODE_IS_SURROGATE(ch));
4700            /*  compute and append the two surrogates: */
4701            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4702            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4703#endif
4704        }
4705        else {
4706            if (!ch && s == e)
4707                break;
4708            /* surrogateescape */
4709            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4710        }
4711    }
4712    unicode[outpos] = L'\0';
4713    return unicode;
4714}
4715
4716#endif /* __APPLE__ */
4717
4718/* Primary internal function which creates utf8 encoded bytes objects.
4719
4720   Allocation strategy:  if the string is short, convert into a stack buffer
4721   and allocate exactly as much space needed at the end.  Else allocate the
4722   maximum possible needed (4 result bytes per Unicode character), and return
4723   the excess memory at the end.
4724*/
4725PyObject *
4726_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4727{
4728    enum PyUnicode_Kind kind;
4729    void *data;
4730    Py_ssize_t size;
4731
4732    if (!PyUnicode_Check(unicode)) {
4733        PyErr_BadArgument();
4734        return NULL;
4735    }
4736
4737    if (PyUnicode_READY(unicode) == -1)
4738        return NULL;
4739
4740    if (PyUnicode_UTF8(unicode))
4741        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4742                                         PyUnicode_UTF8_LENGTH(unicode));
4743
4744    kind = PyUnicode_KIND(unicode);
4745    data = PyUnicode_DATA(unicode);
4746    size = PyUnicode_GET_LENGTH(unicode);
4747
4748    switch (kind) {
4749    default:
4750        assert(0);
4751    case PyUnicode_1BYTE_KIND:
4752        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4753        assert(!PyUnicode_IS_ASCII(unicode));
4754        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4755    case PyUnicode_2BYTE_KIND:
4756        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4757    case PyUnicode_4BYTE_KIND:
4758        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4759    }
4760}
4761
4762PyObject *
4763PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4764                     Py_ssize_t size,
4765                     const char *errors)
4766{
4767    PyObject *v, *unicode;
4768
4769    unicode = PyUnicode_FromUnicode(s, size);
4770    if (unicode == NULL)
4771        return NULL;
4772    v = _PyUnicode_AsUTF8String(unicode, errors);
4773    Py_DECREF(unicode);
4774    return v;
4775}
4776
4777PyObject *
4778PyUnicode_AsUTF8String(PyObject *unicode)
4779{
4780    return _PyUnicode_AsUTF8String(unicode, NULL);
4781}
4782
4783/* --- UTF-32 Codec ------------------------------------------------------- */
4784
4785PyObject *
4786PyUnicode_DecodeUTF32(const char *s,
4787                      Py_ssize_t size,
4788                      const char *errors,
4789                      int *byteorder)
4790{
4791    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4792}
4793
4794PyObject *
4795PyUnicode_DecodeUTF32Stateful(const char *s,
4796                              Py_ssize_t size,
4797                              const char *errors,
4798                              int *byteorder,
4799                              Py_ssize_t *consumed)
4800{
4801    const char *starts = s;
4802    Py_ssize_t startinpos;
4803    Py_ssize_t endinpos;
4804    Py_ssize_t outpos;
4805    PyObject *unicode;
4806    const unsigned char *q, *e;
4807    int bo = 0;       /* assume native ordering by default */
4808    const char *errmsg = "";
4809    /* Offsets from q for retrieving bytes in the right order. */
4810#if PY_LITTLE_ENDIAN
4811    int iorder[] = {0, 1, 2, 3};
4812#else
4813    int iorder[] = {3, 2, 1, 0};
4814#endif
4815    PyObject *errorHandler = NULL;
4816    PyObject *exc = NULL;
4817
4818    q = (unsigned char *)s;
4819    e = q + size;
4820
4821    if (byteorder)
4822        bo = *byteorder;
4823
4824    /* Check for BOM marks (U+FEFF) in the input and adjust current
4825       byte order setting accordingly. In native mode, the leading BOM
4826       mark is skipped, in all other modes, it is copied to the output
4827       stream as-is (giving a ZWNBSP character). */
4828    if (bo == 0) {
4829        if (size >= 4) {
4830            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4831                (q[iorder[1]] << 8) | q[iorder[0]];
4832#if PY_LITTLE_ENDIAN
4833            if (bom == 0x0000FEFF) {
4834                q += 4;
4835                bo = -1;
4836            }
4837            else if (bom == 0xFFFE0000) {
4838                q += 4;
4839                bo = 1;
4840            }
4841#else
4842            if (bom == 0x0000FEFF) {
4843                q += 4;
4844                bo = 1;
4845            }
4846            else if (bom == 0xFFFE0000) {
4847                q += 4;
4848                bo = -1;
4849            }
4850#endif
4851        }
4852    }
4853
4854    if (bo == -1) {
4855        /* force LE */
4856        iorder[0] = 0;
4857        iorder[1] = 1;
4858        iorder[2] = 2;
4859        iorder[3] = 3;
4860    }
4861    else if (bo == 1) {
4862        /* force BE */
4863        iorder[0] = 3;
4864        iorder[1] = 2;
4865        iorder[2] = 1;
4866        iorder[3] = 0;
4867    }
4868
4869    /* This might be one to much, because of a BOM */
4870    unicode = PyUnicode_New((size+3)/4, 127);
4871    if (!unicode)
4872        return NULL;
4873    if (size == 0)
4874        return unicode;
4875    outpos = 0;
4876
4877    while (q < e) {
4878        Py_UCS4 ch;
4879        /* remaining bytes at the end? (size should be divisible by 4) */
4880        if (e-q<4) {
4881            if (consumed)
4882                break;
4883            errmsg = "truncated data";
4884            startinpos = ((const char *)q)-starts;
4885            endinpos = ((const char *)e)-starts;
4886            goto utf32Error;
4887            /* The remaining input chars are ignored if the callback
4888               chooses to skip the input */
4889        }
4890        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4891            (q[iorder[1]] << 8) | q[iorder[0]];
4892
4893        if (ch >= 0x110000)
4894        {
4895            errmsg = "codepoint not in range(0x110000)";
4896            startinpos = ((const char *)q)-starts;
4897            endinpos = startinpos+4;
4898            goto utf32Error;
4899        }
4900        if (unicode_putchar(&unicode, &outpos, ch) < 0)
4901            goto onError;
4902        q += 4;
4903        continue;
4904      utf32Error:
4905        if (unicode_decode_call_errorhandler(
4906                errors, &errorHandler,
4907                "utf32", errmsg,
4908                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4909                &unicode, &outpos))
4910            goto onError;
4911    }
4912
4913    if (byteorder)
4914        *byteorder = bo;
4915
4916    if (consumed)
4917        *consumed = (const char *)q-starts;
4918
4919    /* Adjust length */
4920    if (unicode_resize(&unicode, outpos) < 0)
4921        goto onError;
4922
4923    Py_XDECREF(errorHandler);
4924    Py_XDECREF(exc);
4925    return unicode_result(unicode);
4926
4927  onError:
4928    Py_DECREF(unicode);
4929    Py_XDECREF(errorHandler);
4930    Py_XDECREF(exc);
4931    return NULL;
4932}
4933
4934PyObject *
4935_PyUnicode_EncodeUTF32(PyObject *str,
4936                       const char *errors,
4937                       int byteorder)
4938{
4939    int kind;
4940    void *data;
4941    Py_ssize_t len;
4942    PyObject *v;
4943    unsigned char *p;
4944    Py_ssize_t nsize, i;
4945    /* Offsets from p for storing byte pairs in the right order. */
4946#if PY_LITTLE_ENDIAN
4947    int iorder[] = {0, 1, 2, 3};
4948#else
4949    int iorder[] = {3, 2, 1, 0};
4950#endif
4951
4952#define STORECHAR(CH)                           \
4953    do {                                        \
4954        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
4955        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
4956        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
4957        p[iorder[0]] = (CH) & 0xff;             \
4958        p += 4;                                 \
4959    } while(0)
4960
4961    if (!PyUnicode_Check(str)) {
4962        PyErr_BadArgument();
4963        return NULL;
4964    }
4965    if (PyUnicode_READY(str) == -1)
4966        return NULL;
4967    kind = PyUnicode_KIND(str);
4968    data = PyUnicode_DATA(str);
4969    len = PyUnicode_GET_LENGTH(str);
4970
4971    nsize = len + (byteorder == 0);
4972    if (nsize > PY_SSIZE_T_MAX / 4)
4973        return PyErr_NoMemory();
4974    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
4975    if (v == NULL)
4976        return NULL;
4977
4978    p = (unsigned char *)PyBytes_AS_STRING(v);
4979    if (byteorder == 0)
4980        STORECHAR(0xFEFF);
4981    if (len == 0)
4982        goto done;
4983
4984    if (byteorder == -1) {
4985        /* force LE */
4986        iorder[0] = 0;
4987        iorder[1] = 1;
4988        iorder[2] = 2;
4989        iorder[3] = 3;
4990    }
4991    else if (byteorder == 1) {
4992        /* force BE */
4993        iorder[0] = 3;
4994        iorder[1] = 2;
4995        iorder[2] = 1;
4996        iorder[3] = 0;
4997    }
4998
4999    for (i = 0; i < len; i++)
5000        STORECHAR(PyUnicode_READ(kind, data, i));
5001
5002  done:
5003    return v;
5004#undef STORECHAR
5005}
5006
5007PyObject *
5008PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5009                      Py_ssize_t size,
5010                      const char *errors,
5011                      int byteorder)
5012{
5013    PyObject *result;
5014    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5015    if (tmp == NULL)
5016        return NULL;
5017    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5018    Py_DECREF(tmp);
5019    return result;
5020}
5021
5022PyObject *
5023PyUnicode_AsUTF32String(PyObject *unicode)
5024{
5025    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5026}
5027
5028/* --- UTF-16 Codec ------------------------------------------------------- */
5029
5030PyObject *
5031PyUnicode_DecodeUTF16(const char *s,
5032                      Py_ssize_t size,
5033                      const char *errors,
5034                      int *byteorder)
5035{
5036    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5037}
5038
5039PyObject *
5040PyUnicode_DecodeUTF16Stateful(const char *s,
5041                              Py_ssize_t size,
5042                              const char *errors,
5043                              int *byteorder,
5044                              Py_ssize_t *consumed)
5045{
5046    const char *starts = s;
5047    Py_ssize_t startinpos;
5048    Py_ssize_t endinpos;
5049    Py_ssize_t outpos;
5050    PyObject *unicode;
5051    const unsigned char *q, *e;
5052    int bo = 0;       /* assume native ordering by default */
5053    int native_ordering;
5054    const char *errmsg = "";
5055    PyObject *errorHandler = NULL;
5056    PyObject *exc = NULL;
5057
5058    q = (unsigned char *)s;
5059    e = q + size;
5060
5061    if (byteorder)
5062        bo = *byteorder;
5063
5064    /* Check for BOM marks (U+FEFF) in the input and adjust current
5065       byte order setting accordingly. In native mode, the leading BOM
5066       mark is skipped, in all other modes, it is copied to the output
5067       stream as-is (giving a ZWNBSP character). */
5068    if (bo == 0 && size >= 2) {
5069        const Py_UCS4 bom = (q[1] << 8) | q[0];
5070        if (bom == 0xFEFF) {
5071            q += 2;
5072            bo = -1;
5073        }
5074        else if (bom == 0xFFFE) {
5075            q += 2;
5076            bo = 1;
5077        }
5078        if (byteorder)
5079            *byteorder = bo;
5080    }
5081
5082    if (q == e) {
5083        if (consumed)
5084            *consumed = size;
5085        Py_INCREF(unicode_empty);
5086        return unicode_empty;
5087    }
5088
5089#if PY_LITTLE_ENDIAN
5090    native_ordering = bo <= 0;
5091#else
5092    native_ordering = bo >= 0;
5093#endif
5094
5095    /* Note: size will always be longer than the resulting Unicode
5096       character count */
5097    unicode = PyUnicode_New((e - q + 1) / 2, 127);
5098    if (!unicode)
5099        return NULL;
5100
5101    outpos = 0;
5102    while (1) {
5103        Py_UCS4 ch = 0;
5104        if (e - q >= 2) {
5105            int kind = PyUnicode_KIND(unicode);
5106            if (kind == PyUnicode_1BYTE_KIND) {
5107                if (PyUnicode_IS_ASCII(unicode))
5108                    ch = asciilib_utf16_decode(&q, e,
5109                            PyUnicode_1BYTE_DATA(unicode), &outpos,
5110                            native_ordering);
5111                else
5112                    ch = ucs1lib_utf16_decode(&q, e,
5113                            PyUnicode_1BYTE_DATA(unicode), &outpos,
5114                            native_ordering);
5115            } else if (kind == PyUnicode_2BYTE_KIND) {
5116                ch = ucs2lib_utf16_decode(&q, e,
5117                        PyUnicode_2BYTE_DATA(unicode), &outpos,
5118                        native_ordering);
5119            } else {
5120                assert(kind == PyUnicode_4BYTE_KIND);
5121                ch = ucs4lib_utf16_decode(&q, e,
5122                        PyUnicode_4BYTE_DATA(unicode), &outpos,
5123                        native_ordering);
5124            }
5125        }
5126
5127        switch (ch)
5128        {
5129        case 0:
5130            /* remaining byte at the end? (size should be even) */
5131            if (q == e || consumed)
5132                goto End;
5133            errmsg = "truncated data";
5134            startinpos = ((const char *)q) - starts;
5135            endinpos = ((const char *)e) - starts;
5136            break;
5137            /* The remaining input chars are ignored if the callback
5138               chooses to skip the input */
5139        case 1:
5140            errmsg = "unexpected end of data";
5141            startinpos = ((const char *)q) - 2 - starts;
5142            endinpos = ((const char *)e) - starts;
5143            break;
5144        case 2:
5145            errmsg = "illegal encoding";
5146            startinpos = ((const char *)q) - 2 - starts;
5147            endinpos = startinpos + 2;
5148            break;
5149        case 3:
5150            errmsg = "illegal UTF-16 surrogate";
5151            startinpos = ((const char *)q) - 4 - starts;
5152            endinpos = startinpos + 2;
5153            break;
5154        default:
5155            if (unicode_putchar(&unicode, &outpos, ch) < 0)
5156                goto onError;
5157            continue;
5158        }
5159
5160        if (unicode_decode_call_errorhandler(
5161                errors,
5162                &errorHandler,
5163                "utf16", errmsg,
5164                &starts,
5165                (const char **)&e,
5166                &startinpos,
5167                &endinpos,
5168                &exc,
5169                (const char **)&q,
5170                &unicode,
5171                &outpos))
5172            goto onError;
5173    }
5174
5175End:
5176    if (consumed)
5177        *consumed = (const char *)q-starts;
5178
5179    /* Adjust length */
5180    if (unicode_resize(&unicode, outpos) < 0)
5181        goto onError;
5182
5183    Py_XDECREF(errorHandler);
5184    Py_XDECREF(exc);
5185    return unicode_result(unicode);
5186
5187  onError:
5188    Py_DECREF(unicode);
5189    Py_XDECREF(errorHandler);
5190    Py_XDECREF(exc);
5191    return NULL;
5192}
5193
5194PyObject *
5195_PyUnicode_EncodeUTF16(PyObject *str,
5196                       const char *errors,
5197                       int byteorder)
5198{
5199    enum PyUnicode_Kind kind;
5200    const void *data;
5201    Py_ssize_t len;
5202    PyObject *v;
5203    unsigned short *out;
5204    Py_ssize_t bytesize;
5205    Py_ssize_t pairs;
5206#if PY_BIG_ENDIAN
5207    int native_ordering = byteorder >= 0;
5208#else
5209    int native_ordering = byteorder <= 0;
5210#endif
5211
5212    if (!PyUnicode_Check(str)) {
5213        PyErr_BadArgument();
5214        return NULL;
5215    }
5216    if (PyUnicode_READY(str) == -1)
5217        return NULL;
5218    kind = PyUnicode_KIND(str);
5219    data = PyUnicode_DATA(str);
5220    len = PyUnicode_GET_LENGTH(str);
5221
5222    pairs = 0;
5223    if (kind == PyUnicode_4BYTE_KIND) {
5224        const Py_UCS4 *in = (const Py_UCS4 *)data;
5225        const Py_UCS4 *end = in + len;
5226        while (in < end)
5227            if (*in++ >= 0x10000)
5228                pairs++;
5229    }
5230    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5231        return PyErr_NoMemory();
5232    bytesize = (len + pairs + (byteorder == 0)) * 2;
5233    v = PyBytes_FromStringAndSize(NULL, bytesize);
5234    if (v == NULL)
5235        return NULL;
5236
5237    /* output buffer is 2-bytes aligned */
5238    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5239    out = (unsigned short *)PyBytes_AS_STRING(v);
5240    if (byteorder == 0)
5241        *out++ = 0xFEFF;
5242    if (len == 0)
5243        goto done;
5244
5245    switch (kind) {
5246    case PyUnicode_1BYTE_KIND: {
5247        ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5248        break;
5249    }
5250    case PyUnicode_2BYTE_KIND: {
5251        ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5252        break;
5253    }
5254    case PyUnicode_4BYTE_KIND: {
5255        ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5256        break;
5257    }
5258    default:
5259        assert(0);
5260    }
5261
5262  done:
5263    return v;
5264}
5265
5266PyObject *
5267PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5268                      Py_ssize_t size,
5269                      const char *errors,
5270                      int byteorder)
5271{
5272    PyObject *result;
5273    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5274    if (tmp == NULL)
5275        return NULL;
5276    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5277    Py_DECREF(tmp);
5278    return result;
5279}
5280
5281PyObject *
5282PyUnicode_AsUTF16String(PyObject *unicode)
5283{
5284    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5285}
5286
5287/* --- Unicode Escape Codec ----------------------------------------------- */
5288
5289/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5290   if all the escapes in the string make it still a valid ASCII string.
5291   Returns -1 if any escapes were found which cause the string to
5292   pop out of ASCII range.  Otherwise returns the length of the
5293   required buffer to hold the string.
5294   */
5295static Py_ssize_t
5296length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5297{
5298    const unsigned char *p = (const unsigned char *)s;
5299    const unsigned char *end = p + size;
5300    Py_ssize_t length = 0;
5301
5302    if (size < 0)
5303        return -1;
5304
5305    for (; p < end; ++p) {
5306        if (*p > 127) {
5307            /* Non-ASCII */
5308            return -1;
5309        }
5310        else if (*p != '\\') {
5311            /* Normal character */
5312            ++length;
5313        }
5314        else {
5315            /* Backslash-escape, check next char */
5316            ++p;
5317            /* Escape sequence reaches till end of string or
5318               non-ASCII follow-up. */
5319            if (p >= end || *p > 127)
5320                return -1;
5321            switch (*p) {
5322            case '\n':
5323                /* backslash + \n result in zero characters */
5324                break;
5325            case '\\': case '\'': case '\"':
5326            case 'b': case 'f': case 't':
5327            case 'n': case 'r': case 'v': case 'a':
5328                ++length;
5329                break;
5330            case '0': case '1': case '2': case '3':
5331            case '4': case '5': case '6': case '7':
5332            case 'x': case 'u': case 'U': case 'N':
5333                /* these do not guarantee ASCII characters */
5334                return -1;
5335            default:
5336                /* count the backslash + the other character */
5337                length += 2;
5338            }
5339        }
5340    }
5341    return length;
5342}
5343
5344static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5345
5346PyObject *
5347PyUnicode_DecodeUnicodeEscape(const char *s,
5348                              Py_ssize_t size,
5349                              const char *errors)
5350{
5351    const char *starts = s;
5352    Py_ssize_t startinpos;
5353    Py_ssize_t endinpos;
5354    int j;
5355    PyObject *v;
5356    const char *end;
5357    char* message;
5358    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5359    PyObject *errorHandler = NULL;
5360    PyObject *exc = NULL;
5361    Py_ssize_t len;
5362    Py_ssize_t i;
5363
5364    len = length_of_escaped_ascii_string(s, size);
5365
5366    /* After length_of_escaped_ascii_string() there are two alternatives,
5367       either the string is pure ASCII with named escapes like \n, etc.
5368       and we determined it's exact size (common case)
5369       or it contains \x, \u, ... escape sequences.  then we create a
5370       legacy wchar string and resize it at the end of this function. */
5371    if (len >= 0) {
5372        v = PyUnicode_New(len, 127);
5373        if (!v)
5374            goto onError;
5375        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5376    }
5377    else {
5378        /* Escaped strings will always be longer than the resulting
5379           Unicode string, so we start with size here and then reduce the
5380           length after conversion to the true value.
5381           (but if the error callback returns a long replacement string
5382           we'll have to allocate more space) */
5383        v = PyUnicode_New(size, 127);
5384        if (!v)
5385            goto onError;
5386        len = size;
5387    }
5388
5389    if (size == 0)
5390        return v;
5391    i = 0;
5392    end = s + size;
5393
5394    while (s < end) {
5395        unsigned char c;
5396        Py_UCS4 x;
5397        int digits;
5398
5399        /* The only case in which i == ascii_length is a backslash
5400           followed by a newline. */
5401        assert(i <= len);
5402
5403        /* Non-escape characters are interpreted as Unicode ordinals */
5404        if (*s != '\\') {
5405            if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5406                goto onError;
5407            continue;
5408        }
5409
5410        startinpos = s-starts;
5411        /* \ - Escapes */
5412        s++;
5413        c = *s++;
5414        if (s > end)
5415            c = '\0'; /* Invalid after \ */
5416
5417        /* The only case in which i == ascii_length is a backslash
5418           followed by a newline. */
5419        assert(i < len || (i == len && c == '\n'));
5420
5421        switch (c) {
5422
5423            /* \x escapes */
5424#define WRITECHAR(ch)                                   \
5425            do {                                        \
5426                if (unicode_putchar(&v, &i, ch) < 0)    \
5427                    goto onError;                       \
5428            }while(0)
5429
5430        case '\n': break;
5431        case '\\': WRITECHAR('\\'); break;
5432        case '\'': WRITECHAR('\''); break;
5433        case '\"': WRITECHAR('\"'); break;
5434        case 'b': WRITECHAR('\b'); break;
5435        /* FF */
5436        case 'f': WRITECHAR('\014'); break;
5437        case 't': WRITECHAR('\t'); break;
5438        case 'n': WRITECHAR('\n'); break;
5439        case 'r': WRITECHAR('\r'); break;
5440        /* VT */
5441        case 'v': WRITECHAR('\013'); break;
5442        /* BEL, not classic C */
5443        case 'a': WRITECHAR('\007'); break;
5444
5445            /* \OOO (octal) escapes */
5446        case '0': case '1': case '2': case '3':
5447        case '4': case '5': case '6': case '7':
5448            x = s[-1] - '0';
5449            if (s < end && '0' <= *s && *s <= '7') {
5450                x = (x<<3) + *s++ - '0';
5451                if (s < end && '0' <= *s && *s <= '7')
5452                    x = (x<<3) + *s++ - '0';
5453            }
5454            WRITECHAR(x);
5455            break;
5456
5457            /* hex escapes */
5458            /* \xXX */
5459        case 'x':
5460            digits = 2;
5461            message = "truncated \\xXX escape";
5462            goto hexescape;
5463
5464            /* \uXXXX */
5465        case 'u':
5466            digits = 4;
5467            message = "truncated \\uXXXX escape";
5468            goto hexescape;
5469
5470            /* \UXXXXXXXX */
5471        case 'U':
5472            digits = 8;
5473            message = "truncated \\UXXXXXXXX escape";
5474        hexescape:
5475            chr = 0;
5476            if (s+digits>end) {
5477                endinpos = size;
5478                if (unicode_decode_call_errorhandler(
5479                        errors, &errorHandler,
5480                        "unicodeescape", "end of string in escape sequence",
5481                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5482                        &v, &i))
5483                    goto onError;
5484                goto nextByte;
5485            }
5486            for (j = 0; j < digits; ++j) {
5487                c = (unsigned char) s[j];
5488                if (!Py_ISXDIGIT(c)) {
5489                    endinpos = (s+j+1)-starts;
5490                    if (unicode_decode_call_errorhandler(
5491                            errors, &errorHandler,
5492                            "unicodeescape", message,
5493                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5494                            &v, &i))
5495                        goto onError;
5496                    len = PyUnicode_GET_LENGTH(v);
5497                    goto nextByte;
5498                }
5499                chr = (chr<<4) & ~0xF;
5500                if (c >= '0' && c <= '9')
5501                    chr += c - '0';
5502                else if (c >= 'a' && c <= 'f')
5503                    chr += 10 + c - 'a';
5504                else
5505                    chr += 10 + c - 'A';
5506            }
5507            s += j;
5508            if (chr == 0xffffffff && PyErr_Occurred())
5509                /* _decoding_error will have already written into the
5510                   target buffer. */
5511                break;
5512        store:
5513            /* when we get here, chr is a 32-bit unicode character */
5514            if (chr <= MAX_UNICODE) {
5515                WRITECHAR(chr);
5516            } else {
5517                endinpos = s-starts;
5518                if (unicode_decode_call_errorhandler(
5519                        errors, &errorHandler,
5520                        "unicodeescape", "illegal Unicode character",
5521                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5522                        &v, &i))
5523                    goto onError;
5524            }
5525            break;
5526
5527            /* \N{name} */
5528        case 'N':
5529            message = "malformed \\N character escape";
5530            if (ucnhash_CAPI == NULL) {
5531                /* load the unicode data module */
5532                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5533                                                PyUnicodeData_CAPSULE_NAME, 1);
5534                if (ucnhash_CAPI == NULL)
5535                    goto ucnhashError;
5536            }
5537            if (*s == '{') {
5538                const char *start = s+1;
5539                /* look for the closing brace */
5540                while (*s != '}' && s < end)
5541                    s++;
5542                if (s > start && s < end && *s == '}') {
5543                    /* found a name.  look it up in the unicode database */
5544                    message = "unknown Unicode character name";
5545                    s++;
5546                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5547                                              &chr, 0))
5548                        goto store;
5549                }
5550            }
5551            endinpos = s-starts;
5552            if (unicode_decode_call_errorhandler(
5553                    errors, &errorHandler,
5554                    "unicodeescape", message,
5555                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5556                    &v, &i))
5557                goto onError;
5558            break;
5559
5560        default:
5561            if (s > end) {
5562                message = "\\ at end of string";
5563                s--;
5564                endinpos = s-starts;
5565                if (unicode_decode_call_errorhandler(
5566                        errors, &errorHandler,
5567                        "unicodeescape", message,
5568                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5569                        &v, &i))
5570                    goto onError;
5571            }
5572            else {
5573                WRITECHAR('\\');
5574                WRITECHAR(s[-1]);
5575            }
5576            break;
5577        }
5578      nextByte:
5579        ;
5580    }
5581#undef WRITECHAR
5582
5583    if (unicode_resize(&v, i) < 0)
5584        goto onError;
5585    Py_XDECREF(errorHandler);
5586    Py_XDECREF(exc);
5587    return unicode_result(v);
5588
5589  ucnhashError:
5590    PyErr_SetString(
5591        PyExc_UnicodeError,
5592        "\\N escapes not supported (can't load unicodedata module)"
5593        );
5594    Py_XDECREF(v);
5595    Py_XDECREF(errorHandler);
5596    Py_XDECREF(exc);
5597    return NULL;
5598
5599  onError:
5600    Py_XDECREF(v);
5601    Py_XDECREF(errorHandler);
5602    Py_XDECREF(exc);
5603    return NULL;
5604}
5605
5606/* Return a Unicode-Escape string version of the Unicode object.
5607
5608   If quotes is true, the string is enclosed in u"" or u'' quotes as
5609   appropriate.
5610
5611*/
5612
5613PyObject *
5614PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5615{
5616    Py_ssize_t i, len;
5617    PyObject *repr;
5618    char *p;
5619    int kind;
5620    void *data;
5621    Py_ssize_t expandsize = 0;
5622
5623    /* Initial allocation is based on the longest-possible character
5624       escape.
5625
5626       For UCS1 strings it's '\xxx', 4 bytes per source character.
5627       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5628       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
5629    */
5630
5631    if (!PyUnicode_Check(unicode)) {
5632        PyErr_BadArgument();
5633        return NULL;
5634    }
5635    if (PyUnicode_READY(unicode) == -1)
5636        return NULL;
5637    len = PyUnicode_GET_LENGTH(unicode);
5638    kind = PyUnicode_KIND(unicode);
5639    data = PyUnicode_DATA(unicode);
5640    switch (kind) {
5641    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5642    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5643    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5644    }
5645
5646    if (len == 0)
5647        return PyBytes_FromStringAndSize(NULL, 0);
5648
5649    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5650        return PyErr_NoMemory();
5651
5652    repr = PyBytes_FromStringAndSize(NULL,
5653                                     2
5654                                     + expandsize*len
5655                                     + 1);
5656    if (repr == NULL)
5657        return NULL;
5658
5659    p = PyBytes_AS_STRING(repr);
5660
5661    for (i = 0; i < len; i++) {
5662        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5663
5664        /* Escape backslashes */
5665        if (ch == '\\') {
5666            *p++ = '\\';
5667            *p++ = (char) ch;
5668            continue;
5669        }
5670
5671        /* Map 21-bit characters to '\U00xxxxxx' */
5672        else if (ch >= 0x10000) {
5673            assert(ch <= MAX_UNICODE);
5674            *p++ = '\\';
5675            *p++ = 'U';
5676            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5677            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5678            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5679            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5680            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5681            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5682            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5683            *p++ = Py_hexdigits[ch & 0x0000000F];
5684            continue;
5685        }
5686
5687        /* Map 16-bit characters to '\uxxxx' */
5688        if (ch >= 256) {
5689            *p++ = '\\';
5690            *p++ = 'u';
5691            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5692            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5693            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5694            *p++ = Py_hexdigits[ch & 0x000F];
5695        }
5696
5697        /* Map special whitespace to '\t', \n', '\r' */
5698        else if (ch == '\t') {
5699            *p++ = '\\';
5700            *p++ = 't';
5701        }
5702        else if (ch == '\n') {
5703            *p++ = '\\';
5704            *p++ = 'n';
5705        }
5706        else if (ch == '\r') {
5707            *p++ = '\\';
5708            *p++ = 'r';
5709        }
5710
5711        /* Map non-printable US ASCII to '\xhh' */
5712        else if (ch < ' ' || ch >= 0x7F) {
5713            *p++ = '\\';
5714            *p++ = 'x';
5715            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5716            *p++ = Py_hexdigits[ch & 0x000F];
5717        }
5718
5719        /* Copy everything else as-is */
5720        else
5721            *p++ = (char) ch;
5722    }
5723
5724    assert(p - PyBytes_AS_STRING(repr) > 0);
5725    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5726        return NULL;
5727    return repr;
5728}
5729
5730PyObject *
5731PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5732                              Py_ssize_t size)
5733{
5734    PyObject *result;
5735    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5736    if (tmp == NULL)
5737        return NULL;
5738    result = PyUnicode_AsUnicodeEscapeString(tmp);
5739    Py_DECREF(tmp);
5740    return result;
5741}
5742
5743/* --- Raw Unicode Escape Codec ------------------------------------------- */
5744
5745PyObject *
5746PyUnicode_DecodeRawUnicodeEscape(const char *s,
5747                                 Py_ssize_t size,
5748                                 const char *errors)
5749{
5750    const char *starts = s;
5751    Py_ssize_t startinpos;
5752    Py_ssize_t endinpos;
5753    Py_ssize_t outpos;
5754    PyObject *v;
5755    const char *end;
5756    const char *bs;
5757    PyObject *errorHandler = NULL;
5758    PyObject *exc = NULL;
5759
5760    /* Escaped strings will always be longer than the resulting
5761       Unicode string, so we start with size here and then reduce the
5762       length after conversion to the true value. (But decoding error
5763       handler might have to resize the string) */
5764    v = PyUnicode_New(size, 127);
5765    if (v == NULL)
5766        goto onError;
5767    if (size == 0)
5768        return v;
5769    outpos = 0;
5770    end = s + size;
5771    while (s < end) {
5772        unsigned char c;
5773        Py_UCS4 x;
5774        int i;
5775        int count;
5776
5777        /* Non-escape characters are interpreted as Unicode ordinals */
5778        if (*s != '\\') {
5779            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5780                goto onError;
5781            continue;
5782        }
5783        startinpos = s-starts;
5784
5785        /* \u-escapes are only interpreted iff the number of leading
5786           backslashes if odd */
5787        bs = s;
5788        for (;s < end;) {
5789            if (*s != '\\')
5790                break;
5791            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5792                goto onError;
5793        }
5794        if (((s - bs) & 1) == 0 ||
5795            s >= end ||
5796            (*s != 'u' && *s != 'U')) {
5797            continue;
5798        }
5799        outpos--;
5800        count = *s=='u' ? 4 : 8;
5801        s++;
5802
5803        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5804        for (x = 0, i = 0; i < count; ++i, ++s) {
5805            c = (unsigned char)*s;
5806            if (!Py_ISXDIGIT(c)) {
5807                endinpos = s-starts;
5808                if (unicode_decode_call_errorhandler(
5809                        errors, &errorHandler,
5810                        "rawunicodeescape", "truncated \\uXXXX",
5811                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5812                        &v, &outpos))
5813                    goto onError;
5814                goto nextByte;
5815            }
5816            x = (x<<4) & ~0xF;
5817            if (c >= '0' && c <= '9')
5818                x += c - '0';
5819            else if (c >= 'a' && c <= 'f')
5820                x += 10 + c - 'a';
5821            else
5822                x += 10 + c - 'A';
5823        }
5824        if (x <= MAX_UNICODE) {
5825            if (unicode_putchar(&v, &outpos, x) < 0)
5826                goto onError;
5827        } else {
5828            endinpos = s-starts;
5829            if (unicode_decode_call_errorhandler(
5830                    errors, &errorHandler,
5831                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5832                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5833                    &v, &outpos))
5834                goto onError;
5835        }
5836      nextByte:
5837        ;
5838    }
5839    if (unicode_resize(&v, outpos) < 0)
5840        goto onError;
5841    Py_XDECREF(errorHandler);
5842    Py_XDECREF(exc);
5843    return unicode_result(v);
5844
5845  onError:
5846    Py_XDECREF(v);
5847    Py_XDECREF(errorHandler);
5848    Py_XDECREF(exc);
5849    return NULL;
5850}
5851
5852
5853PyObject *
5854PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
5855{
5856    PyObject *repr;
5857    char *p;
5858    char *q;
5859    Py_ssize_t expandsize, pos;
5860    int kind;
5861    void *data;
5862    Py_ssize_t len;
5863
5864    if (!PyUnicode_Check(unicode)) {
5865        PyErr_BadArgument();
5866        return NULL;
5867    }
5868    if (PyUnicode_READY(unicode) == -1)
5869        return NULL;
5870    kind = PyUnicode_KIND(unicode);
5871    data = PyUnicode_DATA(unicode);
5872    len = PyUnicode_GET_LENGTH(unicode);
5873    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5874       bytes, and 1 byte characters 4. */
5875    expandsize = kind * 2 + 2;
5876
5877    if (len > PY_SSIZE_T_MAX / expandsize)
5878        return PyErr_NoMemory();
5879
5880    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
5881    if (repr == NULL)
5882        return NULL;
5883    if (len == 0)
5884        return repr;
5885
5886    p = q = PyBytes_AS_STRING(repr);
5887    for (pos = 0; pos < len; pos++) {
5888        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
5889        /* Map 32-bit characters to '\Uxxxxxxxx' */
5890        if (ch >= 0x10000) {
5891            assert(ch <= MAX_UNICODE);
5892            *p++ = '\\';
5893            *p++ = 'U';
5894            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5895            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5896            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5897            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5898            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5899            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5900            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5901            *p++ = Py_hexdigits[ch & 15];
5902        }
5903        /* Map 16-bit characters to '\uxxxx' */
5904        else if (ch >= 256) {
5905            *p++ = '\\';
5906            *p++ = 'u';
5907            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5908            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5909            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5910            *p++ = Py_hexdigits[ch & 15];
5911        }
5912        /* Copy everything else as-is */
5913        else
5914            *p++ = (char) ch;
5915    }
5916
5917    assert(p > q);
5918    if (_PyBytes_Resize(&repr, p - q) < 0)
5919        return NULL;
5920    return repr;
5921}
5922
5923PyObject *
5924PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5925                                 Py_ssize_t size)
5926{
5927    PyObject *result;
5928    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5929    if (tmp == NULL)
5930        return NULL;
5931    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5932    Py_DECREF(tmp);
5933    return result;
5934}
5935
5936/* --- Unicode Internal Codec ------------------------------------------- */
5937
5938PyObject *
5939_PyUnicode_DecodeUnicodeInternal(const char *s,
5940                                 Py_ssize_t size,
5941                                 const char *errors)
5942{
5943    const char *starts = s;
5944    Py_ssize_t startinpos;
5945    Py_ssize_t endinpos;
5946    Py_ssize_t outpos;
5947    PyObject *v;
5948    const char *end;
5949    const char *reason;
5950    PyObject *errorHandler = NULL;
5951    PyObject *exc = NULL;
5952
5953    if (PyErr_WarnEx(PyExc_DeprecationWarning,
5954                     "unicode_internal codec has been deprecated",
5955                     1))
5956        return NULL;
5957
5958    /* XXX overflow detection missing */
5959    v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
5960    if (v == NULL)
5961        goto onError;
5962    if (PyUnicode_GET_LENGTH(v) == 0)
5963        return v;
5964    outpos = 0;
5965    end = s + size;
5966
5967    while (s < end) {
5968        Py_UNICODE uch;
5969        Py_UCS4 ch;
5970        /* We copy the raw representation one byte at a time because the
5971           pointer may be unaligned (see test_codeccallbacks). */
5972        ((char *) &uch)[0] = s[0];
5973        ((char *) &uch)[1] = s[1];
5974#ifdef Py_UNICODE_WIDE
5975        ((char *) &uch)[2] = s[2];
5976        ((char *) &uch)[3] = s[3];
5977#endif
5978        ch = uch;
5979
5980        /* We have to sanity check the raw data, otherwise doom looms for
5981           some malformed UCS-4 data. */
5982        if (
5983#ifdef Py_UNICODE_WIDE
5984            ch > 0x10ffff ||
5985#endif
5986            end-s < Py_UNICODE_SIZE
5987            )
5988        {
5989            startinpos = s - starts;
5990            if (end-s < Py_UNICODE_SIZE) {
5991                endinpos = end-starts;
5992                reason = "truncated input";
5993            }
5994            else {
5995                endinpos = s - starts + Py_UNICODE_SIZE;
5996                reason = "illegal code point (> 0x10FFFF)";
5997            }
5998            if (unicode_decode_call_errorhandler(
5999                    errors, &errorHandler,
6000                    "unicode_internal", reason,
6001                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6002                    &v, &outpos))
6003                goto onError;
6004            continue;
6005        }
6006
6007        s += Py_UNICODE_SIZE;
6008#ifndef Py_UNICODE_WIDE
6009        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
6010        {
6011            Py_UNICODE uch2;
6012            ((char *) &uch2)[0] = s[0];
6013            ((char *) &uch2)[1] = s[1];
6014            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6015            {
6016                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6017                s += Py_UNICODE_SIZE;
6018            }
6019        }
6020#endif
6021
6022        if (unicode_putchar(&v, &outpos, ch) < 0)
6023            goto onError;
6024    }
6025
6026    if (unicode_resize(&v, outpos) < 0)
6027        goto onError;
6028    Py_XDECREF(errorHandler);
6029    Py_XDECREF(exc);
6030    return unicode_result(v);
6031
6032  onError:
6033    Py_XDECREF(v);
6034    Py_XDECREF(errorHandler);
6035    Py_XDECREF(exc);
6036    return NULL;
6037}
6038
6039/* --- Latin-1 Codec ------------------------------------------------------ */
6040
6041PyObject *
6042PyUnicode_DecodeLatin1(const char *s,
6043                       Py_ssize_t size,
6044                       const char *errors)
6045{
6046    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6047    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6048}
6049
6050/* create or adjust a UnicodeEncodeError */
6051static void
6052make_encode_exception(PyObject **exceptionObject,
6053                      const char *encoding,
6054                      PyObject *unicode,
6055                      Py_ssize_t startpos, Py_ssize_t endpos,
6056                      const char *reason)
6057{
6058    if (*exceptionObject == NULL) {
6059        *exceptionObject = PyObject_CallFunction(
6060            PyExc_UnicodeEncodeError, "sOnns",
6061            encoding, unicode, startpos, endpos, reason);
6062    }
6063    else {
6064        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6065            goto onError;
6066        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6067            goto onError;
6068        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6069            goto onError;
6070        return;
6071      onError:
6072        Py_DECREF(*exceptionObject);
6073        *exceptionObject = NULL;
6074    }
6075}
6076
6077/* raises a UnicodeEncodeError */
6078static void
6079raise_encode_exception(PyObject **exceptionObject,
6080                       const char *encoding,
6081                       PyObject *unicode,
6082                       Py_ssize_t startpos, Py_ssize_t endpos,
6083                       const char *reason)
6084{
6085    make_encode_exception(exceptionObject,
6086                          encoding, unicode, startpos, endpos, reason);
6087    if (*exceptionObject != NULL)
6088        PyCodec_StrictErrors(*exceptionObject);
6089}
6090
6091/* error handling callback helper:
6092   build arguments, call the callback and check the arguments,
6093   put the result into newpos and return the replacement string, which
6094   has to be freed by the caller */
6095static PyObject *
6096unicode_encode_call_errorhandler(const char *errors,
6097                                 PyObject **errorHandler,
6098                                 const char *encoding, const char *reason,
6099                                 PyObject *unicode, PyObject **exceptionObject,
6100                                 Py_ssize_t startpos, Py_ssize_t endpos,
6101                                 Py_ssize_t *newpos)
6102{
6103    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6104    Py_ssize_t len;
6105    PyObject *restuple;
6106    PyObject *resunicode;
6107
6108    if (*errorHandler == NULL) {
6109        *errorHandler = PyCodec_LookupError(errors);
6110        if (*errorHandler == NULL)
6111            return NULL;
6112    }
6113
6114    if (PyUnicode_READY(unicode) == -1)
6115        return NULL;
6116    len = PyUnicode_GET_LENGTH(unicode);
6117
6118    make_encode_exception(exceptionObject,
6119                          encoding, unicode, startpos, endpos, reason);
6120    if (*exceptionObject == NULL)
6121        return NULL;
6122
6123    restuple = PyObject_CallFunctionObjArgs(
6124        *errorHandler, *exceptionObject, NULL);
6125    if (restuple == NULL)
6126        return NULL;
6127    if (!PyTuple_Check(restuple)) {
6128        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6129        Py_DECREF(restuple);
6130        return NULL;
6131    }
6132    if (!PyArg_ParseTuple(restuple, argparse,
6133                          &resunicode, newpos)) {
6134        Py_DECREF(restuple);
6135        return NULL;
6136    }
6137    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6138        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6139        Py_DECREF(restuple);
6140        return NULL;
6141    }
6142    if (*newpos<0)
6143        *newpos = len + *newpos;
6144    if (*newpos<0 || *newpos>len) {
6145        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6146        Py_DECREF(restuple);
6147        return NULL;
6148    }
6149    Py_INCREF(resunicode);
6150    Py_DECREF(restuple);
6151    return resunicode;
6152}
6153
6154static PyObject *
6155unicode_encode_ucs1(PyObject *unicode,
6156                    const char *errors,
6157                    unsigned int limit)
6158{
6159    /* input state */
6160    Py_ssize_t pos=0, size;
6161    int kind;
6162    void *data;
6163    /* output object */
6164    PyObject *res;
6165    /* pointer into the output */
6166    char *str;
6167    /* current output position */
6168    Py_ssize_t ressize;
6169    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6170    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6171    PyObject *errorHandler = NULL;
6172    PyObject *exc = NULL;
6173    /* the following variable is used for caching string comparisons
6174     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6175    int known_errorHandler = -1;
6176
6177    if (PyUnicode_READY(unicode) == -1)
6178        return NULL;
6179    size = PyUnicode_GET_LENGTH(unicode);
6180    kind = PyUnicode_KIND(unicode);
6181    data = PyUnicode_DATA(unicode);
6182    /* allocate enough for a simple encoding without
6183       replacements, if we need more, we'll resize */
6184    if (size == 0)
6185        return PyBytes_FromStringAndSize(NULL, 0);
6186    res = PyBytes_FromStringAndSize(NULL, size);
6187    if (res == NULL)
6188        return NULL;
6189    str = PyBytes_AS_STRING(res);
6190    ressize = size;
6191
6192    while (pos < size) {
6193        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6194
6195        /* can we encode this? */
6196        if (c<limit) {
6197            /* no overflow check, because we know that the space is enough */
6198            *str++ = (char)c;
6199            ++pos;
6200        }
6201        else {
6202            Py_ssize_t requiredsize;
6203            PyObject *repunicode;
6204            Py_ssize_t repsize, newpos, respos, i;
6205            /* startpos for collecting unencodable chars */
6206            Py_ssize_t collstart = pos;
6207            Py_ssize_t collend = pos;
6208            /* find all unecodable characters */
6209            while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6210                ++collend;
6211            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6212            if (known_errorHandler==-1) {
6213                if ((errors==NULL) || (!strcmp(errors, "strict")))
6214                    known_errorHandler = 1;
6215                else if (!strcmp(errors, "replace"))
6216                    known_errorHandler = 2;
6217                else if (!strcmp(errors, "ignore"))
6218                    known_errorHandler = 3;
6219                else if (!strcmp(errors, "xmlcharrefreplace"))
6220                    known_errorHandler = 4;
6221                else
6222                    known_errorHandler = 0;
6223            }
6224            switch (known_errorHandler) {
6225            case 1: /* strict */
6226                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6227                goto onError;
6228            case 2: /* replace */
6229                while (collstart++<collend)
6230                    *str++ = '?'; /* fall through */
6231            case 3: /* ignore */
6232                pos = collend;
6233                break;
6234            case 4: /* xmlcharrefreplace */
6235                respos = str - PyBytes_AS_STRING(res);
6236                /* determine replacement size */
6237                for (i = collstart, repsize = 0; i < collend; ++i) {
6238                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6239                    if (ch < 10)
6240                        repsize += 2+1+1;
6241                    else if (ch < 100)
6242                        repsize += 2+2+1;
6243                    else if (ch < 1000)
6244                        repsize += 2+3+1;
6245                    else if (ch < 10000)
6246                        repsize += 2+4+1;
6247                    else if (ch < 100000)
6248                        repsize += 2+5+1;
6249                    else if (ch < 1000000)
6250                        repsize += 2+6+1;
6251                    else {
6252                        assert(ch <= MAX_UNICODE);
6253                        repsize += 2+7+1;
6254                    }
6255                }
6256                requiredsize = respos+repsize+(size-collend);
6257                if (requiredsize > ressize) {
6258                    if (requiredsize<2*ressize)
6259                        requiredsize = 2*ressize;
6260                    if (_PyBytes_Resize(&res, requiredsize))
6261                        goto onError;
6262                    str = PyBytes_AS_STRING(res) + respos;
6263                    ressize = requiredsize;
6264                }
6265                /* generate replacement */
6266                for (i = collstart; i < collend; ++i) {
6267                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6268                }
6269                pos = collend;
6270                break;
6271            default:
6272                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6273                                                              encoding, reason, unicode, &exc,
6274                                                              collstart, collend, &newpos);
6275                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6276                                           PyUnicode_READY(repunicode) == -1))
6277                    goto onError;
6278                if (PyBytes_Check(repunicode)) {
6279                    /* Directly copy bytes result to output. */
6280                    repsize = PyBytes_Size(repunicode);
6281                    if (repsize > 1) {
6282                        /* Make room for all additional bytes. */
6283                        respos = str - PyBytes_AS_STRING(res);
6284                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6285                            Py_DECREF(repunicode);
6286                            goto onError;
6287                        }
6288                        str = PyBytes_AS_STRING(res) + respos;
6289                        ressize += repsize-1;
6290                    }
6291                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6292                    str += repsize;
6293                    pos = newpos;
6294                    Py_DECREF(repunicode);
6295                    break;
6296                }
6297                /* need more space? (at least enough for what we
6298                   have+the replacement+the rest of the string, so
6299                   we won't have to check space for encodable characters) */
6300                respos = str - PyBytes_AS_STRING(res);
6301                repsize = PyUnicode_GET_LENGTH(repunicode);
6302                requiredsize = respos+repsize+(size-collend);
6303                if (requiredsize > ressize) {
6304                    if (requiredsize<2*ressize)
6305                        requiredsize = 2*ressize;
6306                    if (_PyBytes_Resize(&res, requiredsize)) {
6307                        Py_DECREF(repunicode);
6308                        goto onError;
6309                    }
6310                    str = PyBytes_AS_STRING(res) + respos;
6311                    ressize = requiredsize;
6312                }
6313                /* check if there is anything unencodable in the replacement
6314                   and copy it to the output */
6315                for (i = 0; repsize-->0; ++i, ++str) {
6316                    c = PyUnicode_READ_CHAR(repunicode, i);
6317                    if (c >= limit) {
6318                        raise_encode_exception(&exc, encoding, unicode,
6319                                               pos, pos+1, reason);
6320                        Py_DECREF(repunicode);
6321                        goto onError;
6322                    }
6323                    *str = (char)c;
6324                }
6325                pos = newpos;
6326                Py_DECREF(repunicode);
6327            }
6328        }
6329    }
6330    /* Resize if we allocated to much */
6331    size = str - PyBytes_AS_STRING(res);
6332    if (size < ressize) { /* If this falls res will be NULL */
6333        assert(size >= 0);
6334        if (_PyBytes_Resize(&res, size) < 0)
6335            goto onError;
6336    }
6337
6338    Py_XDECREF(errorHandler);
6339    Py_XDECREF(exc);
6340    return res;
6341
6342  onError:
6343    Py_XDECREF(res);
6344    Py_XDECREF(errorHandler);
6345    Py_XDECREF(exc);
6346    return NULL;
6347}
6348
6349/* Deprecated */
6350PyObject *
6351PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6352                       Py_ssize_t size,
6353                       const char *errors)
6354{
6355    PyObject *result;
6356    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6357    if (unicode == NULL)
6358        return NULL;
6359    result = unicode_encode_ucs1(unicode, errors, 256);
6360    Py_DECREF(unicode);
6361    return result;
6362}
6363
6364PyObject *
6365_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6366{
6367    if (!PyUnicode_Check(unicode)) {
6368        PyErr_BadArgument();
6369        return NULL;
6370    }
6371    if (PyUnicode_READY(unicode) == -1)
6372        return NULL;
6373    /* Fast path: if it is a one-byte string, construct
6374       bytes object directly. */
6375    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6376        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6377                                         PyUnicode_GET_LENGTH(unicode));
6378    /* Non-Latin-1 characters present. Defer to above function to
6379       raise the exception. */
6380    return unicode_encode_ucs1(unicode, errors, 256);
6381}
6382
6383PyObject*
6384PyUnicode_AsLatin1String(PyObject *unicode)
6385{
6386    return _PyUnicode_AsLatin1String(unicode, NULL);
6387}
6388
6389/* --- 7-bit ASCII Codec -------------------------------------------------- */
6390
6391PyObject *
6392PyUnicode_DecodeASCII(const char *s,
6393                      Py_ssize_t size,
6394                      const char *errors)
6395{
6396    const char *starts = s;
6397    PyObject *unicode;
6398    int kind;
6399    void *data;
6400    Py_ssize_t startinpos;
6401    Py_ssize_t endinpos;
6402    Py_ssize_t outpos;
6403    const char *e;
6404    PyObject *errorHandler = NULL;
6405    PyObject *exc = NULL;
6406
6407    if (size == 0) {
6408        Py_INCREF(unicode_empty);
6409        return unicode_empty;
6410    }
6411
6412    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6413    if (size == 1 && (unsigned char)s[0] < 128)
6414        return get_latin1_char((unsigned char)s[0]);
6415
6416    unicode = PyUnicode_New(size, 127);
6417    if (unicode == NULL)
6418        goto onError;
6419
6420    e = s + size;
6421    data = PyUnicode_1BYTE_DATA(unicode);
6422    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6423    if (outpos == size)
6424        return unicode;
6425
6426    s += outpos;
6427    kind = PyUnicode_1BYTE_KIND;
6428    while (s < e) {
6429        register unsigned char c = (unsigned char)*s;
6430        if (c < 128) {
6431            PyUnicode_WRITE(kind, data, outpos++, c);
6432            ++s;
6433        }
6434        else {
6435            startinpos = s-starts;
6436            endinpos = startinpos + 1;
6437            if (unicode_decode_call_errorhandler(
6438                    errors, &errorHandler,
6439                    "ascii", "ordinal not in range(128)",
6440                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6441                    &unicode, &outpos))
6442                goto onError;
6443            kind = PyUnicode_KIND(unicode);
6444            data = PyUnicode_DATA(unicode);
6445        }
6446    }
6447    if (unicode_resize(&unicode, outpos) < 0)
6448        goto onError;
6449    Py_XDECREF(errorHandler);
6450    Py_XDECREF(exc);
6451    assert(_PyUnicode_CheckConsistency(unicode, 1));
6452    return unicode;
6453
6454  onError:
6455    Py_XDECREF(unicode);
6456    Py_XDECREF(errorHandler);
6457    Py_XDECREF(exc);
6458    return NULL;
6459}
6460
6461/* Deprecated */
6462PyObject *
6463PyUnicode_EncodeASCII(const Py_UNICODE *p,
6464                      Py_ssize_t size,
6465                      const char *errors)
6466{
6467    PyObject *result;
6468    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6469    if (unicode == NULL)
6470        return NULL;
6471    result = unicode_encode_ucs1(unicode, errors, 128);
6472    Py_DECREF(unicode);
6473    return result;
6474}
6475
6476PyObject *
6477_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6478{
6479    if (!PyUnicode_Check(unicode)) {
6480        PyErr_BadArgument();
6481        return NULL;
6482    }
6483    if (PyUnicode_READY(unicode) == -1)
6484        return NULL;
6485    /* Fast path: if it is an ASCII-only string, construct bytes object
6486       directly. Else defer to above function to raise the exception. */
6487    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6488        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6489                                         PyUnicode_GET_LENGTH(unicode));
6490    return unicode_encode_ucs1(unicode, errors, 128);
6491}
6492
6493PyObject *
6494PyUnicode_AsASCIIString(PyObject *unicode)
6495{
6496    return _PyUnicode_AsASCIIString(unicode, NULL);
6497}
6498
6499#ifdef HAVE_MBCS
6500
6501/* --- MBCS codecs for Windows -------------------------------------------- */
6502
6503#if SIZEOF_INT < SIZEOF_SIZE_T
6504#define NEED_RETRY
6505#endif
6506
6507#ifndef WC_ERR_INVALID_CHARS
6508#  define WC_ERR_INVALID_CHARS 0x0080
6509#endif
6510
6511static char*
6512code_page_name(UINT code_page, PyObject **obj)
6513{
6514    *obj = NULL;
6515    if (code_page == CP_ACP)
6516        return "mbcs";
6517    if (code_page == CP_UTF7)
6518        return "CP_UTF7";
6519    if (code_page == CP_UTF8)
6520        return "CP_UTF8";
6521
6522    *obj = PyBytes_FromFormat("cp%u", code_page);
6523    if (*obj == NULL)
6524        return NULL;
6525    return PyBytes_AS_STRING(*obj);
6526}
6527
6528static int
6529is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6530{
6531    const char *curr = s + offset;
6532    const char *prev;
6533
6534    if (!IsDBCSLeadByteEx(code_page, *curr))
6535        return 0;
6536
6537    prev = CharPrevExA(code_page, s, curr, 0);
6538    if (prev == curr)
6539        return 1;
6540    /* FIXME: This code is limited to "true" double-byte encodings,
6541       as it assumes an incomplete character consists of a single
6542       byte. */
6543    if (curr - prev == 2)
6544        return 1;
6545    if (!IsDBCSLeadByteEx(code_page, *prev))
6546        return 1;
6547    return 0;
6548}
6549
6550static DWORD
6551decode_code_page_flags(UINT code_page)
6552{
6553    if (code_page == CP_UTF7) {
6554        /* The CP_UTF7 decoder only supports flags=0 */
6555        return 0;
6556    }
6557    else
6558        return MB_ERR_INVALID_CHARS;
6559}
6560
6561/*
6562 * Decode a byte string from a Windows code page into unicode object in strict
6563 * mode.
6564 *
6565 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6566 * WindowsError and returns -1 on other error.
6567 */
6568static int
6569decode_code_page_strict(UINT code_page,
6570                        PyObject **v,
6571                        const char *in,
6572                        int insize)
6573{
6574    const DWORD flags = decode_code_page_flags(code_page);
6575    wchar_t *out;
6576    DWORD outsize;
6577
6578    /* First get the size of the result */
6579    assert(insize > 0);
6580    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6581    if (outsize <= 0)
6582        goto error;
6583
6584    if (*v == NULL) {
6585        /* Create unicode object */
6586        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6587        *v = (PyObject*)_PyUnicode_New(outsize);
6588        if (*v == NULL)
6589            return -1;
6590        out = PyUnicode_AS_UNICODE(*v);
6591    }
6592    else {
6593        /* Extend unicode object */
6594        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6595        if (unicode_resize(v, n + outsize) < 0)
6596            return -1;
6597        out = PyUnicode_AS_UNICODE(*v) + n;
6598    }
6599
6600    /* Do the conversion */
6601    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6602    if (outsize <= 0)
6603        goto error;
6604    return insize;
6605
6606error:
6607    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6608        return -2;
6609    PyErr_SetFromWindowsErr(0);
6610    return -1;
6611}
6612
6613/*
6614 * Decode a byte string from a code page into unicode object with an error
6615 * handler.
6616 *
6617 * Returns consumed size if succeed, or raise a WindowsError or
6618 * UnicodeDecodeError exception and returns -1 on error.
6619 */
6620static int
6621decode_code_page_errors(UINT code_page,
6622                        PyObject **v,
6623                        const char *in, const int size,
6624                        const char *errors)
6625{
6626    const char *startin = in;
6627    const char *endin = in + size;
6628    const DWORD flags = decode_code_page_flags(code_page);
6629    /* Ideally, we should get reason from FormatMessage. This is the Windows
6630       2000 English version of the message. */
6631    const char *reason = "No mapping for the Unicode character exists "
6632                         "in the target code page.";
6633    /* each step cannot decode more than 1 character, but a character can be
6634       represented as a surrogate pair */
6635    wchar_t buffer[2], *startout, *out;
6636    int insize, outsize;
6637    PyObject *errorHandler = NULL;
6638    PyObject *exc = NULL;
6639    PyObject *encoding_obj = NULL;
6640    char *encoding;
6641    DWORD err;
6642    int ret = -1;
6643
6644    assert(size > 0);
6645
6646    encoding = code_page_name(code_page, &encoding_obj);
6647    if (encoding == NULL)
6648        return -1;
6649
6650    if (errors == NULL || strcmp(errors, "strict") == 0) {
6651        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6652           UnicodeDecodeError. */
6653        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6654        if (exc != NULL) {
6655            PyCodec_StrictErrors(exc);
6656            Py_CLEAR(exc);
6657        }
6658        goto error;
6659    }
6660
6661    if (*v == NULL) {
6662        /* Create unicode object */
6663        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6664            PyErr_NoMemory();
6665            goto error;
6666        }
6667        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6668        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6669        if (*v == NULL)
6670            goto error;
6671        startout = PyUnicode_AS_UNICODE(*v);
6672    }
6673    else {
6674        /* Extend unicode object */
6675        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6676        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6677            PyErr_NoMemory();
6678            goto error;
6679        }
6680        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6681            goto error;
6682        startout = PyUnicode_AS_UNICODE(*v) + n;
6683    }
6684
6685    /* Decode the byte string character per character */
6686    out = startout;
6687    while (in < endin)
6688    {
6689        /* Decode a character */
6690        insize = 1;
6691        do
6692        {
6693            outsize = MultiByteToWideChar(code_page, flags,
6694                                          in, insize,
6695                                          buffer, Py_ARRAY_LENGTH(buffer));
6696            if (outsize > 0)
6697                break;
6698            err = GetLastError();
6699            if (err != ERROR_NO_UNICODE_TRANSLATION
6700                && err != ERROR_INSUFFICIENT_BUFFER)
6701            {
6702                PyErr_SetFromWindowsErr(0);
6703                goto error;
6704            }
6705            insize++;
6706        }
6707        /* 4=maximum length of a UTF-8 sequence */
6708        while (insize <= 4 && (in + insize) <= endin);
6709
6710        if (outsize <= 0) {
6711            Py_ssize_t startinpos, endinpos, outpos;
6712
6713            startinpos = in - startin;
6714            endinpos = startinpos + 1;
6715            outpos = out - PyUnicode_AS_UNICODE(*v);
6716            if (unicode_decode_call_errorhandler(
6717                    errors, &errorHandler,
6718                    encoding, reason,
6719                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
6720                    v, &outpos))
6721            {
6722                goto error;
6723            }
6724            out = PyUnicode_AS_UNICODE(*v) + outpos;
6725        }
6726        else {
6727            in += insize;
6728            memcpy(out, buffer, outsize * sizeof(wchar_t));
6729            out += outsize;
6730        }
6731    }
6732
6733    /* write a NUL character at the end */
6734    *out = 0;
6735
6736    /* Extend unicode object */
6737    outsize = out - startout;
6738    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
6739    if (unicode_resize(v, outsize) < 0)
6740        goto error;
6741    ret = size;
6742
6743error:
6744    Py_XDECREF(encoding_obj);
6745    Py_XDECREF(errorHandler);
6746    Py_XDECREF(exc);
6747    return ret;
6748}
6749
6750static PyObject *
6751decode_code_page_stateful(int code_page,
6752                          const char *s, Py_ssize_t size,
6753                          const char *errors, Py_ssize_t *consumed)
6754{
6755    PyObject *v = NULL;
6756    int chunk_size, final, converted, done;
6757
6758    if (code_page < 0) {
6759        PyErr_SetString(PyExc_ValueError, "invalid code page number");
6760        return NULL;
6761    }
6762
6763    if (consumed)
6764        *consumed = 0;
6765
6766    do
6767    {
6768#ifdef NEED_RETRY
6769        if (size > INT_MAX) {
6770            chunk_size = INT_MAX;
6771            final = 0;
6772            done = 0;
6773        }
6774        else
6775#endif
6776        {
6777            chunk_size = (int)size;
6778            final = (consumed == NULL);
6779            done = 1;
6780        }
6781
6782        /* Skip trailing lead-byte unless 'final' is set */
6783        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6784            --chunk_size;
6785
6786        if (chunk_size == 0 && done) {
6787            if (v != NULL)
6788                break;
6789            Py_INCREF(unicode_empty);
6790            return unicode_empty;
6791        }
6792
6793
6794        converted = decode_code_page_strict(code_page, &v,
6795                                            s, chunk_size);
6796        if (converted == -2)
6797            converted = decode_code_page_errors(code_page, &v,
6798                                                s, chunk_size,
6799                                                errors);
6800        assert(converted != 0);
6801
6802        if (converted < 0) {
6803            Py_XDECREF(v);
6804            return NULL;
6805        }
6806
6807        if (consumed)
6808            *consumed += converted;
6809
6810        s += converted;
6811        size -= converted;
6812    } while (!done);
6813
6814    return unicode_result(v);
6815}
6816
6817PyObject *
6818PyUnicode_DecodeCodePageStateful(int code_page,
6819                                 const char *s,
6820                                 Py_ssize_t size,
6821                                 const char *errors,
6822                                 Py_ssize_t *consumed)
6823{
6824    return decode_code_page_stateful(code_page, s, size, errors, consumed);
6825}
6826
6827PyObject *
6828PyUnicode_DecodeMBCSStateful(const char *s,
6829                             Py_ssize_t size,
6830                             const char *errors,
6831                             Py_ssize_t *consumed)
6832{
6833    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6834}
6835
6836PyObject *
6837PyUnicode_DecodeMBCS(const char *s,
6838                     Py_ssize_t size,
6839                     const char *errors)
6840{
6841    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6842}
6843
6844static DWORD
6845encode_code_page_flags(UINT code_page, const char *errors)
6846{
6847    if (code_page == CP_UTF8) {
6848        if (winver.dwMajorVersion >= 6)
6849            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6850               and later */
6851            return WC_ERR_INVALID_CHARS;
6852        else
6853            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6854            return 0;
6855    }
6856    else if (code_page == CP_UTF7) {
6857        /* CP_UTF7 only supports flags=0 */
6858        return 0;
6859    }
6860    else {
6861        if (errors != NULL && strcmp(errors, "replace") == 0)
6862            return 0;
6863        else
6864            return WC_NO_BEST_FIT_CHARS;
6865    }
6866}
6867
6868/*
6869 * Encode a Unicode string to a Windows code page into a byte string in strict
6870 * mode.
6871 *
6872 * Returns consumed characters if succeed, returns -2 on encode error, or raise
6873 * a WindowsError and returns -1 on other error.
6874 */
6875static int
6876encode_code_page_strict(UINT code_page, PyObject **outbytes,
6877                        PyObject *unicode, Py_ssize_t offset, int len,
6878                        const char* errors)
6879{
6880    BOOL usedDefaultChar = FALSE;
6881    BOOL *pusedDefaultChar = &usedDefaultChar;
6882    int outsize;
6883    PyObject *exc = NULL;
6884    wchar_t *p;
6885    Py_ssize_t size;
6886    const DWORD flags = encode_code_page_flags(code_page, NULL);
6887    char *out;
6888    /* Create a substring so that we can get the UTF-16 representation
6889       of just the slice under consideration. */
6890    PyObject *substring;
6891
6892    assert(len > 0);
6893
6894    if (code_page != CP_UTF8 && code_page != CP_UTF7)
6895        pusedDefaultChar = &usedDefaultChar;
6896    else
6897        pusedDefaultChar = NULL;
6898
6899    substring = PyUnicode_Substring(unicode, offset, offset+len);
6900    if (substring == NULL)
6901        return -1;
6902    p = PyUnicode_AsUnicodeAndSize(substring, &size);
6903    if (p == NULL) {
6904        Py_DECREF(substring);
6905        return -1;
6906    }
6907
6908    /* First get the size of the result */
6909    outsize = WideCharToMultiByte(code_page, flags,
6910                                  p, size,
6911                                  NULL, 0,
6912                                  NULL, pusedDefaultChar);
6913    if (outsize <= 0)
6914        goto error;
6915    /* If we used a default char, then we failed! */
6916    if (pusedDefaultChar && *pusedDefaultChar) {
6917        Py_DECREF(substring);
6918        return -2;
6919    }
6920
6921    if (*outbytes == NULL) {
6922        /* Create string object */
6923        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
6924        if (*outbytes == NULL) {
6925            Py_DECREF(substring);
6926            return -1;
6927        }
6928        out = PyBytes_AS_STRING(*outbytes);
6929    }
6930    else {
6931        /* Extend string object */
6932        const Py_ssize_t n = PyBytes_Size(*outbytes);
6933        if (outsize > PY_SSIZE_T_MAX - n) {
6934            PyErr_NoMemory();
6935            Py_DECREF(substring);
6936            return -1;
6937        }
6938        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6939            Py_DECREF(substring);
6940            return -1;
6941        }
6942        out = PyBytes_AS_STRING(*outbytes) + n;
6943    }
6944
6945    /* Do the conversion */
6946    outsize = WideCharToMultiByte(code_page, flags,
6947                                  p, size,
6948                                  out, outsize,
6949                                  NULL, pusedDefaultChar);
6950    Py_CLEAR(substring);
6951    if (outsize <= 0)
6952        goto error;
6953    if (pusedDefaultChar && *pusedDefaultChar)
6954        return -2;
6955    return 0;
6956
6957error:
6958    Py_XDECREF(substring);
6959    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6960        return -2;
6961    PyErr_SetFromWindowsErr(0);
6962    return -1;
6963}
6964
6965/*
6966 * Encode a Unicode string to a Windows code page into a byte string using a
6967 * error handler.
6968 *
6969 * Returns consumed characters if succeed, or raise a WindowsError and returns
6970 * -1 on other error.
6971 */
6972static int
6973encode_code_page_errors(UINT code_page, PyObject **outbytes,
6974                        PyObject *unicode, Py_ssize_t unicode_offset,
6975                        Py_ssize_t insize, const char* errors)
6976{
6977    const DWORD flags = encode_code_page_flags(code_page, errors);
6978    Py_ssize_t pos = unicode_offset;
6979    Py_ssize_t endin = unicode_offset + insize;
6980    /* Ideally, we should get reason from FormatMessage. This is the Windows
6981       2000 English version of the message. */
6982    const char *reason = "invalid character";
6983    /* 4=maximum length of a UTF-8 sequence */
6984    char buffer[4];
6985    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
6986    Py_ssize_t outsize;
6987    char *out;
6988    PyObject *errorHandler = NULL;
6989    PyObject *exc = NULL;
6990    PyObject *encoding_obj = NULL;
6991    char *encoding;
6992    Py_ssize_t newpos, newoutsize;
6993    PyObject *rep;
6994    int ret = -1;
6995
6996    assert(insize > 0);
6997
6998    encoding = code_page_name(code_page, &encoding_obj);
6999    if (encoding == NULL)
7000        return -1;
7001
7002    if (errors == NULL || strcmp(errors, "strict") == 0) {
7003        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7004           then we raise a UnicodeEncodeError. */
7005        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7006        if (exc != NULL) {
7007            PyCodec_StrictErrors(exc);
7008            Py_DECREF(exc);
7009        }
7010        Py_XDECREF(encoding_obj);
7011        return -1;
7012    }
7013
7014    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7015        pusedDefaultChar = &usedDefaultChar;
7016    else
7017        pusedDefaultChar = NULL;
7018
7019    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7020        PyErr_NoMemory();
7021        goto error;
7022    }
7023    outsize = insize * Py_ARRAY_LENGTH(buffer);
7024
7025    if (*outbytes == NULL) {
7026        /* Create string object */
7027        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7028        if (*outbytes == NULL)
7029            goto error;
7030        out = PyBytes_AS_STRING(*outbytes);
7031    }
7032    else {
7033        /* Extend string object */
7034        Py_ssize_t n = PyBytes_Size(*outbytes);
7035        if (n > PY_SSIZE_T_MAX - outsize) {
7036            PyErr_NoMemory();
7037            goto error;
7038        }
7039        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7040            goto error;
7041        out = PyBytes_AS_STRING(*outbytes) + n;
7042    }
7043
7044    /* Encode the string character per character */
7045    while (pos < endin)
7046    {
7047        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7048        wchar_t chars[2];
7049        int charsize;
7050        if (ch < 0x10000) {
7051            chars[0] = (wchar_t)ch;
7052            charsize = 1;
7053        }
7054        else {
7055            ch -= 0x10000;
7056            chars[0] = 0xd800 + (ch >> 10);
7057            chars[1] = 0xdc00 + (ch & 0x3ff);
7058            charsize = 2;
7059        }
7060
7061        outsize = WideCharToMultiByte(code_page, flags,
7062                                      chars, charsize,
7063                                      buffer, Py_ARRAY_LENGTH(buffer),
7064                                      NULL, pusedDefaultChar);
7065        if (outsize > 0) {
7066            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7067            {
7068                pos++;
7069                memcpy(out, buffer, outsize);
7070                out += outsize;
7071                continue;
7072            }
7073        }
7074        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7075            PyErr_SetFromWindowsErr(0);
7076            goto error;
7077        }
7078
7079        rep = unicode_encode_call_errorhandler(
7080                  errors, &errorHandler, encoding, reason,
7081                  unicode, &exc,
7082                  pos, pos + 1, &newpos);
7083        if (rep == NULL)
7084            goto error;
7085        pos = newpos;
7086
7087        if (PyBytes_Check(rep)) {
7088            outsize = PyBytes_GET_SIZE(rep);
7089            if (outsize != 1) {
7090                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7091                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7092                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7093                    Py_DECREF(rep);
7094                    goto error;
7095                }
7096                out = PyBytes_AS_STRING(*outbytes) + offset;
7097            }
7098            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7099            out += outsize;
7100        }
7101        else {
7102            Py_ssize_t i;
7103            enum PyUnicode_Kind kind;
7104            void *data;
7105
7106            if (PyUnicode_READY(rep) == -1) {
7107                Py_DECREF(rep);
7108                goto error;
7109            }
7110
7111            outsize = PyUnicode_GET_LENGTH(rep);
7112            if (outsize != 1) {
7113                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7114                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7115                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7116                    Py_DECREF(rep);
7117                    goto error;
7118                }
7119                out = PyBytes_AS_STRING(*outbytes) + offset;
7120            }
7121            kind = PyUnicode_KIND(rep);
7122            data = PyUnicode_DATA(rep);
7123            for (i=0; i < outsize; i++) {
7124                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7125                if (ch > 127) {
7126                    raise_encode_exception(&exc,
7127                        encoding, unicode,
7128                        pos, pos + 1,
7129                        "unable to encode error handler result to ASCII");
7130                    Py_DECREF(rep);
7131                    goto error;
7132                }
7133                *out = (unsigned char)ch;
7134                out++;
7135            }
7136        }
7137        Py_DECREF(rep);
7138    }
7139    /* write a NUL byte */
7140    *out = 0;
7141    outsize = out - PyBytes_AS_STRING(*outbytes);
7142    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7143    if (_PyBytes_Resize(outbytes, outsize) < 0)
7144        goto error;
7145    ret = 0;
7146
7147error:
7148    Py_XDECREF(encoding_obj);
7149    Py_XDECREF(errorHandler);
7150    Py_XDECREF(exc);
7151    return ret;
7152}
7153
7154static PyObject *
7155encode_code_page(int code_page,
7156                 PyObject *unicode,
7157                 const char *errors)
7158{
7159    Py_ssize_t len;
7160    PyObject *outbytes = NULL;
7161    Py_ssize_t offset;
7162    int chunk_len, ret, done;
7163
7164    if (PyUnicode_READY(unicode) == -1)
7165        return NULL;
7166    len = PyUnicode_GET_LENGTH(unicode);
7167
7168    if (code_page < 0) {
7169        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7170        return NULL;
7171    }
7172
7173    if (len == 0)
7174        return PyBytes_FromStringAndSize(NULL, 0);
7175
7176    offset = 0;
7177    do
7178    {
7179#ifdef NEED_RETRY
7180        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7181           chunks. */
7182        if (len > INT_MAX/2) {
7183            chunk_len = INT_MAX/2;
7184            done = 0;
7185        }
7186        else
7187#endif
7188        {
7189            chunk_len = (int)len;
7190            done = 1;
7191        }
7192
7193        ret = encode_code_page_strict(code_page, &outbytes,
7194                                      unicode, offset, chunk_len,
7195                                      errors);
7196        if (ret == -2)
7197            ret = encode_code_page_errors(code_page, &outbytes,
7198                                          unicode, offset,
7199                                          chunk_len, errors);
7200        if (ret < 0) {
7201            Py_XDECREF(outbytes);
7202            return NULL;
7203        }
7204
7205        offset += chunk_len;
7206        len -= chunk_len;
7207    } while (!done);
7208
7209    return outbytes;
7210}
7211
7212PyObject *
7213PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7214                     Py_ssize_t size,
7215                     const char *errors)
7216{
7217    PyObject *unicode, *res;
7218    unicode = PyUnicode_FromUnicode(p, size);
7219    if (unicode == NULL)
7220        return NULL;
7221    res = encode_code_page(CP_ACP, unicode, errors);
7222    Py_DECREF(unicode);
7223    return res;
7224}
7225
7226PyObject *
7227PyUnicode_EncodeCodePage(int code_page,
7228                         PyObject *unicode,
7229                         const char *errors)
7230{
7231    return encode_code_page(code_page, unicode, errors);
7232}
7233
7234PyObject *
7235PyUnicode_AsMBCSString(PyObject *unicode)
7236{
7237    if (!PyUnicode_Check(unicode)) {
7238        PyErr_BadArgument();
7239        return NULL;
7240    }
7241    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7242}
7243
7244#undef NEED_RETRY
7245
7246#endif /* HAVE_MBCS */
7247
7248/* --- Character Mapping Codec -------------------------------------------- */
7249
7250PyObject *
7251PyUnicode_DecodeCharmap(const char *s,
7252                        Py_ssize_t size,
7253                        PyObject *mapping,
7254                        const char *errors)
7255{
7256    const char *starts = s;
7257    Py_ssize_t startinpos;
7258    Py_ssize_t endinpos;
7259    Py_ssize_t outpos;
7260    const char *e;
7261    PyObject *v;
7262    Py_ssize_t extrachars = 0;
7263    PyObject *errorHandler = NULL;
7264    PyObject *exc = NULL;
7265
7266    /* Default to Latin-1 */
7267    if (mapping == NULL)
7268        return PyUnicode_DecodeLatin1(s, size, errors);
7269
7270    v = PyUnicode_New(size, 127);
7271    if (v == NULL)
7272        goto onError;
7273    if (size == 0)
7274        return v;
7275    outpos = 0;
7276    e = s + size;
7277    if (PyUnicode_CheckExact(mapping)) {
7278        Py_ssize_t maplen;
7279        enum PyUnicode_Kind mapkind;
7280        void *mapdata;
7281        Py_UCS4 x;
7282
7283        if (PyUnicode_READY(mapping) == -1)
7284            return NULL;
7285
7286        maplen = PyUnicode_GET_LENGTH(mapping);
7287        mapdata = PyUnicode_DATA(mapping);
7288        mapkind = PyUnicode_KIND(mapping);
7289        while (s < e) {
7290            unsigned char ch;
7291            if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7292                enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7293                if (outkind == PyUnicode_1BYTE_KIND) {
7294                    void *outdata = PyUnicode_DATA(v);
7295                    Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7296                    while (s < e) {
7297                        unsigned char ch = *s;
7298                        x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7299                        if (x > maxchar)
7300                            goto Error;
7301                        PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7302                        ++s;
7303                    }
7304                    break;
7305                }
7306                else if (outkind == PyUnicode_2BYTE_KIND) {
7307                    void *outdata = PyUnicode_DATA(v);
7308                    while (s < e) {
7309                        unsigned char ch = *s;
7310                        x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7311                        if (x == 0xFFFE)
7312                            goto Error;
7313                        PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7314                        ++s;
7315                    }
7316                    break;
7317                }
7318            }
7319            ch = *s;
7320
7321            if (ch < maplen)
7322                x = PyUnicode_READ(mapkind, mapdata, ch);
7323            else
7324                x = 0xfffe; /* invalid value */
7325Error:
7326            if (x == 0xfffe)
7327            {
7328                /* undefined mapping */
7329                startinpos = s-starts;
7330                endinpos = startinpos+1;
7331                if (unicode_decode_call_errorhandler(
7332                        errors, &errorHandler,
7333                        "charmap", "character maps to <undefined>",
7334                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7335                        &v, &outpos)) {
7336                    goto onError;
7337                }
7338                continue;
7339            }
7340
7341            if (unicode_putchar(&v, &outpos, x) < 0)
7342                goto onError;
7343            ++s;
7344        }
7345    }
7346    else {
7347        while (s < e) {
7348            unsigned char ch = *s;
7349            PyObject *w, *x;
7350
7351            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7352            w = PyLong_FromLong((long)ch);
7353            if (w == NULL)
7354                goto onError;
7355            x = PyObject_GetItem(mapping, w);
7356            Py_DECREF(w);
7357            if (x == NULL) {
7358                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7359                    /* No mapping found means: mapping is undefined. */
7360                    PyErr_Clear();
7361                    x = Py_None;
7362                    Py_INCREF(x);
7363                } else
7364                    goto onError;
7365            }
7366
7367            /* Apply mapping */
7368            if (PyLong_Check(x)) {
7369                long value = PyLong_AS_LONG(x);
7370                if (value < 0 || value > MAX_UNICODE) {
7371                    PyErr_Format(PyExc_TypeError,
7372                                 "character mapping must be in range(0x%lx)",
7373                                 (unsigned long)MAX_UNICODE + 1);
7374                    Py_DECREF(x);
7375                    goto onError;
7376                }
7377                if (unicode_putchar(&v, &outpos, value) < 0)
7378                    goto onError;
7379            }
7380            else if (x == Py_None) {
7381                /* undefined mapping */
7382                startinpos = s-starts;
7383                endinpos = startinpos+1;
7384                if (unicode_decode_call_errorhandler(
7385                        errors, &errorHandler,
7386                        "charmap", "character maps to <undefined>",
7387                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7388                        &v, &outpos)) {
7389                    Py_DECREF(x);
7390                    goto onError;
7391                }
7392                Py_DECREF(x);
7393                continue;
7394            }
7395            else if (PyUnicode_Check(x)) {
7396                Py_ssize_t targetsize;
7397
7398                if (PyUnicode_READY(x) == -1)
7399                    goto onError;
7400                targetsize = PyUnicode_GET_LENGTH(x);
7401
7402                if (targetsize == 1) {
7403                    /* 1-1 mapping */
7404                    if (unicode_putchar(&v, &outpos,
7405                                        PyUnicode_READ_CHAR(x, 0)) < 0)
7406                        goto onError;
7407                }
7408                else if (targetsize > 1) {
7409                    /* 1-n mapping */
7410                    if (targetsize > extrachars) {
7411                        /* resize first */
7412                        Py_ssize_t needed = (targetsize - extrachars) + \
7413                            (targetsize << 2);
7414                        extrachars += needed;
7415                        /* XXX overflow detection missing */
7416                        if (unicode_resize(&v,
7417                                           PyUnicode_GET_LENGTH(v) + needed) < 0)
7418                        {
7419                            Py_DECREF(x);
7420                            goto onError;
7421                        }
7422                    }
7423                    if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7424                        goto onError;
7425                    PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7426                    outpos += targetsize;
7427                    extrachars -= targetsize;
7428                }
7429                /* 1-0 mapping: skip the character */
7430            }
7431            else {
7432                /* wrong return value */
7433                PyErr_SetString(PyExc_TypeError,
7434                                "character mapping must return integer, None or str");
7435                Py_DECREF(x);
7436                goto onError;
7437            }
7438            Py_DECREF(x);
7439            ++s;
7440        }
7441    }
7442    if (unicode_resize(&v, outpos) < 0)
7443        goto onError;
7444    Py_XDECREF(errorHandler);
7445    Py_XDECREF(exc);
7446    return unicode_result(v);
7447
7448  onError:
7449    Py_XDECREF(errorHandler);
7450    Py_XDECREF(exc);
7451    Py_XDECREF(v);
7452    return NULL;
7453}
7454
7455/* Charmap encoding: the lookup table */
7456
7457struct encoding_map {
7458    PyObject_HEAD
7459    unsigned char level1[32];
7460    int count2, count3;
7461    unsigned char level23[1];
7462};
7463
7464static PyObject*
7465encoding_map_size(PyObject *obj, PyObject* args)
7466{
7467    struct encoding_map *map = (struct encoding_map*)obj;
7468    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7469                           128*map->count3);
7470}
7471
7472static PyMethodDef encoding_map_methods[] = {
7473    {"size", encoding_map_size, METH_NOARGS,
7474     PyDoc_STR("Return the size (in bytes) of this object") },
7475    { 0 }
7476};
7477
7478static void
7479encoding_map_dealloc(PyObject* o)
7480{
7481    PyObject_FREE(o);
7482}
7483
7484static PyTypeObject EncodingMapType = {
7485    PyVarObject_HEAD_INIT(NULL, 0)
7486    "EncodingMap",          /*tp_name*/
7487    sizeof(struct encoding_map),   /*tp_basicsize*/
7488    0,                      /*tp_itemsize*/
7489    /* methods */
7490    encoding_map_dealloc,   /*tp_dealloc*/
7491    0,                      /*tp_print*/
7492    0,                      /*tp_getattr*/
7493    0,                      /*tp_setattr*/
7494    0,                      /*tp_reserved*/
7495    0,                      /*tp_repr*/
7496    0,                      /*tp_as_number*/
7497    0,                      /*tp_as_sequence*/
7498    0,                      /*tp_as_mapping*/
7499    0,                      /*tp_hash*/
7500    0,                      /*tp_call*/
7501    0,                      /*tp_str*/
7502    0,                      /*tp_getattro*/
7503    0,                      /*tp_setattro*/
7504    0,                      /*tp_as_buffer*/
7505    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7506    0,                      /*tp_doc*/
7507    0,                      /*tp_traverse*/
7508    0,                      /*tp_clear*/
7509    0,                      /*tp_richcompare*/
7510    0,                      /*tp_weaklistoffset*/
7511    0,                      /*tp_iter*/
7512    0,                      /*tp_iternext*/
7513    encoding_map_methods,   /*tp_methods*/
7514    0,                      /*tp_members*/
7515    0,                      /*tp_getset*/
7516    0,                      /*tp_base*/
7517    0,                      /*tp_dict*/
7518    0,                      /*tp_descr_get*/
7519    0,                      /*tp_descr_set*/
7520    0,                      /*tp_dictoffset*/
7521    0,                      /*tp_init*/
7522    0,                      /*tp_alloc*/
7523    0,                      /*tp_new*/
7524    0,                      /*tp_free*/
7525    0,                      /*tp_is_gc*/
7526};
7527
7528PyObject*
7529PyUnicode_BuildEncodingMap(PyObject* string)
7530{
7531    PyObject *result;
7532    struct encoding_map *mresult;
7533    int i;
7534    int need_dict = 0;
7535    unsigned char level1[32];
7536    unsigned char level2[512];
7537    unsigned char *mlevel1, *mlevel2, *mlevel3;
7538    int count2 = 0, count3 = 0;
7539    int kind;
7540    void *data;
7541    Py_ssize_t length;
7542    Py_UCS4 ch;
7543
7544    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7545        PyErr_BadArgument();
7546        return NULL;
7547    }
7548    kind = PyUnicode_KIND(string);
7549    data = PyUnicode_DATA(string);
7550    length = PyUnicode_GET_LENGTH(string);
7551    length = Py_MIN(length, 256);
7552    memset(level1, 0xFF, sizeof level1);
7553    memset(level2, 0xFF, sizeof level2);
7554
7555    /* If there isn't a one-to-one mapping of NULL to \0,
7556       or if there are non-BMP characters, we need to use
7557       a mapping dictionary. */
7558    if (PyUnicode_READ(kind, data, 0) != 0)
7559        need_dict = 1;
7560    for (i = 1; i < length; i++) {
7561        int l1, l2;
7562        ch = PyUnicode_READ(kind, data, i);
7563        if (ch == 0 || ch > 0xFFFF) {
7564            need_dict = 1;
7565            break;
7566        }
7567        if (ch == 0xFFFE)
7568            /* unmapped character */
7569            continue;
7570        l1 = ch >> 11;
7571        l2 = ch >> 7;
7572        if (level1[l1] == 0xFF)
7573            level1[l1] = count2++;
7574        if (level2[l2] == 0xFF)
7575            level2[l2] = count3++;
7576    }
7577
7578    if (count2 >= 0xFF || count3 >= 0xFF)
7579        need_dict = 1;
7580
7581    if (need_dict) {
7582        PyObject *result = PyDict_New();
7583        PyObject *key, *value;
7584        if (!result)
7585            return NULL;
7586        for (i = 0; i < length; i++) {
7587            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7588            value = PyLong_FromLong(i);
7589            if (!key || !value)
7590                goto failed1;
7591            if (PyDict_SetItem(result, key, value) == -1)
7592                goto failed1;
7593            Py_DECREF(key);
7594            Py_DECREF(value);
7595        }
7596        return result;
7597      failed1:
7598        Py_XDECREF(key);
7599        Py_XDECREF(value);
7600        Py_DECREF(result);
7601        return NULL;
7602    }
7603
7604    /* Create a three-level trie */
7605    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7606                             16*count2 + 128*count3 - 1);
7607    if (!result)
7608        return PyErr_NoMemory();
7609    PyObject_Init(result, &EncodingMapType);
7610    mresult = (struct encoding_map*)result;
7611    mresult->count2 = count2;
7612    mresult->count3 = count3;
7613    mlevel1 = mresult->level1;
7614    mlevel2 = mresult->level23;
7615    mlevel3 = mresult->level23 + 16*count2;
7616    memcpy(mlevel1, level1, 32);
7617    memset(mlevel2, 0xFF, 16*count2);
7618    memset(mlevel3, 0, 128*count3);
7619    count3 = 0;
7620    for (i = 1; i < length; i++) {
7621        int o1, o2, o3, i2, i3;
7622        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7623        if (ch == 0xFFFE)
7624            /* unmapped character */
7625            continue;
7626        o1 = ch>>11;
7627        o2 = (ch>>7) & 0xF;
7628        i2 = 16*mlevel1[o1] + o2;
7629        if (mlevel2[i2] == 0xFF)
7630            mlevel2[i2] = count3++;
7631        o3 = ch & 0x7F;
7632        i3 = 128*mlevel2[i2] + o3;
7633        mlevel3[i3] = i;
7634    }
7635    return result;
7636}
7637
7638static int
7639encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7640{
7641    struct encoding_map *map = (struct encoding_map*)mapping;
7642    int l1 = c>>11;
7643    int l2 = (c>>7) & 0xF;
7644    int l3 = c & 0x7F;
7645    int i;
7646
7647    if (c > 0xFFFF)
7648        return -1;
7649    if (c == 0)
7650        return 0;
7651    /* level 1*/
7652    i = map->level1[l1];
7653    if (i == 0xFF) {
7654        return -1;
7655    }
7656    /* level 2*/
7657    i = map->level23[16*i+l2];
7658    if (i == 0xFF) {
7659        return -1;
7660    }
7661    /* level 3 */
7662    i = map->level23[16*map->count2 + 128*i + l3];
7663    if (i == 0) {
7664        return -1;
7665    }
7666    return i;
7667}
7668
7669/* Lookup the character ch in the mapping. If the character
7670   can't be found, Py_None is returned (or NULL, if another
7671   error occurred). */
7672static PyObject *
7673charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7674{
7675    PyObject *w = PyLong_FromLong((long)c);
7676    PyObject *x;
7677
7678    if (w == NULL)
7679        return NULL;
7680    x = PyObject_GetItem(mapping, w);
7681    Py_DECREF(w);
7682    if (x == NULL) {
7683        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7684            /* No mapping found means: mapping is undefined. */
7685            PyErr_Clear();
7686            x = Py_None;
7687            Py_INCREF(x);
7688            return x;
7689        } else
7690            return NULL;
7691    }
7692    else if (x == Py_None)
7693        return x;
7694    else if (PyLong_Check(x)) {
7695        long value = PyLong_AS_LONG(x);
7696        if (value < 0 || value > 255) {
7697            PyErr_SetString(PyExc_TypeError,
7698                            "character mapping must be in range(256)");
7699            Py_DECREF(x);
7700            return NULL;
7701        }
7702        return x;
7703    }
7704    else if (PyBytes_Check(x))
7705        return x;
7706    else {
7707        /* wrong return value */
7708        PyErr_Format(PyExc_TypeError,
7709                     "character mapping must return integer, bytes or None, not %.400s",
7710                     x->ob_type->tp_name);
7711        Py_DECREF(x);
7712        return NULL;
7713    }
7714}
7715
7716static int
7717charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7718{
7719    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7720    /* exponentially overallocate to minimize reallocations */
7721    if (requiredsize < 2*outsize)
7722        requiredsize = 2*outsize;
7723    if (_PyBytes_Resize(outobj, requiredsize))
7724        return -1;
7725    return 0;
7726}
7727
/* Outcome of encoding a single character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
7731/* lookup the character, put the result in the output string and adjust
7732   various state variables. Resize the output bytes object if not enough
7733   space is available. Return a new reference to the object that
7734   was put in the output buffer, or Py_None, if the mapping was undefined
7735   (in which case no character was written) or NULL, if a
7736   reallocation error occurred. The caller must decref the result */
7737static charmapencode_result
7738charmapencode_output(Py_UCS4 c, PyObject *mapping,
7739                     PyObject **outobj, Py_ssize_t *outpos)
7740{
7741    PyObject *rep;
7742    char *outstart;
7743    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7744
7745    if (Py_TYPE(mapping) == &EncodingMapType) {
7746        int res = encoding_map_lookup(c, mapping);
7747        Py_ssize_t requiredsize = *outpos+1;
7748        if (res == -1)
7749            return enc_FAILED;
7750        if (outsize<requiredsize)
7751            if (charmapencode_resize(outobj, outpos, requiredsize))
7752                return enc_EXCEPTION;
7753        outstart = PyBytes_AS_STRING(*outobj);
7754        outstart[(*outpos)++] = (char)res;
7755        return enc_SUCCESS;
7756    }
7757
7758    rep = charmapencode_lookup(c, mapping);
7759    if (rep==NULL)
7760        return enc_EXCEPTION;
7761    else if (rep==Py_None) {
7762        Py_DECREF(rep);
7763        return enc_FAILED;
7764    } else {
7765        if (PyLong_Check(rep)) {
7766            Py_ssize_t requiredsize = *outpos+1;
7767            if (outsize<requiredsize)
7768                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7769                    Py_DECREF(rep);
7770                    return enc_EXCEPTION;
7771                }
7772            outstart = PyBytes_AS_STRING(*outobj);
7773            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7774        }
7775        else {
7776            const char *repchars = PyBytes_AS_STRING(rep);
7777            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7778            Py_ssize_t requiredsize = *outpos+repsize;
7779            if (outsize<requiredsize)
7780                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7781                    Py_DECREF(rep);
7782                    return enc_EXCEPTION;
7783                }
7784            outstart = PyBytes_AS_STRING(*outobj);
7785            memcpy(outstart + *outpos, repchars, repsize);
7786            *outpos += repsize;
7787        }
7788    }
7789    Py_DECREF(rep);
7790    return enc_SUCCESS;
7791}
7792
7793/* handle an error in PyUnicode_EncodeCharmap
7794   Return 0 on success, -1 on error */
7795static int
7796charmap_encoding_error(
7797    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
7798    PyObject **exceptionObject,
7799    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7800    PyObject **res, Py_ssize_t *respos)
7801{
7802    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7803    Py_ssize_t size, repsize;
7804    Py_ssize_t newpos;
7805    enum PyUnicode_Kind kind;
7806    void *data;
7807    Py_ssize_t index;
7808    /* startpos for collecting unencodable chars */
7809    Py_ssize_t collstartpos = *inpos;
7810    Py_ssize_t collendpos = *inpos+1;
7811    Py_ssize_t collpos;
7812    char *encoding = "charmap";
7813    char *reason = "character maps to <undefined>";
7814    charmapencode_result x;
7815    Py_UCS4 ch;
7816    int val;
7817
7818    if (PyUnicode_READY(unicode) == -1)
7819        return -1;
7820    size = PyUnicode_GET_LENGTH(unicode);
7821    /* find all unencodable characters */
7822    while (collendpos < size) {
7823        PyObject *rep;
7824        if (Py_TYPE(mapping) == &EncodingMapType) {
7825            ch = PyUnicode_READ_CHAR(unicode, collendpos);
7826            val = encoding_map_lookup(ch, mapping);
7827            if (val != -1)
7828                break;
7829            ++collendpos;
7830            continue;
7831        }
7832
7833        ch = PyUnicode_READ_CHAR(unicode, collendpos);
7834        rep = charmapencode_lookup(ch, mapping);
7835        if (rep==NULL)
7836            return -1;
7837        else if (rep!=Py_None) {
7838            Py_DECREF(rep);
7839            break;
7840        }
7841        Py_DECREF(rep);
7842        ++collendpos;
7843    }
7844    /* cache callback name lookup
7845     * (if not done yet, i.e. it's the first error) */
7846    if (*known_errorHandler==-1) {
7847        if ((errors==NULL) || (!strcmp(errors, "strict")))
7848            *known_errorHandler = 1;
7849        else if (!strcmp(errors, "replace"))
7850            *known_errorHandler = 2;
7851        else if (!strcmp(errors, "ignore"))
7852            *known_errorHandler = 3;
7853        else if (!strcmp(errors, "xmlcharrefreplace"))
7854            *known_errorHandler = 4;
7855        else
7856            *known_errorHandler = 0;
7857    }
7858    switch (*known_errorHandler) {
7859    case 1: /* strict */
7860        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
7861        return -1;
7862    case 2: /* replace */
7863        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
7864            x = charmapencode_output('?', mapping, res, respos);
7865            if (x==enc_EXCEPTION) {
7866                return -1;
7867            }
7868            else if (x==enc_FAILED) {
7869                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
7870                return -1;
7871            }
7872        }
7873        /* fall through */
7874    case 3: /* ignore */
7875        *inpos = collendpos;
7876        break;
7877    case 4: /* xmlcharrefreplace */
7878        /* generate replacement (temporarily (mis)uses p) */
7879        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
7880            char buffer[2+29+1+1];
7881            char *cp;
7882            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
7883            for (cp = buffer; *cp; ++cp) {
7884                x = charmapencode_output(*cp, mapping, res, respos);
7885                if (x==enc_EXCEPTION)
7886                    return -1;
7887                else if (x==enc_FAILED) {
7888                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
7889                    return -1;
7890                }
7891            }
7892        }
7893        *inpos = collendpos;
7894        break;
7895    default:
7896        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
7897                                                      encoding, reason, unicode, exceptionObject,
7898                                                      collstartpos, collendpos, &newpos);
7899        if (repunicode == NULL)
7900            return -1;
7901        if (PyBytes_Check(repunicode)) {
7902            /* Directly copy bytes result to output. */
7903            Py_ssize_t outsize = PyBytes_Size(*res);
7904            Py_ssize_t requiredsize;
7905            repsize = PyBytes_Size(repunicode);
7906            requiredsize = *respos + repsize;
7907            if (requiredsize > outsize)
7908                /* Make room for all additional bytes. */
7909                if (charmapencode_resize(res, respos, requiredsize)) {
7910                    Py_DECREF(repunicode);
7911                    return -1;
7912                }
7913            memcpy(PyBytes_AsString(*res) + *respos,
7914                   PyBytes_AsString(repunicode),  repsize);
7915            *respos += repsize;
7916            *inpos = newpos;
7917            Py_DECREF(repunicode);
7918            break;
7919        }
7920        /* generate replacement  */
7921        if (PyUnicode_READY(repunicode) == -1) {
7922            Py_DECREF(repunicode);
7923            return -1;
7924        }
7925        repsize = PyUnicode_GET_LENGTH(repunicode);
7926        data = PyUnicode_DATA(repunicode);
7927        kind = PyUnicode_KIND(repunicode);
7928        for (index = 0; index < repsize; index++) {
7929            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7930            x = charmapencode_output(repch, mapping, res, respos);
7931            if (x==enc_EXCEPTION) {
7932                Py_DECREF(repunicode);
7933                return -1;
7934            }
7935            else if (x==enc_FAILED) {
7936                Py_DECREF(repunicode);
7937                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
7938                return -1;
7939            }
7940        }
7941        *inpos = newpos;
7942        Py_DECREF(repunicode);
7943    }
7944    return 0;
7945}
7946
7947PyObject *
7948_PyUnicode_EncodeCharmap(PyObject *unicode,
7949                         PyObject *mapping,
7950                         const char *errors)
7951{
7952    /* output object */
7953    PyObject *res = NULL;
7954    /* current input position */
7955    Py_ssize_t inpos = 0;
7956    Py_ssize_t size;
7957    /* current output position */
7958    Py_ssize_t respos = 0;
7959    PyObject *errorHandler = NULL;
7960    PyObject *exc = NULL;
7961    /* the following variable is used for caching string comparisons
7962     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7963     * 3=ignore, 4=xmlcharrefreplace */
7964    int known_errorHandler = -1;
7965
7966    if (PyUnicode_READY(unicode) == -1)
7967        return NULL;
7968    size = PyUnicode_GET_LENGTH(unicode);
7969
7970    /* Default to Latin-1 */
7971    if (mapping == NULL)
7972        return unicode_encode_ucs1(unicode, errors, 256);
7973
7974    /* allocate enough for a simple encoding without
7975       replacements, if we need more, we'll resize */
7976    res = PyBytes_FromStringAndSize(NULL, size);
7977    if (res == NULL)
7978        goto onError;
7979    if (size == 0)
7980        return res;
7981
7982    while (inpos<size) {
7983        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
7984        /* try to encode it */
7985        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
7986        if (x==enc_EXCEPTION) /* error */
7987            goto onError;
7988        if (x==enc_FAILED) { /* unencodable character */
7989            if (charmap_encoding_error(unicode, &inpos, mapping,
7990                                       &exc,
7991                                       &known_errorHandler, &errorHandler, errors,
7992                                       &res, &respos)) {
7993                goto onError;
7994            }
7995        }
7996        else
7997            /* done with this character => adjust input position */
7998            ++inpos;
7999    }
8000
8001    /* Resize if we allocated to much */
8002    if (respos<PyBytes_GET_SIZE(res))
8003        if (_PyBytes_Resize(&res, respos) < 0)
8004            goto onError;
8005
8006    Py_XDECREF(exc);
8007    Py_XDECREF(errorHandler);
8008    return res;
8009
8010  onError:
8011    Py_XDECREF(res);
8012    Py_XDECREF(exc);
8013    Py_XDECREF(errorHandler);
8014    return NULL;
8015}
8016
8017/* Deprecated */
8018PyObject *
8019PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8020                        Py_ssize_t size,
8021                        PyObject *mapping,
8022                        const char *errors)
8023{
8024    PyObject *result;
8025    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8026    if (unicode == NULL)
8027        return NULL;
8028    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8029    Py_DECREF(unicode);
8030    return result;
8031}
8032
8033PyObject *
8034PyUnicode_AsCharmapString(PyObject *unicode,
8035                          PyObject *mapping)
8036{
8037    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8038        PyErr_BadArgument();
8039        return NULL;
8040    }
8041    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8042}
8043
8044/* create or adjust a UnicodeTranslateError */
8045static void
8046make_translate_exception(PyObject **exceptionObject,
8047                         PyObject *unicode,
8048                         Py_ssize_t startpos, Py_ssize_t endpos,
8049                         const char *reason)
8050{
8051    if (*exceptionObject == NULL) {
8052        *exceptionObject = _PyUnicodeTranslateError_Create(
8053            unicode, startpos, endpos, reason);
8054    }
8055    else {
8056        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8057            goto onError;
8058        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8059            goto onError;
8060        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8061            goto onError;
8062        return;
8063      onError:
8064        Py_DECREF(*exceptionObject);
8065        *exceptionObject = NULL;
8066    }
8067}
8068
8069/* error handling callback helper:
8070   build arguments, call the callback and check the arguments,
8071   put the result into newpos and return the replacement string, which
8072   has to be freed by the caller */
8073static PyObject *
8074unicode_translate_call_errorhandler(const char *errors,
8075                                    PyObject **errorHandler,
8076                                    const char *reason,
8077                                    PyObject *unicode, PyObject **exceptionObject,
8078                                    Py_ssize_t startpos, Py_ssize_t endpos,
8079                                    Py_ssize_t *newpos)
8080{
8081    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8082
8083    Py_ssize_t i_newpos;
8084    PyObject *restuple;
8085    PyObject *resunicode;
8086
8087    if (*errorHandler == NULL) {
8088        *errorHandler = PyCodec_LookupError(errors);
8089        if (*errorHandler == NULL)
8090            return NULL;
8091    }
8092
8093    make_translate_exception(exceptionObject,
8094                             unicode, startpos, endpos, reason);
8095    if (*exceptionObject == NULL)
8096        return NULL;
8097
8098    restuple = PyObject_CallFunctionObjArgs(
8099        *errorHandler, *exceptionObject, NULL);
8100    if (restuple == NULL)
8101        return NULL;
8102    if (!PyTuple_Check(restuple)) {
8103        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8104        Py_DECREF(restuple);
8105        return NULL;
8106    }
8107    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8108                          &resunicode, &i_newpos)) {
8109        Py_DECREF(restuple);
8110        return NULL;
8111    }
8112    if (i_newpos<0)
8113        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8114    else
8115        *newpos = i_newpos;
8116    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8117        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8118        Py_DECREF(restuple);
8119        return NULL;
8120    }
8121    Py_INCREF(resunicode);
8122    Py_DECREF(restuple);
8123    return resunicode;
8124}
8125
8126/* Lookup the character ch in the mapping and put the result in result,
8127   which must be decrefed by the caller.
8128   Return 0 on success, -1 on error */
8129static int
8130charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8131{
8132    PyObject *w = PyLong_FromLong((long)c);
8133    PyObject *x;
8134
8135    if (w == NULL)
8136        return -1;
8137    x = PyObject_GetItem(mapping, w);
8138    Py_DECREF(w);
8139    if (x == NULL) {
8140        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8141            /* No mapping found means: use 1:1 mapping. */
8142            PyErr_Clear();
8143            *result = NULL;
8144            return 0;
8145        } else
8146            return -1;
8147    }
8148    else if (x == Py_None) {
8149        *result = x;
8150        return 0;
8151    }
8152    else if (PyLong_Check(x)) {
8153        long value = PyLong_AS_LONG(x);
8154        long max = PyUnicode_GetMax();
8155        if (value < 0 || value > max) {
8156            PyErr_Format(PyExc_TypeError,
8157                         "character mapping must be in range(0x%x)", max+1);
8158            Py_DECREF(x);
8159            return -1;
8160        }
8161        *result = x;
8162        return 0;
8163    }
8164    else if (PyUnicode_Check(x)) {
8165        *result = x;
8166        return 0;
8167    }
8168    else {
8169        /* wrong return value */
8170        PyErr_SetString(PyExc_TypeError,
8171                        "character mapping must return integer, None or str");
8172        Py_DECREF(x);
8173        return -1;
8174    }
8175}
8176/* ensure that *outobj is at least requiredsize characters long,
8177   if not reallocate and adjust various state variables.
8178   Return 0 on success, -1 on error */
8179static int
8180charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8181                               Py_ssize_t requiredsize)
8182{
8183    Py_ssize_t oldsize = *psize;
8184    Py_UCS4 *new_outobj;
8185    if (requiredsize > oldsize) {
8186        /* exponentially overallocate to minimize reallocations */
8187        if (requiredsize < 2 * oldsize)
8188            requiredsize = 2 * oldsize;
8189        new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8190        if (new_outobj == 0)
8191            return -1;
8192        *outobj = new_outobj;
8193        *psize = requiredsize;
8194    }
8195    return 0;
8196}
8197/* lookup the character, put the result in the output string and adjust
8198   various state variables. Return a new reference to the object that
8199   was put in the output buffer in *result, or Py_None, if the mapping was
8200   undefined (in which case no character was written).
8201   The called must decref result.
8202   Return 0 on success, -1 on error. */
8203static int
8204charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8205                        PyObject *mapping, Py_UCS4 **output,
8206                        Py_ssize_t *osize, Py_ssize_t *opos,
8207                        PyObject **res)
8208{
8209    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8210    if (charmaptranslate_lookup(curinp, mapping, res))
8211        return -1;
8212    if (*res==NULL) {
8213        /* not found => default to 1:1 mapping */
8214        (*output)[(*opos)++] = curinp;
8215    }
8216    else if (*res==Py_None)
8217        ;
8218    else if (PyLong_Check(*res)) {
8219        /* no overflow check, because we know that the space is enough */
8220        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8221    }
8222    else if (PyUnicode_Check(*res)) {
8223        Py_ssize_t repsize;
8224        if (PyUnicode_READY(*res) == -1)
8225            return -1;
8226        repsize = PyUnicode_GET_LENGTH(*res);
8227        if (repsize==1) {
8228            /* no overflow check, because we know that the space is enough */
8229            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8230        }
8231        else if (repsize!=0) {
8232            /* more than one character */
8233            Py_ssize_t requiredsize = *opos +
8234                (PyUnicode_GET_LENGTH(input) - ipos) +
8235                repsize - 1;
8236            Py_ssize_t i;
8237            if (charmaptranslate_makespace(output, osize, requiredsize))
8238                return -1;
8239            for(i = 0; i < repsize; i++)
8240                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8241        }
8242    }
8243    else
8244        return -1;
8245    return 0;
8246}
8247
8248PyObject *
8249_PyUnicode_TranslateCharmap(PyObject *input,
8250                            PyObject *mapping,
8251                            const char *errors)
8252{
8253    /* input object */
8254    char *idata;
8255    Py_ssize_t size, i;
8256    int kind;
8257    /* output buffer */
8258    Py_UCS4 *output = NULL;
8259    Py_ssize_t osize;
8260    PyObject *res;
8261    /* current output position */
8262    Py_ssize_t opos;
8263    char *reason = "character maps to <undefined>";
8264    PyObject *errorHandler = NULL;
8265    PyObject *exc = NULL;
8266    /* the following variable is used for caching string comparisons
8267     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8268     * 3=ignore, 4=xmlcharrefreplace */
8269    int known_errorHandler = -1;
8270
8271    if (mapping == NULL) {
8272        PyErr_BadArgument();
8273        return NULL;
8274    }
8275
8276    if (PyUnicode_READY(input) == -1)
8277        return NULL;
8278    idata = (char*)PyUnicode_DATA(input);
8279    kind = PyUnicode_KIND(input);
8280    size = PyUnicode_GET_LENGTH(input);
8281    i = 0;
8282
8283    if (size == 0) {
8284        Py_INCREF(input);
8285        return input;
8286    }
8287
8288    /* allocate enough for a simple 1:1 translation without
8289       replacements, if we need more, we'll resize */
8290    osize = size;
8291    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8292    opos = 0;
8293    if (output == NULL) {
8294        PyErr_NoMemory();
8295        goto onError;
8296    }
8297
8298    while (i<size) {
8299        /* try to encode it */
8300        PyObject *x = NULL;
8301        if (charmaptranslate_output(input, i, mapping,
8302                                    &output, &osize, &opos, &x)) {
8303            Py_XDECREF(x);
8304            goto onError;
8305        }
8306        Py_XDECREF(x);
8307        if (x!=Py_None) /* it worked => adjust input pointer */
8308            ++i;
8309        else { /* untranslatable character */
8310            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8311            Py_ssize_t repsize;
8312            Py_ssize_t newpos;
8313            Py_ssize_t uni2;
8314            /* startpos for collecting untranslatable chars */
8315            Py_ssize_t collstart = i;
8316            Py_ssize_t collend = i+1;
8317            Py_ssize_t coll;
8318
8319            /* find all untranslatable characters */
8320            while (collend < size) {
8321                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8322                    goto onError;
8323                Py_XDECREF(x);
8324                if (x!=Py_None)
8325                    break;
8326                ++collend;
8327            }
8328            /* cache callback name lookup
8329             * (if not done yet, i.e. it's the first error) */
8330            if (known_errorHandler==-1) {
8331                if ((errors==NULL) || (!strcmp(errors, "strict")))
8332                    known_errorHandler = 1;
8333                else if (!strcmp(errors, "replace"))
8334                    known_errorHandler = 2;
8335                else if (!strcmp(errors, "ignore"))
8336                    known_errorHandler = 3;
8337                else if (!strcmp(errors, "xmlcharrefreplace"))
8338                    known_errorHandler = 4;
8339                else
8340                    known_errorHandler = 0;
8341            }
8342            switch (known_errorHandler) {
8343            case 1: /* strict */
8344                make_translate_exception(&exc,
8345                                         input, collstart, collend, reason);
8346                if (exc != NULL)
8347                    PyCodec_StrictErrors(exc);
8348                goto onError;
8349            case 2: /* replace */
8350                /* No need to check for space, this is a 1:1 replacement */
8351                for (coll = collstart; coll<collend; coll++)
8352                    output[opos++] = '?';
8353                /* fall through */
8354            case 3: /* ignore */
8355                i = collend;
8356                break;
8357            case 4: /* xmlcharrefreplace */
8358                /* generate replacement (temporarily (mis)uses i) */
8359                for (i = collstart; i < collend; ++i) {
8360                    char buffer[2+29+1+1];
8361                    char *cp;
8362                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8363                    if (charmaptranslate_makespace(&output, &osize,
8364                                                   opos+strlen(buffer)+(size-collend)))
8365                        goto onError;
8366                    for (cp = buffer; *cp; ++cp)
8367                        output[opos++] = *cp;
8368                }
8369                i = collend;
8370                break;
8371            default:
8372                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8373                                                                 reason, input, &exc,
8374                                                                 collstart, collend, &newpos);
8375                if (repunicode == NULL)
8376                    goto onError;
8377                if (PyUnicode_READY(repunicode) == -1) {
8378                    Py_DECREF(repunicode);
8379                    goto onError;
8380                }
8381                /* generate replacement  */
8382                repsize = PyUnicode_GET_LENGTH(repunicode);
8383                if (charmaptranslate_makespace(&output, &osize,
8384                                               opos+repsize+(size-collend))) {
8385                    Py_DECREF(repunicode);
8386                    goto onError;
8387                }
8388                for (uni2 = 0; repsize-->0; ++uni2)
8389                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8390                i = newpos;
8391                Py_DECREF(repunicode);
8392            }
8393        }
8394    }
8395    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8396    if (!res)
8397        goto onError;
8398    PyMem_Free(output);
8399    Py_XDECREF(exc);
8400    Py_XDECREF(errorHandler);
8401    return res;
8402
8403  onError:
8404    PyMem_Free(output);
8405    Py_XDECREF(exc);
8406    Py_XDECREF(errorHandler);
8407    return NULL;
8408}
8409
8410/* Deprecated. Use PyUnicode_Translate instead. */
8411PyObject *
8412PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8413                           Py_ssize_t size,
8414                           PyObject *mapping,
8415                           const char *errors)
8416{
8417    PyObject *result;
8418    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8419    if (!unicode)
8420        return NULL;
8421    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8422    Py_DECREF(unicode);
8423    return result;
8424}
8425
8426PyObject *
8427PyUnicode_Translate(PyObject *str,
8428                    PyObject *mapping,
8429                    const char *errors)
8430{
8431    PyObject *result;
8432
8433    str = PyUnicode_FromObject(str);
8434    if (str == NULL)
8435        return NULL;
8436    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8437    Py_DECREF(str);
8438    return result;
8439}
8440
8441static Py_UCS4
8442fix_decimal_and_space_to_ascii(PyObject *self)
8443{
8444    /* No need to call PyUnicode_READY(self) because this function is only
8445       called as a callback from fixup() which does it already. */
8446    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8447    const int kind = PyUnicode_KIND(self);
8448    void *data = PyUnicode_DATA(self);
8449    Py_UCS4 maxchar = 127, ch, fixed;
8450    int modified = 0;
8451    Py_ssize_t i;
8452
8453    for (i = 0; i < len; ++i) {
8454        ch = PyUnicode_READ(kind, data, i);
8455        fixed = 0;
8456        if (ch > 127) {
8457            if (Py_UNICODE_ISSPACE(ch))
8458                fixed = ' ';
8459            else {
8460                const int decimal = Py_UNICODE_TODECIMAL(ch);
8461                if (decimal >= 0)
8462                    fixed = '0' + decimal;
8463            }
8464            if (fixed != 0) {
8465                modified = 1;
8466                maxchar = MAX_MAXCHAR(maxchar, fixed);
8467                PyUnicode_WRITE(kind, data, i, fixed);
8468            }
8469            else
8470                maxchar = MAX_MAXCHAR(maxchar, ch);
8471        }
8472    }
8473
8474    return (modified) ? maxchar : 0;
8475}
8476
8477PyObject *
8478_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8479{
8480    if (!PyUnicode_Check(unicode)) {
8481        PyErr_BadInternalCall();
8482        return NULL;
8483    }
8484    if (PyUnicode_READY(unicode) == -1)
8485        return NULL;
8486    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8487        /* If the string is already ASCII, just return the same string */
8488        Py_INCREF(unicode);
8489        return unicode;
8490    }
8491    return fixup(unicode, fix_decimal_and_space_to_ascii);
8492}
8493
8494PyObject *
8495PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8496                                  Py_ssize_t length)
8497{
8498    PyObject *decimal;
8499    Py_ssize_t i;
8500    Py_UCS4 maxchar;
8501    enum PyUnicode_Kind kind;
8502    void *data;
8503
8504    maxchar = 127;
8505    for (i = 0; i < length; i++) {
8506        Py_UNICODE ch = s[i];
8507        if (ch > 127) {
8508            int decimal = Py_UNICODE_TODECIMAL(ch);
8509            if (decimal >= 0)
8510                ch = '0' + decimal;
8511            maxchar = MAX_MAXCHAR(maxchar, ch);
8512        }
8513    }
8514
8515    /* Copy to a new string */
8516    decimal = PyUnicode_New(length, maxchar);
8517    if (decimal == NULL)
8518        return decimal;
8519    kind = PyUnicode_KIND(decimal);
8520    data = PyUnicode_DATA(decimal);
8521    /* Iterate over code points */
8522    for (i = 0; i < length; i++) {
8523        Py_UNICODE ch = s[i];
8524        if (ch > 127) {
8525            int decimal = Py_UNICODE_TODECIMAL(ch);
8526            if (decimal >= 0)
8527                ch = '0' + decimal;
8528        }
8529        PyUnicode_WRITE(kind, data, i, ch);
8530    }
8531    return unicode_result(decimal);
8532}
8533/* --- Decimal Encoder ---------------------------------------------------- */
8534
8535int
8536PyUnicode_EncodeDecimal(Py_UNICODE *s,
8537                        Py_ssize_t length,
8538                        char *output,
8539                        const char *errors)
8540{
8541    PyObject *unicode;
8542    Py_ssize_t i;
8543    enum PyUnicode_Kind kind;
8544    void *data;
8545
8546    if (output == NULL) {
8547        PyErr_BadArgument();
8548        return -1;
8549    }
8550
8551    unicode = PyUnicode_FromUnicode(s, length);
8552    if (unicode == NULL)
8553        return -1;
8554
8555    if (PyUnicode_READY(unicode) == -1) {
8556        Py_DECREF(unicode);
8557        return -1;
8558    }
8559    kind = PyUnicode_KIND(unicode);
8560    data = PyUnicode_DATA(unicode);
8561
8562    for (i=0; i < length; ) {
8563        PyObject *exc;
8564        Py_UCS4 ch;
8565        int decimal;
8566        Py_ssize_t startpos;
8567
8568        ch = PyUnicode_READ(kind, data, i);
8569
8570        if (Py_UNICODE_ISSPACE(ch)) {
8571            *output++ = ' ';
8572            i++;
8573            continue;
8574        }
8575        decimal = Py_UNICODE_TODECIMAL(ch);
8576        if (decimal >= 0) {
8577            *output++ = '0' + decimal;
8578            i++;
8579            continue;
8580        }
8581        if (0 < ch && ch < 256) {
8582            *output++ = (char)ch;
8583            i++;
8584            continue;
8585        }
8586
8587        startpos = i;
8588        exc = NULL;
8589        raise_encode_exception(&exc, "decimal", unicode,
8590                               startpos, startpos+1,
8591                               "invalid decimal Unicode string");
8592        Py_XDECREF(exc);
8593        Py_DECREF(unicode);
8594        return -1;
8595    }
8596    /* 0-terminate the output string */
8597    *output++ = '\0';
8598    Py_DECREF(unicode);
8599    return 0;
8600}
8601
8602/* --- Helpers ------------------------------------------------------------ */
8603
8604static Py_ssize_t
8605any_find_slice(int direction, PyObject* s1, PyObject* s2,
8606               Py_ssize_t start,
8607               Py_ssize_t end)
8608{
8609    int kind1, kind2, kind;
8610    void *buf1, *buf2;
8611    Py_ssize_t len1, len2, result;
8612
8613    kind1 = PyUnicode_KIND(s1);
8614    kind2 = PyUnicode_KIND(s2);
8615    kind = kind1 > kind2 ? kind1 : kind2;
8616    buf1 = PyUnicode_DATA(s1);
8617    buf2 = PyUnicode_DATA(s2);
8618    if (kind1 != kind)
8619        buf1 = _PyUnicode_AsKind(s1, kind);
8620    if (!buf1)
8621        return -2;
8622    if (kind2 != kind)
8623        buf2 = _PyUnicode_AsKind(s2, kind);
8624    if (!buf2) {
8625        if (kind1 != kind) PyMem_Free(buf1);
8626        return -2;
8627    }
8628    len1 = PyUnicode_GET_LENGTH(s1);
8629    len2 = PyUnicode_GET_LENGTH(s2);
8630
8631    if (direction > 0) {
8632        switch (kind) {
8633        case PyUnicode_1BYTE_KIND:
8634            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8635                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8636            else
8637                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8638            break;
8639        case PyUnicode_2BYTE_KIND:
8640            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8641            break;
8642        case PyUnicode_4BYTE_KIND:
8643            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8644            break;
8645        default:
8646            assert(0); result = -2;
8647        }
8648    }
8649    else {
8650        switch (kind) {
8651        case PyUnicode_1BYTE_KIND:
8652            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8653                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8654            else
8655                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8656            break;
8657        case PyUnicode_2BYTE_KIND:
8658            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8659            break;
8660        case PyUnicode_4BYTE_KIND:
8661            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8662            break;
8663        default:
8664            assert(0); result = -2;
8665        }
8666    }
8667
8668    if (kind1 != kind)
8669        PyMem_Free(buf1);
8670    if (kind2 != kind)
8671        PyMem_Free(buf2);
8672
8673    return result;
8674}
8675
/* Dispatch thousands-separator insertion to the stringlib routine
   (asciilib/ucs1lib/ucs2lib/ucs4lib _InsertThousandsGrouping) selected by
   the kind of unicode.

   unicode may be NULL.  In that case the 1-byte routine is called with a
   NULL output buffer and *maxchar is set on return: 127, raised to the
   maximum character value of thousands_sep when the returned length differs
   from n_digits.  When unicode is not NULL the output pointer is unicode's
   data advanced by index characters and *maxchar is left untouched.

   n_buffer, digits, n_digits, min_width and grouping are forwarded
   unchanged to the stringlib routine.

   Returns the stringlib routine's return value, or -1 if a kind conversion
   with _PyUnicode_AsKind() fails. */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    PyObject *unicode, Py_ssize_t index,
    Py_ssize_t n_buffer,
    void *digits, Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping, PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    unsigned int kind, thousands_sep_kind;
    char *data, *thousands_sep_data;
    Py_ssize_t thousands_sep_len;
    Py_ssize_t len;

    if (unicode != NULL) {
        kind = PyUnicode_KIND(unicode);
        /* kind is used as the per-character byte width here. */
        data = (char *) PyUnicode_DATA(unicode) + index * kind;
    }
    else {
        kind = PyUnicode_1BYTE_KIND;
        data = NULL;
    }
    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    thousands_sep_data = PyUnicode_DATA(thousands_sep);
    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind) {
            /* Widen the separator to the kind of the output string. */
            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
            if (!thousands_sep_data)
                return -1;
        }
        else {
            /* NOTE(review): data is replaced by a converted copy of unicode
               here, but the switch below still dispatches on the original
               kind, the index offset is not re-applied, and the copy is
               freed after the call.  Output written in this branch would
               presumably be lost -- confirm that callers never pass a
               separator wider than unicode. */
            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
            if (!data)
                return -1;
        }
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
            len = asciilib_InsertThousandsGrouping(
                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        else
            len = ucs1lib_InsertThousandsGrouping(
                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_2BYTE_KIND:
        len = ucs2lib_InsertThousandsGrouping(
            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        len = ucs4lib_InsertThousandsGrouping(
            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
        break;
    default:
        /* NOTE(review): this early return skips the PyMem_Free() calls
           below; only matters if an unknown kind can actually occur. */
        assert(0);
        return -1;
    }
    /* Release the temporary buffer created by the conversion above, if any. */
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind)
            PyMem_Free(thousands_sep_data);
        else
            PyMem_Free(data);
    }
    if (unicode == NULL) {
        *maxchar = 127;
        /* A length other than n_digits means something besides the digits
           contributes to the result, so the separator's maximum character
           is taken into account. */
        if (len != n_digits) {
            *maxchar = MAX_MAXCHAR(*maxchar,
                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
        }
    }
    return len;
}
8758
8759
/* Helper macro to fix up start/end slice values against len, in place:
     - an end greater than len is clamped to len;
     - a negative end has len added, and is set to 0 if still negative;
     - a negative start has len added, and is set to 0 if still negative.
   start is not clamped against len or end, so start may still be greater
   than end afterwards and callers have to cope with that.

   The expansion is a sequence of plain if statements (it is not wrapped in
   do { } while (0)) and it evaluates its arguments more than once; start
   and end must be modifiable lvalues. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
8774
8775Py_ssize_t
8776PyUnicode_Count(PyObject *str,
8777                PyObject *substr,
8778                Py_ssize_t start,
8779                Py_ssize_t end)
8780{
8781    Py_ssize_t result;
8782    PyObject* str_obj;
8783    PyObject* sub_obj;
8784    int kind1, kind2, kind;
8785    void *buf1 = NULL, *buf2 = NULL;
8786    Py_ssize_t len1, len2;
8787
8788    str_obj = PyUnicode_FromObject(str);
8789    if (!str_obj)
8790        return -1;
8791    sub_obj = PyUnicode_FromObject(substr);
8792    if (!sub_obj) {
8793        Py_DECREF(str_obj);
8794        return -1;
8795    }
8796    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
8797        Py_DECREF(sub_obj);
8798        Py_DECREF(str_obj);
8799        return -1;
8800    }
8801
8802    kind1 = PyUnicode_KIND(str_obj);
8803    kind2 = PyUnicode_KIND(sub_obj);
8804    kind = kind1;
8805    buf1 = PyUnicode_DATA(str_obj);
8806    buf2 = PyUnicode_DATA(sub_obj);
8807    if (kind2 != kind) {
8808        if (kind2 > kind) {
8809            Py_DECREF(sub_obj);
8810            Py_DECREF(str_obj);
8811            return 0;
8812        }
8813        buf2 = _PyUnicode_AsKind(sub_obj, kind);
8814    }
8815    if (!buf2)
8816        goto onError;
8817    len1 = PyUnicode_GET_LENGTH(str_obj);
8818    len2 = PyUnicode_GET_LENGTH(sub_obj);
8819
8820    ADJUST_INDICES(start, end, len1);
8821    switch (kind) {
8822    case PyUnicode_1BYTE_KIND:
8823        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8824            result = asciilib_count(
8825                ((Py_UCS1*)buf1) + start, end - start,
8826                buf2, len2, PY_SSIZE_T_MAX
8827                );
8828        else
8829            result = ucs1lib_count(
8830                ((Py_UCS1*)buf1) + start, end - start,
8831                buf2, len2, PY_SSIZE_T_MAX
8832                );
8833        break;
8834    case PyUnicode_2BYTE_KIND:
8835        result = ucs2lib_count(
8836            ((Py_UCS2*)buf1) + start, end - start,
8837            buf2, len2, PY_SSIZE_T_MAX
8838            );
8839        break;
8840    case PyUnicode_4BYTE_KIND:
8841        result = ucs4lib_count(
8842            ((Py_UCS4*)buf1) + start, end - start,
8843            buf2, len2, PY_SSIZE_T_MAX
8844            );
8845        break;
8846    default:
8847        assert(0); result = 0;
8848    }
8849
8850    Py_DECREF(sub_obj);
8851    Py_DECREF(str_obj);
8852
8853    if (kind2 != kind)
8854        PyMem_Free(buf2);
8855
8856    return result;
8857  onError:
8858    Py_DECREF(sub_obj);
8859    Py_DECREF(str_obj);
8860    if (kind2 != kind && buf2)
8861        PyMem_Free(buf2);
8862    return -1;
8863}
8864
8865Py_ssize_t
8866PyUnicode_Find(PyObject *str,
8867               PyObject *sub,
8868               Py_ssize_t start,
8869               Py_ssize_t end,
8870               int direction)
8871{
8872    Py_ssize_t result;
8873
8874    str = PyUnicode_FromObject(str);
8875    if (!str)
8876        return -2;
8877    sub = PyUnicode_FromObject(sub);
8878    if (!sub) {
8879        Py_DECREF(str);
8880        return -2;
8881    }
8882    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8883        Py_DECREF(sub);
8884        Py_DECREF(str);
8885        return -2;
8886    }
8887
8888    result = any_find_slice(direction,
8889        str, sub, start, end
8890        );
8891
8892    Py_DECREF(str);
8893    Py_DECREF(sub);
8894
8895    return result;
8896}
8897
8898Py_ssize_t
8899PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8900                   Py_ssize_t start, Py_ssize_t end,
8901                   int direction)
8902{
8903    int kind;
8904    Py_ssize_t result;
8905    if (PyUnicode_READY(str) == -1)
8906        return -2;
8907    if (start < 0 || end < 0) {
8908        PyErr_SetString(PyExc_IndexError, "string index out of range");
8909        return -2;
8910    }
8911    if (end > PyUnicode_GET_LENGTH(str))
8912        end = PyUnicode_GET_LENGTH(str);
8913    kind = PyUnicode_KIND(str);
8914    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8915                      kind, end-start, ch, direction);
8916    if (result == -1)
8917        return -1;
8918    else
8919        return start + result;
8920}
8921
8922static int
8923tailmatch(PyObject *self,
8924          PyObject *substring,
8925          Py_ssize_t start,
8926          Py_ssize_t end,
8927          int direction)
8928{
8929    int kind_self;
8930    int kind_sub;
8931    void *data_self;
8932    void *data_sub;
8933    Py_ssize_t offset;
8934    Py_ssize_t i;
8935    Py_ssize_t end_sub;
8936
8937    if (PyUnicode_READY(self) == -1 ||
8938        PyUnicode_READY(substring) == -1)
8939        return 0;
8940
8941    if (PyUnicode_GET_LENGTH(substring) == 0)
8942        return 1;
8943
8944    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8945    end -= PyUnicode_GET_LENGTH(substring);
8946    if (end < start)
8947        return 0;
8948
8949    kind_self = PyUnicode_KIND(self);
8950    data_self = PyUnicode_DATA(self);
8951    kind_sub = PyUnicode_KIND(substring);
8952    data_sub = PyUnicode_DATA(substring);
8953    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8954
8955    if (direction > 0)
8956        offset = end;
8957    else
8958        offset = start;
8959
8960    if (PyUnicode_READ(kind_self, data_self, offset) ==
8961        PyUnicode_READ(kind_sub, data_sub, 0) &&
8962        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8963        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8964        /* If both are of the same kind, memcmp is sufficient */
8965        if (kind_self == kind_sub) {
8966            return ! memcmp((char *)data_self +
8967                                (offset * PyUnicode_KIND(substring)),
8968                            data_sub,
8969                            PyUnicode_GET_LENGTH(substring) *
8970                                PyUnicode_KIND(substring));
8971        }
8972        /* otherwise we have to compare each character by first accesing it */
8973        else {
8974            /* We do not need to compare 0 and len(substring)-1 because
8975               the if statement above ensured already that they are equal
8976               when we end up here. */
8977            /* TODO: honor direction and do a forward or backwards search */
8978            for (i = 1; i < end_sub; ++i) {
8979                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8980                    PyUnicode_READ(kind_sub, data_sub, i))
8981                    return 0;
8982            }
8983            return 1;
8984        }
8985    }
8986
8987    return 0;
8988}
8989
8990Py_ssize_t
8991PyUnicode_Tailmatch(PyObject *str,
8992                    PyObject *substr,
8993                    Py_ssize_t start,
8994                    Py_ssize_t end,
8995                    int direction)
8996{
8997    Py_ssize_t result;
8998
8999    str = PyUnicode_FromObject(str);
9000    if (str == NULL)
9001        return -1;
9002    substr = PyUnicode_FromObject(substr);
9003    if (substr == NULL) {
9004        Py_DECREF(str);
9005        return -1;
9006    }
9007
9008    result = tailmatch(str, substr,
9009                       start, end, direction);
9010    Py_DECREF(str);
9011    Py_DECREF(substr);
9012    return result;
9013}
9014
9015/* Apply fixfct filter to the Unicode object self and return a
9016   reference to the modified object */
9017
9018static PyObject *
9019fixup(PyObject *self,
9020      Py_UCS4 (*fixfct)(PyObject *s))
9021{
9022    PyObject *u;
9023    Py_UCS4 maxchar_old, maxchar_new = 0;
9024    PyObject *v;
9025
9026    u = _PyUnicode_Copy(self);
9027    if (u == NULL)
9028        return NULL;
9029    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9030
9031    /* fix functions return the new maximum character in a string,
9032       if the kind of the resulting unicode object does not change,
9033       everything is fine.  Otherwise we need to change the string kind
9034       and re-run the fix function. */
9035    maxchar_new = fixfct(u);
9036
9037    if (maxchar_new == 0) {
9038        /* no changes */;
9039        if (PyUnicode_CheckExact(self)) {
9040            Py_DECREF(u);
9041            Py_INCREF(self);
9042            return self;
9043        }
9044        else
9045            return u;
9046    }
9047
9048    maxchar_new = align_maxchar(maxchar_new);
9049
9050    if (maxchar_new == maxchar_old)
9051        return u;
9052
9053    /* In case the maximum character changed, we need to
9054       convert the string to the new category. */
9055    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9056    if (v == NULL) {
9057        Py_DECREF(u);
9058        return NULL;
9059    }
9060    if (maxchar_new > maxchar_old) {
9061        /* If the maxchar increased so that the kind changed, not all
9062           characters are representable anymore and we need to fix the
9063           string again. This only happens in very few cases. */
9064        _PyUnicode_FastCopyCharacters(v, 0,
9065                                      self, 0, PyUnicode_GET_LENGTH(self));
9066        maxchar_old = fixfct(v);
9067        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9068    }
9069    else {
9070        _PyUnicode_FastCopyCharacters(v, 0,
9071                                      u, 0, PyUnicode_GET_LENGTH(self));
9072    }
9073    Py_DECREF(u);
9074    assert(_PyUnicode_CheckConsistency(v, 1));
9075    return v;
9076}
9077
9078static PyObject *
9079ascii_upper_or_lower(PyObject *self, int lower)
9080{
9081    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9082    char *resdata, *data = PyUnicode_DATA(self);
9083    PyObject *res;
9084
9085    res = PyUnicode_New(len, 127);
9086    if (res == NULL)
9087        return NULL;
9088    resdata = PyUnicode_DATA(res);
9089    if (lower)
9090        _Py_bytes_lower(resdata, data, len);
9091    else
9092        _Py_bytes_upper(resdata, data, len);
9093    return res;
9094}
9095
9096static Py_UCS4
9097handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9098{
9099    Py_ssize_t j;
9100    int final_sigma;
9101    Py_UCS4 c;
9102    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9103
9104     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9105
9106    where ! is a negation and \p{xxx} is a character with property xxx.
9107    */
9108    for (j = i - 1; j >= 0; j--) {
9109        c = PyUnicode_READ(kind, data, j);
9110        if (!_PyUnicode_IsCaseIgnorable(c))
9111            break;
9112    }
9113    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9114    if (final_sigma) {
9115        for (j = i + 1; j < length; j++) {
9116            c = PyUnicode_READ(kind, data, j);
9117            if (!_PyUnicode_IsCaseIgnorable(c))
9118                break;
9119        }
9120        final_sigma = j == length || !_PyUnicode_IsCased(c);
9121    }
9122    return (final_sigma) ? 0x3C2 : 0x3C3;
9123}
9124
9125static int
9126lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9127           Py_UCS4 c, Py_UCS4 *mapped)
9128{
9129    /* Obscure special case. */
9130    if (c == 0x3A3) {
9131        mapped[0] = handle_capital_sigma(kind, data, length, i);
9132        return 1;
9133    }
9134    return _PyUnicode_ToLowerFull(c, mapped);
9135}
9136
9137static Py_ssize_t
9138do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9139{
9140    Py_ssize_t i, k = 0;
9141    int n_res, j;
9142    Py_UCS4 c, mapped[3];
9143
9144    c = PyUnicode_READ(kind, data, 0);
9145    n_res = _PyUnicode_ToUpperFull(c, mapped);
9146    for (j = 0; j < n_res; j++) {
9147        *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9148        res[k++] = mapped[j];
9149    }
9150    for (i = 1; i < length; i++) {
9151        c = PyUnicode_READ(kind, data, i);
9152        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9153        for (j = 0; j < n_res; j++) {
9154            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9155            res[k++] = mapped[j];
9156        }
9157    }
9158    return k;
9159}
9160
9161static Py_ssize_t
9162do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9163    Py_ssize_t i, k = 0;
9164
9165    for (i = 0; i < length; i++) {
9166        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9167        int n_res, j;
9168        if (Py_UNICODE_ISUPPER(c)) {
9169            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9170        }
9171        else if (Py_UNICODE_ISLOWER(c)) {
9172            n_res = _PyUnicode_ToUpperFull(c, mapped);
9173        }
9174        else {
9175            n_res = 1;
9176            mapped[0] = c;
9177        }
9178        for (j = 0; j < n_res; j++) {
9179            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9180            res[k++] = mapped[j];
9181        }
9182    }
9183    return k;
9184}
9185
9186static Py_ssize_t
9187do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9188                  Py_UCS4 *maxchar, int lower)
9189{
9190    Py_ssize_t i, k = 0;
9191
9192    for (i = 0; i < length; i++) {
9193        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9194        int n_res, j;
9195        if (lower)
9196            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9197        else
9198            n_res = _PyUnicode_ToUpperFull(c, mapped);
9199        for (j = 0; j < n_res; j++) {
9200            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9201            res[k++] = mapped[j];
9202        }
9203    }
9204    return k;
9205}
9206
9207static Py_ssize_t
9208do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9209{
9210    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9211}
9212
9213static Py_ssize_t
9214do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9215{
9216    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9217}
9218
9219static Py_ssize_t
9220do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9221{
9222    Py_ssize_t i, k = 0;
9223
9224    for (i = 0; i < length; i++) {
9225        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9226        Py_UCS4 mapped[3];
9227        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9228        for (j = 0; j < n_res; j++) {
9229            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9230            res[k++] = mapped[j];
9231        }
9232    }
9233    return k;
9234}
9235
9236static Py_ssize_t
9237do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9238{
9239    Py_ssize_t i, k = 0;
9240    int previous_is_cased;
9241
9242    previous_is_cased = 0;
9243    for (i = 0; i < length; i++) {
9244        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9245        Py_UCS4 mapped[3];
9246        int n_res, j;
9247
9248        if (previous_is_cased)
9249            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9250        else
9251            n_res = _PyUnicode_ToTitleFull(c, mapped);
9252
9253        for (j = 0; j < n_res; j++) {
9254            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9255            res[k++] = mapped[j];
9256        }
9257
9258        previous_is_cased = _PyUnicode_IsCased(c);
9259    }
9260    return k;
9261}
9262
9263static PyObject *
9264case_operation(PyObject *self,
9265               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9266{
9267    PyObject *res = NULL;
9268    Py_ssize_t length, newlength = 0;
9269    int kind, outkind;
9270    void *data, *outdata;
9271    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9272
9273    assert(PyUnicode_IS_READY(self));
9274
9275    kind = PyUnicode_KIND(self);
9276    data = PyUnicode_DATA(self);
9277    length = PyUnicode_GET_LENGTH(self);
9278    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9279    if (tmp == NULL)
9280        return PyErr_NoMemory();
9281    newlength = perform(kind, data, length, tmp, &maxchar);
9282    res = PyUnicode_New(newlength, maxchar);
9283    if (res == NULL)
9284        goto leave;
9285    tmpend = tmp + newlength;
9286    outdata = PyUnicode_DATA(res);
9287    outkind = PyUnicode_KIND(res);
9288    switch (outkind) {
9289    case PyUnicode_1BYTE_KIND:
9290        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9291        break;
9292    case PyUnicode_2BYTE_KIND:
9293        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9294        break;
9295    case PyUnicode_4BYTE_KIND:
9296        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9297        break;
9298    default:
9299        assert(0);
9300        break;
9301    }
9302  leave:
9303    PyMem_FREE(tmp);
9304    return res;
9305}
9306
9307PyObject *
9308PyUnicode_Join(PyObject *separator, PyObject *seq)
9309{
9310    PyObject *sep = NULL;
9311    Py_ssize_t seplen;
9312    PyObject *res = NULL; /* the result */
9313    PyObject *fseq;          /* PySequence_Fast(seq) */
9314    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9315    PyObject **items;
9316    PyObject *item;
9317    Py_ssize_t sz, i, res_offset;
9318    Py_UCS4 maxchar;
9319    Py_UCS4 item_maxchar;
9320    int use_memcpy;
9321    unsigned char *res_data = NULL, *sep_data = NULL;
9322    PyObject *last_obj;
9323    unsigned int kind = 0;
9324
9325    fseq = PySequence_Fast(seq, "");
9326    if (fseq == NULL) {
9327        return NULL;
9328    }
9329
9330    /* NOTE: the following code can't call back into Python code,
9331     * so we are sure that fseq won't be mutated.
9332     */
9333
9334    seqlen = PySequence_Fast_GET_SIZE(fseq);
9335    /* If empty sequence, return u"". */
9336    if (seqlen == 0) {
9337        Py_DECREF(fseq);
9338        Py_INCREF(unicode_empty);
9339        res = unicode_empty;
9340        return res;
9341    }
9342
9343    /* If singleton sequence with an exact Unicode, return that. */
9344    last_obj = NULL;
9345    items = PySequence_Fast_ITEMS(fseq);
9346    if (seqlen == 1) {
9347        if (PyUnicode_CheckExact(items[0])) {
9348            res = items[0];
9349            Py_INCREF(res);
9350            Py_DECREF(fseq);
9351            return res;
9352        }
9353        seplen = 0;
9354        maxchar = 0;
9355    }
9356    else {
9357        /* Set up sep and seplen */
9358        if (separator == NULL) {
9359            /* fall back to a blank space separator */
9360            sep = PyUnicode_FromOrdinal(' ');
9361            if (!sep)
9362                goto onError;
9363            seplen = 1;
9364            maxchar = 32;
9365        }
9366        else {
9367            if (!PyUnicode_Check(separator)) {
9368                PyErr_Format(PyExc_TypeError,
9369                             "separator: expected str instance,"
9370                             " %.80s found",
9371                             Py_TYPE(separator)->tp_name);
9372                goto onError;
9373            }
9374            if (PyUnicode_READY(separator))
9375                goto onError;
9376            sep = separator;
9377            seplen = PyUnicode_GET_LENGTH(separator);
9378            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9379            /* inc refcount to keep this code path symmetric with the
9380               above case of a blank separator */
9381            Py_INCREF(sep);
9382        }
9383        last_obj = sep;
9384    }
9385
9386    /* There are at least two things to join, or else we have a subclass
9387     * of str in the sequence.
9388     * Do a pre-pass to figure out the total amount of space we'll
9389     * need (sz), and see whether all argument are strings.
9390     */
9391    sz = 0;
9392#ifdef Py_DEBUG
9393    use_memcpy = 0;
9394#else
9395    use_memcpy = 1;
9396#endif
9397    for (i = 0; i < seqlen; i++) {
9398        const Py_ssize_t old_sz = sz;
9399        item = items[i];
9400        if (!PyUnicode_Check(item)) {
9401            PyErr_Format(PyExc_TypeError,
9402                         "sequence item %zd: expected str instance,"
9403                         " %.80s found",
9404                         i, Py_TYPE(item)->tp_name);
9405            goto onError;
9406        }
9407        if (PyUnicode_READY(item) == -1)
9408            goto onError;
9409        sz += PyUnicode_GET_LENGTH(item);
9410        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9411        maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
9412        if (i != 0)
9413            sz += seplen;
9414        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9415            PyErr_SetString(PyExc_OverflowError,
9416                            "join() result is too long for a Python string");
9417            goto onError;
9418        }
9419        if (use_memcpy && last_obj != NULL) {
9420            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9421                use_memcpy = 0;
9422        }
9423        last_obj = item;
9424    }
9425
9426    res = PyUnicode_New(sz, maxchar);
9427    if (res == NULL)
9428        goto onError;
9429
9430    /* Catenate everything. */
9431#ifdef Py_DEBUG
9432    use_memcpy = 0;
9433#else
9434    if (use_memcpy) {
9435        res_data = PyUnicode_1BYTE_DATA(res);
9436        kind = PyUnicode_KIND(res);
9437        if (seplen != 0)
9438            sep_data = PyUnicode_1BYTE_DATA(sep);
9439    }
9440#endif
9441    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9442        Py_ssize_t itemlen;
9443        item = items[i];
9444        /* Copy item, and maybe the separator. */
9445        if (i && seplen != 0) {
9446            if (use_memcpy) {
9447                Py_MEMCPY(res_data,
9448                          sep_data,
9449                          kind * seplen);
9450                res_data += kind * seplen;
9451            }
9452            else {
9453                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9454                res_offset += seplen;
9455            }
9456        }
9457        itemlen = PyUnicode_GET_LENGTH(item);
9458        if (itemlen != 0) {
9459            if (use_memcpy) {
9460                Py_MEMCPY(res_data,
9461                          PyUnicode_DATA(item),
9462                          kind * itemlen);
9463                res_data += kind * itemlen;
9464            }
9465            else {
9466                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9467                res_offset += itemlen;
9468            }
9469        }
9470    }
9471    if (use_memcpy)
9472        assert(res_data == PyUnicode_1BYTE_DATA(res)
9473                           + kind * PyUnicode_GET_LENGTH(res));
9474    else
9475        assert(res_offset == PyUnicode_GET_LENGTH(res));
9476
9477    Py_DECREF(fseq);
9478    Py_XDECREF(sep);
9479    assert(_PyUnicode_CheckConsistency(res, 1));
9480    return res;
9481
9482  onError:
9483    Py_DECREF(fseq);
9484    Py_XDECREF(sep);
9485    Py_XDECREF(res);
9486    return NULL;
9487}
9488
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at index `start`.  `kind` selects the width of one code unit
   (1, 2 or 4 bytes); PyUnicode_WCHAR_KIND is rejected by an assertion.

   The 1-byte case uses memset(); the wider cases use a plain loop.
   `value` and `length` are expanded more than once, so callers must not
   pass expressions with side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9512
9513void
9514_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9515                    Py_UCS4 fill_char)
9516{
9517    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9518    const void *data = PyUnicode_DATA(unicode);
9519    assert(PyUnicode_IS_READY(unicode));
9520    assert(unicode_modifiable(unicode));
9521    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9522    assert(start >= 0);
9523    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9524    FILL(kind, data, fill_char, start, length);
9525}
9526
9527Py_ssize_t
9528PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9529               Py_UCS4 fill_char)
9530{
9531    Py_ssize_t maxlen;
9532
9533    if (!PyUnicode_Check(unicode)) {
9534        PyErr_BadInternalCall();
9535        return -1;
9536    }
9537    if (PyUnicode_READY(unicode) == -1)
9538        return -1;
9539    if (unicode_check_modifiable(unicode))
9540        return -1;
9541
9542    if (start < 0) {
9543        PyErr_SetString(PyExc_IndexError, "string index out of range");
9544        return -1;
9545    }
9546    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9547        PyErr_SetString(PyExc_ValueError,
9548                         "fill character is bigger than "
9549                         "the string maximum character");
9550        return -1;
9551    }
9552
9553    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9554    length = Py_MIN(maxlen, length);
9555    if (length <= 0)
9556        return 0;
9557
9558    _PyUnicode_FastFill(unicode, start, length, fill_char);
9559    return length;
9560}
9561
9562static PyObject *
9563pad(PyObject *self,
9564    Py_ssize_t left,
9565    Py_ssize_t right,
9566    Py_UCS4 fill)
9567{
9568    PyObject *u;
9569    Py_UCS4 maxchar;
9570    int kind;
9571    void *data;
9572
9573    if (left < 0)
9574        left = 0;
9575    if (right < 0)
9576        right = 0;
9577
9578    if (left == 0 && right == 0)
9579        return unicode_result_unchanged(self);
9580
9581    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9582        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9583        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9584        return NULL;
9585    }
9586    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9587    maxchar = MAX_MAXCHAR(maxchar, fill);
9588    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9589    if (!u)
9590        return NULL;
9591
9592    kind = PyUnicode_KIND(u);
9593    data = PyUnicode_DATA(u);
9594    if (left)
9595        FILL(kind, data, fill, 0, left);
9596    if (right)
9597        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9598    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9599    assert(_PyUnicode_CheckConsistency(u, 1));
9600    return u;
9601}
9602
9603PyObject *
9604PyUnicode_Splitlines(PyObject *string, int keepends)
9605{
9606    PyObject *list;
9607
9608    string = PyUnicode_FromObject(string);
9609    if (string == NULL)
9610        return NULL;
9611    if (PyUnicode_READY(string) == -1) {
9612        Py_DECREF(string);
9613        return NULL;
9614    }
9615
9616    switch (PyUnicode_KIND(string)) {
9617    case PyUnicode_1BYTE_KIND:
9618        if (PyUnicode_IS_ASCII(string))
9619            list = asciilib_splitlines(
9620                string, PyUnicode_1BYTE_DATA(string),
9621                PyUnicode_GET_LENGTH(string), keepends);
9622        else
9623            list = ucs1lib_splitlines(
9624                string, PyUnicode_1BYTE_DATA(string),
9625                PyUnicode_GET_LENGTH(string), keepends);
9626        break;
9627    case PyUnicode_2BYTE_KIND:
9628        list = ucs2lib_splitlines(
9629            string, PyUnicode_2BYTE_DATA(string),
9630            PyUnicode_GET_LENGTH(string), keepends);
9631        break;
9632    case PyUnicode_4BYTE_KIND:
9633        list = ucs4lib_splitlines(
9634            string, PyUnicode_4BYTE_DATA(string),
9635            PyUnicode_GET_LENGTH(string), keepends);
9636        break;
9637    default:
9638        assert(0);
9639        list = 0;
9640    }
9641    Py_DECREF(string);
9642    return list;
9643}
9644
9645static PyObject *
9646split(PyObject *self,
9647      PyObject *substring,
9648      Py_ssize_t maxcount)
9649{
9650    int kind1, kind2, kind;
9651    void *buf1, *buf2;
9652    Py_ssize_t len1, len2;
9653    PyObject* out;
9654
9655    if (maxcount < 0)
9656        maxcount = PY_SSIZE_T_MAX;
9657
9658    if (PyUnicode_READY(self) == -1)
9659        return NULL;
9660
9661    if (substring == NULL)
9662        switch (PyUnicode_KIND(self)) {
9663        case PyUnicode_1BYTE_KIND:
9664            if (PyUnicode_IS_ASCII(self))
9665                return asciilib_split_whitespace(
9666                    self,  PyUnicode_1BYTE_DATA(self),
9667                    PyUnicode_GET_LENGTH(self), maxcount
9668                    );
9669            else
9670                return ucs1lib_split_whitespace(
9671                    self,  PyUnicode_1BYTE_DATA(self),
9672                    PyUnicode_GET_LENGTH(self), maxcount
9673                    );
9674        case PyUnicode_2BYTE_KIND:
9675            return ucs2lib_split_whitespace(
9676                self,  PyUnicode_2BYTE_DATA(self),
9677                PyUnicode_GET_LENGTH(self), maxcount
9678                );
9679        case PyUnicode_4BYTE_KIND:
9680            return ucs4lib_split_whitespace(
9681                self,  PyUnicode_4BYTE_DATA(self),
9682                PyUnicode_GET_LENGTH(self), maxcount
9683                );
9684        default:
9685            assert(0);
9686            return NULL;
9687        }
9688
9689    if (PyUnicode_READY(substring) == -1)
9690        return NULL;
9691
9692    kind1 = PyUnicode_KIND(self);
9693    kind2 = PyUnicode_KIND(substring);
9694    kind = kind1 > kind2 ? kind1 : kind2;
9695    buf1 = PyUnicode_DATA(self);
9696    buf2 = PyUnicode_DATA(substring);
9697    if (kind1 != kind)
9698        buf1 = _PyUnicode_AsKind(self, kind);
9699    if (!buf1)
9700        return NULL;
9701    if (kind2 != kind)
9702        buf2 = _PyUnicode_AsKind(substring, kind);
9703    if (!buf2) {
9704        if (kind1 != kind) PyMem_Free(buf1);
9705        return NULL;
9706    }
9707    len1 = PyUnicode_GET_LENGTH(self);
9708    len2 = PyUnicode_GET_LENGTH(substring);
9709
9710    switch (kind) {
9711    case PyUnicode_1BYTE_KIND:
9712        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9713            out = asciilib_split(
9714                self,  buf1, len1, buf2, len2, maxcount);
9715        else
9716            out = ucs1lib_split(
9717                self,  buf1, len1, buf2, len2, maxcount);
9718        break;
9719    case PyUnicode_2BYTE_KIND:
9720        out = ucs2lib_split(
9721            self,  buf1, len1, buf2, len2, maxcount);
9722        break;
9723    case PyUnicode_4BYTE_KIND:
9724        out = ucs4lib_split(
9725            self,  buf1, len1, buf2, len2, maxcount);
9726        break;
9727    default:
9728        out = NULL;
9729    }
9730    if (kind1 != kind)
9731        PyMem_Free(buf1);
9732    if (kind2 != kind)
9733        PyMem_Free(buf2);
9734    return out;
9735}
9736
9737static PyObject *
9738rsplit(PyObject *self,
9739       PyObject *substring,
9740       Py_ssize_t maxcount)
9741{
9742    int kind1, kind2, kind;
9743    void *buf1, *buf2;
9744    Py_ssize_t len1, len2;
9745    PyObject* out;
9746
9747    if (maxcount < 0)
9748        maxcount = PY_SSIZE_T_MAX;
9749
9750    if (PyUnicode_READY(self) == -1)
9751        return NULL;
9752
9753    if (substring == NULL)
9754        switch (PyUnicode_KIND(self)) {
9755        case PyUnicode_1BYTE_KIND:
9756            if (PyUnicode_IS_ASCII(self))
9757                return asciilib_rsplit_whitespace(
9758                    self,  PyUnicode_1BYTE_DATA(self),
9759                    PyUnicode_GET_LENGTH(self), maxcount
9760                    );
9761            else
9762                return ucs1lib_rsplit_whitespace(
9763                    self,  PyUnicode_1BYTE_DATA(self),
9764                    PyUnicode_GET_LENGTH(self), maxcount
9765                    );
9766        case PyUnicode_2BYTE_KIND:
9767            return ucs2lib_rsplit_whitespace(
9768                self,  PyUnicode_2BYTE_DATA(self),
9769                PyUnicode_GET_LENGTH(self), maxcount
9770                );
9771        case PyUnicode_4BYTE_KIND:
9772            return ucs4lib_rsplit_whitespace(
9773                self,  PyUnicode_4BYTE_DATA(self),
9774                PyUnicode_GET_LENGTH(self), maxcount
9775                );
9776        default:
9777            assert(0);
9778            return NULL;
9779        }
9780
9781    if (PyUnicode_READY(substring) == -1)
9782        return NULL;
9783
9784    kind1 = PyUnicode_KIND(self);
9785    kind2 = PyUnicode_KIND(substring);
9786    kind = kind1 > kind2 ? kind1 : kind2;
9787    buf1 = PyUnicode_DATA(self);
9788    buf2 = PyUnicode_DATA(substring);
9789    if (kind1 != kind)
9790        buf1 = _PyUnicode_AsKind(self, kind);
9791    if (!buf1)
9792        return NULL;
9793    if (kind2 != kind)
9794        buf2 = _PyUnicode_AsKind(substring, kind);
9795    if (!buf2) {
9796        if (kind1 != kind) PyMem_Free(buf1);
9797        return NULL;
9798    }
9799    len1 = PyUnicode_GET_LENGTH(self);
9800    len2 = PyUnicode_GET_LENGTH(substring);
9801
9802    switch (kind) {
9803    case PyUnicode_1BYTE_KIND:
9804        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9805            out = asciilib_rsplit(
9806                self,  buf1, len1, buf2, len2, maxcount);
9807        else
9808            out = ucs1lib_rsplit(
9809                self,  buf1, len1, buf2, len2, maxcount);
9810        break;
9811    case PyUnicode_2BYTE_KIND:
9812        out = ucs2lib_rsplit(
9813            self,  buf1, len1, buf2, len2, maxcount);
9814        break;
9815    case PyUnicode_4BYTE_KIND:
9816        out = ucs4lib_rsplit(
9817            self,  buf1, len1, buf2, len2, maxcount);
9818        break;
9819    default:
9820        out = NULL;
9821    }
9822    if (kind1 != kind)
9823        PyMem_Free(buf1);
9824    if (kind2 != kind)
9825        PyMem_Free(buf2);
9826    return out;
9827}
9828
9829static Py_ssize_t
9830anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9831            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9832{
9833    switch (kind) {
9834    case PyUnicode_1BYTE_KIND:
9835        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9836            return asciilib_find(buf1, len1, buf2, len2, offset);
9837        else
9838            return ucs1lib_find(buf1, len1, buf2, len2, offset);
9839    case PyUnicode_2BYTE_KIND:
9840        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9841    case PyUnicode_4BYTE_KIND:
9842        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9843    }
9844    assert(0);
9845    return -1;
9846}
9847
9848static Py_ssize_t
9849anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9850             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9851{
9852    switch (kind) {
9853    case PyUnicode_1BYTE_KIND:
9854        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9855            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9856        else
9857            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9858    case PyUnicode_2BYTE_KIND:
9859        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9860    case PyUnicode_4BYTE_KIND:
9861        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9862    }
9863    assert(0);
9864    return 0;
9865}
9866
9867static PyObject *
9868replace(PyObject *self, PyObject *str1,
9869        PyObject *str2, Py_ssize_t maxcount)
9870{
9871    PyObject *u;
9872    char *sbuf = PyUnicode_DATA(self);
9873    char *buf1 = PyUnicode_DATA(str1);
9874    char *buf2 = PyUnicode_DATA(str2);
9875    int srelease = 0, release1 = 0, release2 = 0;
9876    int skind = PyUnicode_KIND(self);
9877    int kind1 = PyUnicode_KIND(str1);
9878    int kind2 = PyUnicode_KIND(str2);
9879    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9880    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9881    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9882    int mayshrink;
9883    Py_UCS4 maxchar, maxchar_str2;
9884
9885    if (maxcount < 0)
9886        maxcount = PY_SSIZE_T_MAX;
9887    else if (maxcount == 0 || slen == 0)
9888        goto nothing;
9889
9890    if (str1 == str2)
9891        goto nothing;
9892    if (skind < kind1)
9893        /* substring too wide to be present */
9894        goto nothing;
9895
9896    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9897    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9898    /* Replacing str1 with str2 may cause a maxchar reduction in the
9899       result string. */
9900    mayshrink = (maxchar_str2 < maxchar);
9901    maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
9902
9903    if (len1 == len2) {
9904        /* same length */
9905        if (len1 == 0)
9906            goto nothing;
9907        if (len1 == 1) {
9908            /* replace characters */
9909            Py_UCS4 u1, u2;
9910            int rkind;
9911            Py_ssize_t index, pos;
9912            char *src;
9913
9914            u1 = PyUnicode_READ_CHAR(str1, 0);
9915            pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9916            if (pos < 0)
9917                goto nothing;
9918            u2 = PyUnicode_READ_CHAR(str2, 0);
9919            u = PyUnicode_New(slen, maxchar);
9920            if (!u)
9921                goto error;
9922            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
9923            rkind = PyUnicode_KIND(u);
9924
9925            PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9926            index = 0;
9927            src = sbuf;
9928            while (--maxcount)
9929            {
9930                pos++;
9931                src += pos * PyUnicode_KIND(self);
9932                slen -= pos;
9933                index += pos;
9934                pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9935                if (pos < 0)
9936                    break;
9937                PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9938            }
9939        }
9940        else {
9941            int rkind = skind;
9942            char *res;
9943            Py_ssize_t i;
9944
9945            if (kind1 < rkind) {
9946                /* widen substring */
9947                buf1 = _PyUnicode_AsKind(str1, rkind);
9948                if (!buf1) goto error;
9949                release1 = 1;
9950            }
9951            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
9952            if (i < 0)
9953                goto nothing;
9954            if (rkind > kind2) {
9955                /* widen replacement */
9956                buf2 = _PyUnicode_AsKind(str2, rkind);
9957                if (!buf2) goto error;
9958                release2 = 1;
9959            }
9960            else if (rkind < kind2) {
9961                /* widen self and buf1 */
9962                rkind = kind2;
9963                if (release1) PyMem_Free(buf1);
9964                sbuf = _PyUnicode_AsKind(self, rkind);
9965                if (!sbuf) goto error;
9966                srelease = 1;
9967                buf1 = _PyUnicode_AsKind(str1, rkind);
9968                if (!buf1) goto error;
9969                release1 = 1;
9970            }
9971            u = PyUnicode_New(slen, maxchar);
9972            if (!u)
9973                goto error;
9974            assert(PyUnicode_KIND(u) == rkind);
9975            res = PyUnicode_DATA(u);
9976
9977            memcpy(res, sbuf, rkind * slen);
9978            /* change everything in-place, starting with this one */
9979            memcpy(res + rkind * i,
9980                   buf2,
9981                   rkind * len2);
9982            i += len1;
9983
9984            while ( --maxcount > 0) {
9985                i = anylib_find(rkind, self,
9986                                sbuf+rkind*i, slen-i,
9987                                str1, buf1, len1, i);
9988                if (i == -1)
9989                    break;
9990                memcpy(res + rkind * i,
9991                       buf2,
9992                       rkind * len2);
9993                i += len1;
9994            }
9995        }
9996    }
9997    else {
9998        Py_ssize_t n, i, j, ires;
9999        Py_ssize_t new_size;
10000        int rkind = skind;
10001        char *res;
10002
10003        if (kind1 < rkind) {
10004            /* widen substring */
10005            buf1 = _PyUnicode_AsKind(str1, rkind);
10006            if (!buf1) goto error;
10007            release1 = 1;
10008        }
10009        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10010        if (n == 0)
10011            goto nothing;
10012        if (kind2 < rkind) {
10013            /* widen replacement */
10014            buf2 = _PyUnicode_AsKind(str2, rkind);
10015            if (!buf2) goto error;
10016            release2 = 1;
10017        }
10018        else if (kind2 > rkind) {
10019            /* widen self and buf1 */
10020            rkind = kind2;
10021            sbuf = _PyUnicode_AsKind(self, rkind);
10022            if (!sbuf) goto error;
10023            srelease = 1;
10024            if (release1) PyMem_Free(buf1);
10025            buf1 = _PyUnicode_AsKind(str1, rkind);
10026            if (!buf1) goto error;
10027            release1 = 1;
10028        }
10029        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10030           PyUnicode_GET_LENGTH(str1))); */
10031        if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10032                PyErr_SetString(PyExc_OverflowError,
10033                                "replace string is too long");
10034                goto error;
10035        }
10036        new_size = slen + n * (len2 - len1);
10037        if (new_size == 0) {
10038            Py_INCREF(unicode_empty);
10039            u = unicode_empty;
10040            goto done;
10041        }
10042        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10043            PyErr_SetString(PyExc_OverflowError,
10044                            "replace string is too long");
10045            goto error;
10046        }
10047        u = PyUnicode_New(new_size, maxchar);
10048        if (!u)
10049            goto error;
10050        assert(PyUnicode_KIND(u) == rkind);
10051        res = PyUnicode_DATA(u);
10052        ires = i = 0;
10053        if (len1 > 0) {
10054            while (n-- > 0) {
10055                /* look for next match */
10056                j = anylib_find(rkind, self,
10057                                sbuf + rkind * i, slen-i,
10058                                str1, buf1, len1, i);
10059                if (j == -1)
10060                    break;
10061                else if (j > i) {
10062                    /* copy unchanged part [i:j] */
10063                    memcpy(res + rkind * ires,
10064                           sbuf + rkind * i,
10065                           rkind * (j-i));
10066                    ires += j - i;
10067                }
10068                /* copy substitution string */
10069                if (len2 > 0) {
10070                    memcpy(res + rkind * ires,
10071                           buf2,
10072                           rkind * len2);
10073                    ires += len2;
10074                }
10075                i = j + len1;
10076            }
10077            if (i < slen)
10078                /* copy tail [i:] */
10079                memcpy(res + rkind * ires,
10080                       sbuf + rkind * i,
10081                       rkind * (slen-i));
10082        }
10083        else {
10084            /* interleave */
10085            while (n > 0) {
10086                memcpy(res + rkind * ires,
10087                       buf2,
10088                       rkind * len2);
10089                ires += len2;
10090                if (--n <= 0)
10091                    break;
10092                memcpy(res + rkind * ires,
10093                       sbuf + rkind * i,
10094                       rkind);
10095                ires++;
10096                i++;
10097            }
10098            memcpy(res + rkind * ires,
10099                   sbuf + rkind * i,
10100                   rkind * (slen-i));
10101        }
10102    }
10103
10104    if (mayshrink) {
10105        unicode_adjust_maxchar(&u);
10106        if (u == NULL)
10107            goto error;
10108    }
10109
10110  done:
10111    if (srelease)
10112        PyMem_FREE(sbuf);
10113    if (release1)
10114        PyMem_FREE(buf1);
10115    if (release2)
10116        PyMem_FREE(buf2);
10117    assert(_PyUnicode_CheckConsistency(u, 1));
10118    return u;
10119
10120  nothing:
10121    /* nothing to replace; return original string (when possible) */
10122    if (srelease)
10123        PyMem_FREE(sbuf);
10124    if (release1)
10125        PyMem_FREE(buf1);
10126    if (release2)
10127        PyMem_FREE(buf2);
10128    return unicode_result_unchanged(self);
10129
10130  error:
10131    if (srelease && sbuf)
10132        PyMem_FREE(sbuf);
10133    if (release1 && buf1)
10134        PyMem_FREE(buf1);
10135    if (release2 && buf2)
10136        PyMem_FREE(buf2);
10137    return NULL;
10138}
10139
10140/* --- Unicode Object Methods --------------------------------------------- */
10141
10142PyDoc_STRVAR(title__doc__,
10143             "S.title() -> str\n\
10144\n\
10145Return a titlecased version of S, i.e. words start with title case\n\
10146characters, all remaining cased characters have lower case.");
10147
10148static PyObject*
10149unicode_title(PyObject *self)
10150{
10151    if (PyUnicode_READY(self) == -1)
10152        return NULL;
10153    return case_operation(self, do_title);
10154}
10155
10156PyDoc_STRVAR(capitalize__doc__,
10157             "S.capitalize() -> str\n\
10158\n\
10159Return a capitalized version of S, i.e. make the first character\n\
10160have upper case and the rest lower case.");
10161
10162static PyObject*
10163unicode_capitalize(PyObject *self)
10164{
10165    if (PyUnicode_READY(self) == -1)
10166        return NULL;
10167    if (PyUnicode_GET_LENGTH(self) == 0)
10168        return unicode_result_unchanged(self);
10169    return case_operation(self, do_capitalize);
10170}
10171
10172PyDoc_STRVAR(casefold__doc__,
10173             "S.casefold() -> str\n\
10174\n\
10175Return a version of S suitable for caseless comparisons.");
10176
10177static PyObject *
10178unicode_casefold(PyObject *self)
10179{
10180    if (PyUnicode_READY(self) == -1)
10181        return NULL;
10182    if (PyUnicode_IS_ASCII(self))
10183        return ascii_upper_or_lower(self, 1);
10184    return case_operation(self, do_casefold);
10185}
10186
10187
10188/* Argument converter.  Coerces to a single unicode character */
10189
10190static int
10191convert_uc(PyObject *obj, void *addr)
10192{
10193    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10194    PyObject *uniobj;
10195
10196    uniobj = PyUnicode_FromObject(obj);
10197    if (uniobj == NULL) {
10198        PyErr_SetString(PyExc_TypeError,
10199                        "The fill character cannot be converted to Unicode");
10200        return 0;
10201    }
10202    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10203        PyErr_SetString(PyExc_TypeError,
10204                        "The fill character must be exactly one character long");
10205        Py_DECREF(uniobj);
10206        return 0;
10207    }
10208    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10209    Py_DECREF(uniobj);
10210    return 1;
10211}
10212
10213PyDoc_STRVAR(center__doc__,
10214             "S.center(width[, fillchar]) -> str\n\
10215\n\
10216Return S centered in a string of length width. Padding is\n\
10217done using the specified fill character (default is a space)");
10218
10219static PyObject *
10220unicode_center(PyObject *self, PyObject *args)
10221{
10222    Py_ssize_t marg, left;
10223    Py_ssize_t width;
10224    Py_UCS4 fillchar = ' ';
10225
10226    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10227        return NULL;
10228
10229    if (PyUnicode_READY(self) == -1)
10230        return NULL;
10231
10232    if (PyUnicode_GET_LENGTH(self) >= width)
10233        return unicode_result_unchanged(self);
10234
10235    marg = width - PyUnicode_GET_LENGTH(self);
10236    left = marg / 2 + (marg & width & 1);
10237
10238    return pad(self, left, marg - left, fillchar);
10239}
10240
10241/* This function assumes that str1 and str2 are readied by the caller. */
10242
10243static int
10244unicode_compare(PyObject *str1, PyObject *str2)
10245{
10246    int kind1, kind2;
10247    void *data1, *data2;
10248    Py_ssize_t len1, len2;
10249    Py_ssize_t i, len;
10250
10251    /* a string is equal to itself */
10252    if (str1 == str2)
10253        return 0;
10254
10255    kind1 = PyUnicode_KIND(str1);
10256    kind2 = PyUnicode_KIND(str2);
10257    data1 = PyUnicode_DATA(str1);
10258    data2 = PyUnicode_DATA(str2);
10259    len1 = PyUnicode_GET_LENGTH(str1);
10260    len2 = PyUnicode_GET_LENGTH(str2);
10261    len = Py_MIN(len1, len2);
10262
10263    if (kind1 == 1 && kind2 == 1) {
10264        int cmp = memcmp(data1, data2, len);
10265        /* normalize result of memcmp() into the range [-1; 1] */
10266        if (cmp < 0)
10267            return -1;
10268        if (cmp > 0)
10269            return 1;
10270    }
10271    else {
10272        for (i = 0; i < len; ++i) {
10273            Py_UCS4 c1, c2;
10274            c1 = PyUnicode_READ(kind1, data1, i);
10275            c2 = PyUnicode_READ(kind2, data2, i);
10276
10277            if (c1 != c2)
10278                return (c1 < c2) ? -1 : 1;
10279        }
10280    }
10281
10282    if (len1 == len2)
10283        return 0;
10284    if (len1 < len2)
10285        return -1;
10286    else
10287        return 1;
10288}
10289
10290static int
10291unicode_compare_eq(PyObject *str1, PyObject *str2)
10292{
10293    int kind;
10294    void *data1, *data2;
10295    Py_ssize_t len;
10296    int cmp;
10297
10298    /* a string is equal to itself */
10299    if (str1 == str2)
10300        return 1;
10301
10302    len = PyUnicode_GET_LENGTH(str1);
10303    if (PyUnicode_GET_LENGTH(str2) != len)
10304        return 0;
10305    kind = PyUnicode_KIND(str1);
10306    if (PyUnicode_KIND(str2) != kind)
10307        return 0;
10308    data1 = PyUnicode_DATA(str1);
10309    data2 = PyUnicode_DATA(str2);
10310
10311    cmp = memcmp(data1, data2, len * kind);
10312    return (cmp == 0);
10313}
10314
10315
10316int
10317PyUnicode_Compare(PyObject *left, PyObject *right)
10318{
10319    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10320        if (PyUnicode_READY(left) == -1 ||
10321            PyUnicode_READY(right) == -1)
10322            return -1;
10323        return unicode_compare(left, right);
10324    }
10325    PyErr_Format(PyExc_TypeError,
10326                 "Can't compare %.100s and %.100s",
10327                 left->ob_type->tp_name,
10328                 right->ob_type->tp_name);
10329    return -1;
10330}
10331
10332int
10333PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10334{
10335    Py_ssize_t i;
10336    int kind;
10337    void *data;
10338    Py_UCS4 chr;
10339
10340    assert(_PyUnicode_CHECK(uni));
10341    if (PyUnicode_READY(uni) == -1)
10342        return -1;
10343    kind = PyUnicode_KIND(uni);
10344    data = PyUnicode_DATA(uni);
10345    /* Compare Unicode string and source character set string */
10346    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10347        if (chr != str[i])
10348            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10349    /* This check keeps Python strings that end in '\0' from comparing equal
10350     to C strings identical up to that point. */
10351    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10352        return 1; /* uni is longer */
10353    if (str[i])
10354        return -1; /* str is longer */
10355    return 0;
10356}
10357
10358
10359#define TEST_COND(cond)                         \
10360    ((cond) ? Py_True : Py_False)
10361
10362PyObject *
10363PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10364{
10365    int result;
10366    PyObject *v;
10367
10368    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10369        Py_RETURN_NOTIMPLEMENTED;
10370
10371    if (PyUnicode_READY(left) == -1 ||
10372        PyUnicode_READY(right) == -1)
10373        return NULL;
10374
10375    if (op == Py_EQ || op == Py_NE) {
10376        result = unicode_compare_eq(left, right);
10377        if (op == Py_EQ)
10378            v = TEST_COND(result);
10379        else
10380            v = TEST_COND(!result);
10381    }
10382    else {
10383        result = unicode_compare(left, right);
10384
10385        /* Convert the return value to a Boolean */
10386        switch (op) {
10387        case Py_LE:
10388            v = TEST_COND(result <= 0);
10389            break;
10390        case Py_GE:
10391            v = TEST_COND(result >= 0);
10392            break;
10393        case Py_LT:
10394            v = TEST_COND(result == -1);
10395            break;
10396        case Py_GT:
10397            v = TEST_COND(result == 1);
10398            break;
10399        default:
10400            PyErr_BadArgument();
10401            return NULL;
10402        }
10403    }
10404    Py_INCREF(v);
10405    return v;
10406}
10407
10408int
10409PyUnicode_Contains(PyObject *container, PyObject *element)
10410{
10411    PyObject *str, *sub;
10412    int kind1, kind2, kind;
10413    void *buf1, *buf2;
10414    Py_ssize_t len1, len2;
10415    int result;
10416
10417    /* Coerce the two arguments */
10418    sub = PyUnicode_FromObject(element);
10419    if (!sub) {
10420        PyErr_Format(PyExc_TypeError,
10421                     "'in <string>' requires string as left operand, not %s",
10422                     element->ob_type->tp_name);
10423        return -1;
10424    }
10425
10426    str = PyUnicode_FromObject(container);
10427    if (!str) {
10428        Py_DECREF(sub);
10429        return -1;
10430    }
10431    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10432        Py_DECREF(sub);
10433        Py_DECREF(str);
10434    }
10435
10436    kind1 = PyUnicode_KIND(str);
10437    kind2 = PyUnicode_KIND(sub);
10438    kind = kind1;
10439    buf1 = PyUnicode_DATA(str);
10440    buf2 = PyUnicode_DATA(sub);
10441    if (kind2 != kind) {
10442        if (kind2 > kind) {
10443            Py_DECREF(sub);
10444            Py_DECREF(str);
10445            return 0;
10446        }
10447        buf2 = _PyUnicode_AsKind(sub, kind);
10448    }
10449    if (!buf2) {
10450        Py_DECREF(sub);
10451        Py_DECREF(str);
10452        return -1;
10453    }
10454    len1 = PyUnicode_GET_LENGTH(str);
10455    len2 = PyUnicode_GET_LENGTH(sub);
10456
10457    switch (kind) {
10458    case PyUnicode_1BYTE_KIND:
10459        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10460        break;
10461    case PyUnicode_2BYTE_KIND:
10462        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10463        break;
10464    case PyUnicode_4BYTE_KIND:
10465        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10466        break;
10467    default:
10468        result = -1;
10469        assert(0);
10470    }
10471
10472    Py_DECREF(str);
10473    Py_DECREF(sub);
10474
10475    if (kind2 != kind)
10476        PyMem_Free(buf2);
10477
10478    return result;
10479}
10480
10481/* Concat to string or Unicode object giving a new Unicode object. */
10482
10483PyObject *
10484PyUnicode_Concat(PyObject *left, PyObject *right)
10485{
10486    PyObject *u = NULL, *v = NULL, *w;
10487    Py_UCS4 maxchar, maxchar2;
10488    Py_ssize_t u_len, v_len, new_len;
10489
10490    /* Coerce the two arguments */
10491    u = PyUnicode_FromObject(left);
10492    if (u == NULL)
10493        goto onError;
10494    v = PyUnicode_FromObject(right);
10495    if (v == NULL)
10496        goto onError;
10497
10498    /* Shortcuts */
10499    if (v == unicode_empty) {
10500        Py_DECREF(v);
10501        return u;
10502    }
10503    if (u == unicode_empty) {
10504        Py_DECREF(u);
10505        return v;
10506    }
10507
10508    u_len = PyUnicode_GET_LENGTH(u);
10509    v_len = PyUnicode_GET_LENGTH(v);
10510    if (u_len > PY_SSIZE_T_MAX - v_len) {
10511        PyErr_SetString(PyExc_OverflowError,
10512                        "strings are too large to concat");
10513        goto onError;
10514    }
10515    new_len = u_len + v_len;
10516
10517    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10518    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10519    maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10520
10521    /* Concat the two Unicode strings */
10522    w = PyUnicode_New(new_len, maxchar);
10523    if (w == NULL)
10524        goto onError;
10525    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10526    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
10527    Py_DECREF(u);
10528    Py_DECREF(v);
10529    assert(_PyUnicode_CheckConsistency(w, 1));
10530    return w;
10531
10532  onError:
10533    Py_XDECREF(u);
10534    Py_XDECREF(v);
10535    return NULL;
10536}
10537
10538void
10539PyUnicode_Append(PyObject **p_left, PyObject *right)
10540{
10541    PyObject *left, *res;
10542    Py_UCS4 maxchar, maxchar2;
10543    Py_ssize_t left_len, right_len, new_len;
10544
10545    if (p_left == NULL) {
10546        if (!PyErr_Occurred())
10547            PyErr_BadInternalCall();
10548        return;
10549    }
10550    left = *p_left;
10551    if (right == NULL || !PyUnicode_Check(left)) {
10552        if (!PyErr_Occurred())
10553            PyErr_BadInternalCall();
10554        goto error;
10555    }
10556
10557    if (PyUnicode_READY(left) == -1)
10558        goto error;
10559    if (PyUnicode_READY(right) == -1)
10560        goto error;
10561
10562    /* Shortcuts */
10563    if (left == unicode_empty) {
10564        Py_DECREF(left);
10565        Py_INCREF(right);
10566        *p_left = right;
10567        return;
10568    }
10569    if (right == unicode_empty)
10570        return;
10571
10572    left_len = PyUnicode_GET_LENGTH(left);
10573    right_len = PyUnicode_GET_LENGTH(right);
10574    if (left_len > PY_SSIZE_T_MAX - right_len) {
10575        PyErr_SetString(PyExc_OverflowError,
10576                        "strings are too large to concat");
10577        goto error;
10578    }
10579    new_len = left_len + right_len;
10580
10581    if (unicode_modifiable(left)
10582        && PyUnicode_CheckExact(right)
10583        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
10584        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10585           to change the structure size, but characters are stored just after
10586           the structure, and so it requires to move all characters which is
10587           not so different than duplicating the string. */
10588        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10589    {
10590        /* append inplace */
10591        if (unicode_resize(p_left, new_len) != 0) {
10592            /* XXX if _PyUnicode_Resize() fails, 'left' has been
10593             * deallocated so it cannot be put back into
10594             * 'variable'.  The MemoryError is raised when there
10595             * is no value in 'variable', which might (very
10596             * remotely) be a cause of incompatibilities.
10597             */
10598            goto error;
10599        }
10600        /* copy 'right' into the newly allocated area of 'left' */
10601        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
10602    }
10603    else {
10604        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10605        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10606        maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10607
10608        /* Concat the two Unicode strings */
10609        res = PyUnicode_New(new_len, maxchar);
10610        if (res == NULL)
10611            goto error;
10612        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10613        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
10614        Py_DECREF(left);
10615        *p_left = res;
10616    }
10617    assert(_PyUnicode_CheckConsistency(*p_left, 1));
10618    return;
10619
10620error:
10621    Py_CLEAR(*p_left);
10622}
10623
10624void
10625PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10626{
10627    PyUnicode_Append(pleft, right);
10628    Py_XDECREF(right);
10629}
10630
10631PyDoc_STRVAR(count__doc__,
10632             "S.count(sub[, start[, end]]) -> int\n\
10633\n\
10634Return the number of non-overlapping occurrences of substring sub in\n\
10635string S[start:end].  Optional arguments start and end are\n\
10636interpreted as in slice notation.");
10637
10638static PyObject *
10639unicode_count(PyObject *self, PyObject *args)
10640{
10641    PyObject *substring;
10642    Py_ssize_t start = 0;
10643    Py_ssize_t end = PY_SSIZE_T_MAX;
10644    PyObject *result;
10645    int kind1, kind2, kind;
10646    void *buf1, *buf2;
10647    Py_ssize_t len1, len2, iresult;
10648
10649    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10650                                            &start, &end))
10651        return NULL;
10652
10653    kind1 = PyUnicode_KIND(self);
10654    kind2 = PyUnicode_KIND(substring);
10655    if (kind2 > kind1)
10656        return PyLong_FromLong(0);
10657    kind = kind1;
10658    buf1 = PyUnicode_DATA(self);
10659    buf2 = PyUnicode_DATA(substring);
10660    if (kind2 != kind)
10661        buf2 = _PyUnicode_AsKind(substring, kind);
10662    if (!buf2) {
10663        Py_DECREF(substring);
10664        return NULL;
10665    }
10666    len1 = PyUnicode_GET_LENGTH(self);
10667    len2 = PyUnicode_GET_LENGTH(substring);
10668
10669    ADJUST_INDICES(start, end, len1);
10670    switch (kind) {
10671    case PyUnicode_1BYTE_KIND:
10672        iresult = ucs1lib_count(
10673            ((Py_UCS1*)buf1) + start, end - start,
10674            buf2, len2, PY_SSIZE_T_MAX
10675            );
10676        break;
10677    case PyUnicode_2BYTE_KIND:
10678        iresult = ucs2lib_count(
10679            ((Py_UCS2*)buf1) + start, end - start,
10680            buf2, len2, PY_SSIZE_T_MAX
10681            );
10682        break;
10683    case PyUnicode_4BYTE_KIND:
10684        iresult = ucs4lib_count(
10685            ((Py_UCS4*)buf1) + start, end - start,
10686            buf2, len2, PY_SSIZE_T_MAX
10687            );
10688        break;
10689    default:
10690        assert(0); iresult = 0;
10691    }
10692
10693    result = PyLong_FromSsize_t(iresult);
10694
10695    if (kind2 != kind)
10696        PyMem_Free(buf2);
10697
10698    Py_DECREF(substring);
10699
10700    return result;
10701}
10702
10703PyDoc_STRVAR(encode__doc__,
10704             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10705\n\
10706Encode S using the codec registered for encoding. Default encoding\n\
10707is 'utf-8'. errors may be given to set a different error\n\
10708handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10709a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10710'xmlcharrefreplace' as well as any other name registered with\n\
10711codecs.register_error that can handle UnicodeEncodeErrors.");
10712
10713static PyObject *
10714unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10715{
10716    static char *kwlist[] = {"encoding", "errors", 0};
10717    char *encoding = NULL;
10718    char *errors = NULL;
10719
10720    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10721                                     kwlist, &encoding, &errors))
10722        return NULL;
10723    return PyUnicode_AsEncodedString(self, encoding, errors);
10724}
10725
10726PyDoc_STRVAR(expandtabs__doc__,
10727             "S.expandtabs([tabsize]) -> str\n\
10728\n\
10729Return a copy of S where all tab characters are expanded using spaces.\n\
10730If tabsize is not given, a tab size of 8 characters is assumed.");
10731
10732static PyObject*
10733unicode_expandtabs(PyObject *self, PyObject *args)
10734{
10735    Py_ssize_t i, j, line_pos, src_len, incr;
10736    Py_UCS4 ch;
10737    PyObject *u;
10738    void *src_data, *dest_data;
10739    int tabsize = 8;
10740    int kind;
10741    int found;
10742
10743    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10744        return NULL;
10745
10746    if (PyUnicode_READY(self) == -1)
10747        return NULL;
10748
10749    /* First pass: determine size of output string */
10750    src_len = PyUnicode_GET_LENGTH(self);
10751    i = j = line_pos = 0;
10752    kind = PyUnicode_KIND(self);
10753    src_data = PyUnicode_DATA(self);
10754    found = 0;
10755    for (; i < src_len; i++) {
10756        ch = PyUnicode_READ(kind, src_data, i);
10757        if (ch == '\t') {
10758            found = 1;
10759            if (tabsize > 0) {
10760                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10761                if (j > PY_SSIZE_T_MAX - incr)
10762                    goto overflow;
10763                line_pos += incr;
10764                j += incr;
10765            }
10766        }
10767        else {
10768            if (j > PY_SSIZE_T_MAX - 1)
10769                goto overflow;
10770            line_pos++;
10771            j++;
10772            if (ch == '\n' || ch == '\r')
10773                line_pos = 0;
10774        }
10775    }
10776    if (!found)
10777        return unicode_result_unchanged(self);
10778
10779    /* Second pass: create output string and fill it */
10780    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10781    if (!u)
10782        return NULL;
10783    dest_data = PyUnicode_DATA(u);
10784
10785    i = j = line_pos = 0;
10786
10787    for (; i < src_len; i++) {
10788        ch = PyUnicode_READ(kind, src_data, i);
10789        if (ch == '\t') {
10790            if (tabsize > 0) {
10791                incr = tabsize - (line_pos % tabsize);
10792                line_pos += incr;
10793                FILL(kind, dest_data, ' ', j, incr);
10794                j += incr;
10795            }
10796        }
10797        else {
10798            line_pos++;
10799            PyUnicode_WRITE(kind, dest_data, j, ch);
10800            j++;
10801            if (ch == '\n' || ch == '\r')
10802                line_pos = 0;
10803        }
10804    }
10805    assert (j == PyUnicode_GET_LENGTH(u));
10806    return unicode_result(u);
10807
10808  overflow:
10809    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10810    return NULL;
10811}
10812
10813PyDoc_STRVAR(find__doc__,
10814             "S.find(sub[, start[, end]]) -> int\n\
10815\n\
10816Return the lowest index in S where substring sub is found,\n\
10817such that sub is contained within S[start:end].  Optional\n\
10818arguments start and end are interpreted as in slice notation.\n\
10819\n\
10820Return -1 on failure.");
10821
10822static PyObject *
10823unicode_find(PyObject *self, PyObject *args)
10824{
10825    PyObject *substring;
10826    Py_ssize_t start;
10827    Py_ssize_t end;
10828    Py_ssize_t result;
10829
10830    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10831                                            &start, &end))
10832        return NULL;
10833
10834    if (PyUnicode_READY(self) == -1)
10835        return NULL;
10836    if (PyUnicode_READY(substring) == -1)
10837        return NULL;
10838
10839    result = any_find_slice(1, self, substring, start, end);
10840
10841    Py_DECREF(substring);
10842
10843    if (result == -2)
10844        return NULL;
10845
10846    return PyLong_FromSsize_t(result);
10847}
10848
10849static PyObject *
10850unicode_getitem(PyObject *self, Py_ssize_t index)
10851{
10852    void *data;
10853    enum PyUnicode_Kind kind;
10854    Py_UCS4 ch;
10855    PyObject *res;
10856
10857    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10858        PyErr_BadArgument();
10859        return NULL;
10860    }
10861    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10862        PyErr_SetString(PyExc_IndexError, "string index out of range");
10863        return NULL;
10864    }
10865    kind = PyUnicode_KIND(self);
10866    data = PyUnicode_DATA(self);
10867    ch = PyUnicode_READ(kind, data, index);
10868    if (ch < 256)
10869        return get_latin1_char(ch);
10870
10871    res = PyUnicode_New(1, ch);
10872    if (res == NULL)
10873        return NULL;
10874    kind = PyUnicode_KIND(res);
10875    data = PyUnicode_DATA(res);
10876    PyUnicode_WRITE(kind, data, 0, ch);
10877    assert(_PyUnicode_CheckConsistency(res, 1));
10878    return res;
10879}
10880
10881/* Believe it or not, this produces the same value for ASCII strings
10882   as bytes_hash(). */
10883static Py_hash_t
10884unicode_hash(PyObject *self)
10885{
10886    Py_ssize_t len;
10887    Py_uhash_t x;
10888
10889#ifdef Py_DEBUG
10890    assert(_Py_HashSecret_Initialized);
10891#endif
10892    if (_PyUnicode_HASH(self) != -1)
10893        return _PyUnicode_HASH(self);
10894    if (PyUnicode_READY(self) == -1)
10895        return -1;
10896    len = PyUnicode_GET_LENGTH(self);
10897    /*
10898      We make the hash of the empty string be 0, rather than using
10899      (prefix ^ suffix), since this slightly obfuscates the hash secret
10900    */
10901    if (len == 0) {
10902        _PyUnicode_HASH(self) = 0;
10903        return 0;
10904    }
10905
10906    /* The hash function as a macro, gets expanded three times below. */
10907#define HASH(P)                                            \
10908    x ^= (Py_uhash_t) *P << 7;                             \
10909    while (--len >= 0)                                     \
10910        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;  \
10911
10912    x = (Py_uhash_t) _Py_HashSecret.prefix;
10913    switch (PyUnicode_KIND(self)) {
10914    case PyUnicode_1BYTE_KIND: {
10915        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10916        HASH(c);
10917        break;
10918    }
10919    case PyUnicode_2BYTE_KIND: {
10920        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10921        HASH(s);
10922        break;
10923    }
10924    default: {
10925        Py_UCS4 *l;
10926        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10927               "Impossible switch case in unicode_hash");
10928        l = PyUnicode_4BYTE_DATA(self);
10929        HASH(l);
10930        break;
10931    }
10932    }
10933    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10934    x ^= (Py_uhash_t) _Py_HashSecret.suffix;
10935
10936    if (x == -1)
10937        x = -2;
10938    _PyUnicode_HASH(self) = x;
10939    return x;
10940}
10941#undef HASH
10942
10943PyDoc_STRVAR(index__doc__,
10944             "S.index(sub[, start[, end]]) -> int\n\
10945\n\
10946Like S.find() but raise ValueError when the substring is not found.");
10947
10948static PyObject *
10949unicode_index(PyObject *self, PyObject *args)
10950{
10951    Py_ssize_t result;
10952    PyObject *substring;
10953    Py_ssize_t start;
10954    Py_ssize_t end;
10955
10956    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10957                                            &start, &end))
10958        return NULL;
10959
10960    if (PyUnicode_READY(self) == -1)
10961        return NULL;
10962    if (PyUnicode_READY(substring) == -1)
10963        return NULL;
10964
10965    result = any_find_slice(1, self, substring, start, end);
10966
10967    Py_DECREF(substring);
10968
10969    if (result == -2)
10970        return NULL;
10971
10972    if (result < 0) {
10973        PyErr_SetString(PyExc_ValueError, "substring not found");
10974        return NULL;
10975    }
10976
10977    return PyLong_FromSsize_t(result);
10978}
10979
10980PyDoc_STRVAR(islower__doc__,
10981             "S.islower() -> bool\n\
10982\n\
10983Return True if all cased characters in S are lowercase and there is\n\
10984at least one cased character in S, False otherwise.");
10985
10986static PyObject*
10987unicode_islower(PyObject *self)
10988{
10989    Py_ssize_t i, length;
10990    int kind;
10991    void *data;
10992    int cased;
10993
10994    if (PyUnicode_READY(self) == -1)
10995        return NULL;
10996    length = PyUnicode_GET_LENGTH(self);
10997    kind = PyUnicode_KIND(self);
10998    data = PyUnicode_DATA(self);
10999
11000    /* Shortcut for single character strings */
11001    if (length == 1)
11002        return PyBool_FromLong(
11003            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11004
11005    /* Special case for empty strings */
11006    if (length == 0)
11007        return PyBool_FromLong(0);
11008
11009    cased = 0;
11010    for (i = 0; i < length; i++) {
11011        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11012
11013        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11014            return PyBool_FromLong(0);
11015        else if (!cased && Py_UNICODE_ISLOWER(ch))
11016            cased = 1;
11017    }
11018    return PyBool_FromLong(cased);
11019}
11020
11021PyDoc_STRVAR(isupper__doc__,
11022             "S.isupper() -> bool\n\
11023\n\
11024Return True if all cased characters in S are uppercase and there is\n\
11025at least one cased character in S, False otherwise.");
11026
11027static PyObject*
11028unicode_isupper(PyObject *self)
11029{
11030    Py_ssize_t i, length;
11031    int kind;
11032    void *data;
11033    int cased;
11034
11035    if (PyUnicode_READY(self) == -1)
11036        return NULL;
11037    length = PyUnicode_GET_LENGTH(self);
11038    kind = PyUnicode_KIND(self);
11039    data = PyUnicode_DATA(self);
11040
11041    /* Shortcut for single character strings */
11042    if (length == 1)
11043        return PyBool_FromLong(
11044            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11045
11046    /* Special case for empty strings */
11047    if (length == 0)
11048        return PyBool_FromLong(0);
11049
11050    cased = 0;
11051    for (i = 0; i < length; i++) {
11052        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11053
11054        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11055            return PyBool_FromLong(0);
11056        else if (!cased && Py_UNICODE_ISUPPER(ch))
11057            cased = 1;
11058    }
11059    return PyBool_FromLong(cased);
11060}
11061
11062PyDoc_STRVAR(istitle__doc__,
11063             "S.istitle() -> bool\n\
11064\n\
11065Return True if S is a titlecased string and there is at least one\n\
11066character in S, i.e. upper- and titlecase characters may only\n\
11067follow uncased characters and lowercase characters only cased ones.\n\
11068Return False otherwise.");
11069
11070static PyObject*
11071unicode_istitle(PyObject *self)
11072{
11073    Py_ssize_t i, length;
11074    int kind;
11075    void *data;
11076    int cased, previous_is_cased;
11077
11078    if (PyUnicode_READY(self) == -1)
11079        return NULL;
11080    length = PyUnicode_GET_LENGTH(self);
11081    kind = PyUnicode_KIND(self);
11082    data = PyUnicode_DATA(self);
11083
11084    /* Shortcut for single character strings */
11085    if (length == 1) {
11086        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11087        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11088                               (Py_UNICODE_ISUPPER(ch) != 0));
11089    }
11090
11091    /* Special case for empty strings */
11092    if (length == 0)
11093        return PyBool_FromLong(0);
11094
11095    cased = 0;
11096    previous_is_cased = 0;
11097    for (i = 0; i < length; i++) {
11098        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11099
11100        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11101            if (previous_is_cased)
11102                return PyBool_FromLong(0);
11103            previous_is_cased = 1;
11104            cased = 1;
11105        }
11106        else if (Py_UNICODE_ISLOWER(ch)) {
11107            if (!previous_is_cased)
11108                return PyBool_FromLong(0);
11109            previous_is_cased = 1;
11110            cased = 1;
11111        }
11112        else
11113            previous_is_cased = 0;
11114    }
11115    return PyBool_FromLong(cased);
11116}
11117
11118PyDoc_STRVAR(isspace__doc__,
11119             "S.isspace() -> bool\n\
11120\n\
11121Return True if all characters in S are whitespace\n\
11122and there is at least one character in S, False otherwise.");
11123
11124static PyObject*
11125unicode_isspace(PyObject *self)
11126{
11127    Py_ssize_t i, length;
11128    int kind;
11129    void *data;
11130
11131    if (PyUnicode_READY(self) == -1)
11132        return NULL;
11133    length = PyUnicode_GET_LENGTH(self);
11134    kind = PyUnicode_KIND(self);
11135    data = PyUnicode_DATA(self);
11136
11137    /* Shortcut for single character strings */
11138    if (length == 1)
11139        return PyBool_FromLong(
11140            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11141
11142    /* Special case for empty strings */
11143    if (length == 0)
11144        return PyBool_FromLong(0);
11145
11146    for (i = 0; i < length; i++) {
11147        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11148        if (!Py_UNICODE_ISSPACE(ch))
11149            return PyBool_FromLong(0);
11150    }
11151    return PyBool_FromLong(1);
11152}
11153
11154PyDoc_STRVAR(isalpha__doc__,
11155             "S.isalpha() -> bool\n\
11156\n\
11157Return True if all characters in S are alphabetic\n\
11158and there is at least one character in S, False otherwise.");
11159
11160static PyObject*
11161unicode_isalpha(PyObject *self)
11162{
11163    Py_ssize_t i, length;
11164    int kind;
11165    void *data;
11166
11167    if (PyUnicode_READY(self) == -1)
11168        return NULL;
11169    length = PyUnicode_GET_LENGTH(self);
11170    kind = PyUnicode_KIND(self);
11171    data = PyUnicode_DATA(self);
11172
11173    /* Shortcut for single character strings */
11174    if (length == 1)
11175        return PyBool_FromLong(
11176            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11177
11178    /* Special case for empty strings */
11179    if (length == 0)
11180        return PyBool_FromLong(0);
11181
11182    for (i = 0; i < length; i++) {
11183        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11184            return PyBool_FromLong(0);
11185    }
11186    return PyBool_FromLong(1);
11187}
11188
11189PyDoc_STRVAR(isalnum__doc__,
11190             "S.isalnum() -> bool\n\
11191\n\
11192Return True if all characters in S are alphanumeric\n\
11193and there is at least one character in S, False otherwise.");
11194
11195static PyObject*
11196unicode_isalnum(PyObject *self)
11197{
11198    int kind;
11199    void *data;
11200    Py_ssize_t len, i;
11201
11202    if (PyUnicode_READY(self) == -1)
11203        return NULL;
11204
11205    kind = PyUnicode_KIND(self);
11206    data = PyUnicode_DATA(self);
11207    len = PyUnicode_GET_LENGTH(self);
11208
11209    /* Shortcut for single character strings */
11210    if (len == 1) {
11211        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11212        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11213    }
11214
11215    /* Special case for empty strings */
11216    if (len == 0)
11217        return PyBool_FromLong(0);
11218
11219    for (i = 0; i < len; i++) {
11220        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11221        if (!Py_UNICODE_ISALNUM(ch))
11222            return PyBool_FromLong(0);
11223    }
11224    return PyBool_FromLong(1);
11225}
11226
11227PyDoc_STRVAR(isdecimal__doc__,
11228             "S.isdecimal() -> bool\n\
11229\n\
11230Return True if there are only decimal characters in S,\n\
11231False otherwise.");
11232
11233static PyObject*
11234unicode_isdecimal(PyObject *self)
11235{
11236    Py_ssize_t i, length;
11237    int kind;
11238    void *data;
11239
11240    if (PyUnicode_READY(self) == -1)
11241        return NULL;
11242    length = PyUnicode_GET_LENGTH(self);
11243    kind = PyUnicode_KIND(self);
11244    data = PyUnicode_DATA(self);
11245
11246    /* Shortcut for single character strings */
11247    if (length == 1)
11248        return PyBool_FromLong(
11249            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11250
11251    /* Special case for empty strings */
11252    if (length == 0)
11253        return PyBool_FromLong(0);
11254
11255    for (i = 0; i < length; i++) {
11256        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11257            return PyBool_FromLong(0);
11258    }
11259    return PyBool_FromLong(1);
11260}
11261
11262PyDoc_STRVAR(isdigit__doc__,
11263             "S.isdigit() -> bool\n\
11264\n\
11265Return True if all characters in S are digits\n\
11266and there is at least one character in S, False otherwise.");
11267
11268static PyObject*
11269unicode_isdigit(PyObject *self)
11270{
11271    Py_ssize_t i, length;
11272    int kind;
11273    void *data;
11274
11275    if (PyUnicode_READY(self) == -1)
11276        return NULL;
11277    length = PyUnicode_GET_LENGTH(self);
11278    kind = PyUnicode_KIND(self);
11279    data = PyUnicode_DATA(self);
11280
11281    /* Shortcut for single character strings */
11282    if (length == 1) {
11283        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11284        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11285    }
11286
11287    /* Special case for empty strings */
11288    if (length == 0)
11289        return PyBool_FromLong(0);
11290
11291    for (i = 0; i < length; i++) {
11292        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11293            return PyBool_FromLong(0);
11294    }
11295    return PyBool_FromLong(1);
11296}
11297
11298PyDoc_STRVAR(isnumeric__doc__,
11299             "S.isnumeric() -> bool\n\
11300\n\
11301Return True if there are only numeric characters in S,\n\
11302False otherwise.");
11303
11304static PyObject*
11305unicode_isnumeric(PyObject *self)
11306{
11307    Py_ssize_t i, length;
11308    int kind;
11309    void *data;
11310
11311    if (PyUnicode_READY(self) == -1)
11312        return NULL;
11313    length = PyUnicode_GET_LENGTH(self);
11314    kind = PyUnicode_KIND(self);
11315    data = PyUnicode_DATA(self);
11316
11317    /* Shortcut for single character strings */
11318    if (length == 1)
11319        return PyBool_FromLong(
11320            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11321
11322    /* Special case for empty strings */
11323    if (length == 0)
11324        return PyBool_FromLong(0);
11325
11326    for (i = 0; i < length; i++) {
11327        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11328            return PyBool_FromLong(0);
11329    }
11330    return PyBool_FromLong(1);
11331}
11332
11333int
11334PyUnicode_IsIdentifier(PyObject *self)
11335{
11336    int kind;
11337    void *data;
11338    Py_ssize_t i;
11339    Py_UCS4 first;
11340
11341    if (PyUnicode_READY(self) == -1) {
11342        Py_FatalError("identifier not ready");
11343        return 0;
11344    }
11345
11346    /* Special case for empty strings */
11347    if (PyUnicode_GET_LENGTH(self) == 0)
11348        return 0;
11349    kind = PyUnicode_KIND(self);
11350    data = PyUnicode_DATA(self);
11351
11352    /* PEP 3131 says that the first character must be in
11353       XID_Start and subsequent characters in XID_Continue,
11354       and for the ASCII range, the 2.x rules apply (i.e
11355       start with letters and underscore, continue with
11356       letters, digits, underscore). However, given the current
11357       definition of XID_Start and XID_Continue, it is sufficient
11358       to check just for these, except that _ must be allowed
11359       as starting an identifier.  */
11360    first = PyUnicode_READ(kind, data, 0);
11361    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11362        return 0;
11363
11364    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11365        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11366            return 0;
11367    return 1;
11368}
11369
11370PyDoc_STRVAR(isidentifier__doc__,
11371             "S.isidentifier() -> bool\n\
11372\n\
11373Return True if S is a valid identifier according\n\
11374to the language definition.");
11375
11376static PyObject*
11377unicode_isidentifier(PyObject *self)
11378{
11379    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11380}
11381
11382PyDoc_STRVAR(isprintable__doc__,
11383             "S.isprintable() -> bool\n\
11384\n\
11385Return True if all characters in S are considered\n\
11386printable in repr() or S is empty, False otherwise.");
11387
11388static PyObject*
11389unicode_isprintable(PyObject *self)
11390{
11391    Py_ssize_t i, length;
11392    int kind;
11393    void *data;
11394
11395    if (PyUnicode_READY(self) == -1)
11396        return NULL;
11397    length = PyUnicode_GET_LENGTH(self);
11398    kind = PyUnicode_KIND(self);
11399    data = PyUnicode_DATA(self);
11400
11401    /* Shortcut for single character strings */
11402    if (length == 1)
11403        return PyBool_FromLong(
11404            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11405
11406    for (i = 0; i < length; i++) {
11407        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11408            Py_RETURN_FALSE;
11409        }
11410    }
11411    Py_RETURN_TRUE;
11412}
11413
11414PyDoc_STRVAR(join__doc__,
11415             "S.join(iterable) -> str\n\
11416\n\
11417Return a string which is the concatenation of the strings in the\n\
11418iterable.  The separator between elements is S.");
11419
11420static PyObject*
11421unicode_join(PyObject *self, PyObject *data)
11422{
11423    return PyUnicode_Join(self, data);
11424}
11425
11426static Py_ssize_t
11427unicode_length(PyObject *self)
11428{
11429    if (PyUnicode_READY(self) == -1)
11430        return -1;
11431    return PyUnicode_GET_LENGTH(self);
11432}
11433
11434PyDoc_STRVAR(ljust__doc__,
11435             "S.ljust(width[, fillchar]) -> str\n\
11436\n\
11437Return S left-justified in a Unicode string of length width. Padding is\n\
11438done using the specified fill character (default is a space).");
11439
11440static PyObject *
11441unicode_ljust(PyObject *self, PyObject *args)
11442{
11443    Py_ssize_t width;
11444    Py_UCS4 fillchar = ' ';
11445
11446    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11447        return NULL;
11448
11449    if (PyUnicode_READY(self) == -1)
11450        return NULL;
11451
11452    if (PyUnicode_GET_LENGTH(self) >= width)
11453        return unicode_result_unchanged(self);
11454
11455    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11456}
11457
11458PyDoc_STRVAR(lower__doc__,
11459             "S.lower() -> str\n\
11460\n\
11461Return a copy of the string S converted to lowercase.");
11462
11463static PyObject*
11464unicode_lower(PyObject *self)
11465{
11466    if (PyUnicode_READY(self) == -1)
11467        return NULL;
11468    if (PyUnicode_IS_ASCII(self))
11469        return ascii_upper_or_lower(self, 1);
11470    return case_operation(self, do_lower);
11471}
11472
/* Selectors for the strip helpers: which end(s) of the string to trim.
   The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11481
11482/* externally visible for str.strip(unicode) */
11483PyObject *
11484_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11485{
11486    void *data;
11487    int kind;
11488    Py_ssize_t i, j, len;
11489    BLOOM_MASK sepmask;
11490
11491    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11492        return NULL;
11493
11494    kind = PyUnicode_KIND(self);
11495    data = PyUnicode_DATA(self);
11496    len = PyUnicode_GET_LENGTH(self);
11497    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11498                              PyUnicode_DATA(sepobj),
11499                              PyUnicode_GET_LENGTH(sepobj));
11500
11501    i = 0;
11502    if (striptype != RIGHTSTRIP) {
11503        while (i < len &&
11504               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11505            i++;
11506        }
11507    }
11508
11509    j = len;
11510    if (striptype != LEFTSTRIP) {
11511        do {
11512            j--;
11513        } while (j >= i &&
11514                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11515        j++;
11516    }
11517
11518    return PyUnicode_Substring(self, i, j);
11519}
11520
11521PyObject*
11522PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11523{
11524    unsigned char *data;
11525    int kind;
11526    Py_ssize_t length;
11527
11528    if (PyUnicode_READY(self) == -1)
11529        return NULL;
11530
11531    length = PyUnicode_GET_LENGTH(self);
11532    end = Py_MIN(end, length);
11533
11534    if (start == 0 && end == length)
11535        return unicode_result_unchanged(self);
11536
11537    if (start < 0 || end < 0) {
11538        PyErr_SetString(PyExc_IndexError, "string index out of range");
11539        return NULL;
11540    }
11541    if (start >= length || end < start) {
11542        Py_INCREF(unicode_empty);
11543        return unicode_empty;
11544    }
11545
11546    length = end - start;
11547    if (PyUnicode_IS_ASCII(self)) {
11548        data = PyUnicode_1BYTE_DATA(self);
11549        return _PyUnicode_FromASCII((char*)(data + start), length);
11550    }
11551    else {
11552        kind = PyUnicode_KIND(self);
11553        data = PyUnicode_1BYTE_DATA(self);
11554        return PyUnicode_FromKindAndData(kind,
11555                                         data + kind * start,
11556                                         length);
11557    }
11558}
11559
11560static PyObject *
11561do_strip(PyObject *self, int striptype)
11562{
11563    int kind;
11564    void *data;
11565    Py_ssize_t len, i, j;
11566
11567    if (PyUnicode_READY(self) == -1)
11568        return NULL;
11569
11570    kind = PyUnicode_KIND(self);
11571    data = PyUnicode_DATA(self);
11572    len = PyUnicode_GET_LENGTH(self);
11573
11574    i = 0;
11575    if (striptype != RIGHTSTRIP) {
11576        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11577            i++;
11578        }
11579    }
11580
11581    j = len;
11582    if (striptype != LEFTSTRIP) {
11583        do {
11584            j--;
11585        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11586        j++;
11587    }
11588
11589    return PyUnicode_Substring(self, i, j);
11590}
11591
11592
11593static PyObject *
11594do_argstrip(PyObject *self, int striptype, PyObject *args)
11595{
11596    PyObject *sep = NULL;
11597
11598    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11599        return NULL;
11600
11601    if (sep != NULL && sep != Py_None) {
11602        if (PyUnicode_Check(sep))
11603            return _PyUnicode_XStrip(self, striptype, sep);
11604        else {
11605            PyErr_Format(PyExc_TypeError,
11606                         "%s arg must be None or str",
11607                         STRIPNAME(striptype));
11608            return NULL;
11609        }
11610    }
11611
11612    return do_strip(self, striptype);
11613}
11614
11615
11616PyDoc_STRVAR(strip__doc__,
11617             "S.strip([chars]) -> str\n\
11618\n\
11619Return a copy of the string S with leading and trailing\n\
11620whitespace removed.\n\
11621If chars is given and not None, remove characters in chars instead.");
11622
11623static PyObject *
11624unicode_strip(PyObject *self, PyObject *args)
11625{
11626    if (PyTuple_GET_SIZE(args) == 0)
11627        return do_strip(self, BOTHSTRIP); /* Common case */
11628    else
11629        return do_argstrip(self, BOTHSTRIP, args);
11630}
11631
11632
11633PyDoc_STRVAR(lstrip__doc__,
11634             "S.lstrip([chars]) -> str\n\
11635\n\
11636Return a copy of the string S with leading whitespace removed.\n\
11637If chars is given and not None, remove characters in chars instead.");
11638
11639static PyObject *
11640unicode_lstrip(PyObject *self, PyObject *args)
11641{
11642    if (PyTuple_GET_SIZE(args) == 0)
11643        return do_strip(self, LEFTSTRIP); /* Common case */
11644    else
11645        return do_argstrip(self, LEFTSTRIP, args);
11646}
11647
11648
11649PyDoc_STRVAR(rstrip__doc__,
11650             "S.rstrip([chars]) -> str\n\
11651\n\
11652Return a copy of the string S with trailing whitespace removed.\n\
11653If chars is given and not None, remove characters in chars instead.");
11654
11655static PyObject *
11656unicode_rstrip(PyObject *self, PyObject *args)
11657{
11658    if (PyTuple_GET_SIZE(args) == 0)
11659        return do_strip(self, RIGHTSTRIP); /* Common case */
11660    else
11661        return do_argstrip(self, RIGHTSTRIP, args);
11662}
11663
11664
11665static PyObject*
11666unicode_repeat(PyObject *str, Py_ssize_t len)
11667{
11668    PyObject *u;
11669    Py_ssize_t nchars, n;
11670
11671    if (len < 1) {
11672        Py_INCREF(unicode_empty);
11673        return unicode_empty;
11674    }
11675
11676    /* no repeat, return original string */
11677    if (len == 1)
11678        return unicode_result_unchanged(str);
11679
11680    if (PyUnicode_READY(str) == -1)
11681        return NULL;
11682
11683    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11684        PyErr_SetString(PyExc_OverflowError,
11685                        "repeated string is too long");
11686        return NULL;
11687    }
11688    nchars = len * PyUnicode_GET_LENGTH(str);
11689
11690    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11691    if (!u)
11692        return NULL;
11693    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11694
11695    if (PyUnicode_GET_LENGTH(str) == 1) {
11696        const int kind = PyUnicode_KIND(str);
11697        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11698        if (kind == PyUnicode_1BYTE_KIND) {
11699            void *to = PyUnicode_DATA(u);
11700            memset(to, (unsigned char)fill_char, len);
11701        }
11702        else if (kind == PyUnicode_2BYTE_KIND) {
11703            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
11704            for (n = 0; n < len; ++n)
11705                ucs2[n] = fill_char;
11706        } else {
11707            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11708            assert(kind == PyUnicode_4BYTE_KIND);
11709            for (n = 0; n < len; ++n)
11710                ucs4[n] = fill_char;
11711        }
11712    }
11713    else {
11714        /* number of characters copied this far */
11715        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11716        const Py_ssize_t char_size = PyUnicode_KIND(str);
11717        char *to = (char *) PyUnicode_DATA(u);
11718        Py_MEMCPY(to, PyUnicode_DATA(str),
11719                  PyUnicode_GET_LENGTH(str) * char_size);
11720        while (done < nchars) {
11721            n = (done <= nchars-done) ? done : nchars-done;
11722            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11723            done += n;
11724        }
11725    }
11726
11727    assert(_PyUnicode_CheckConsistency(u, 1));
11728    return u;
11729}
11730
11731PyObject *
11732PyUnicode_Replace(PyObject *obj,
11733                  PyObject *subobj,
11734                  PyObject *replobj,
11735                  Py_ssize_t maxcount)
11736{
11737    PyObject *self;
11738    PyObject *str1;
11739    PyObject *str2;
11740    PyObject *result;
11741
11742    self = PyUnicode_FromObject(obj);
11743    if (self == NULL)
11744        return NULL;
11745    str1 = PyUnicode_FromObject(subobj);
11746    if (str1 == NULL) {
11747        Py_DECREF(self);
11748        return NULL;
11749    }
11750    str2 = PyUnicode_FromObject(replobj);
11751    if (str2 == NULL) {
11752        Py_DECREF(self);
11753        Py_DECREF(str1);
11754        return NULL;
11755    }
11756    if (PyUnicode_READY(self) == -1 ||
11757        PyUnicode_READY(str1) == -1 ||
11758        PyUnicode_READY(str2) == -1)
11759        result = NULL;
11760    else
11761        result = replace(self, str1, str2, maxcount);
11762    Py_DECREF(self);
11763    Py_DECREF(str1);
11764    Py_DECREF(str2);
11765    return result;
11766}
11767
11768PyDoc_STRVAR(replace__doc__,
11769             "S.replace(old, new[, count]) -> str\n\
11770\n\
11771Return a copy of S with all occurrences of substring\n\
11772old replaced by new.  If the optional argument count is\n\
11773given, only the first count occurrences are replaced.");
11774
11775static PyObject*
11776unicode_replace(PyObject *self, PyObject *args)
11777{
11778    PyObject *str1;
11779    PyObject *str2;
11780    Py_ssize_t maxcount = -1;
11781    PyObject *result;
11782
11783    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11784        return NULL;
11785    if (PyUnicode_READY(self) == -1)
11786        return NULL;
11787    str1 = PyUnicode_FromObject(str1);
11788    if (str1 == NULL)
11789        return NULL;
11790    str2 = PyUnicode_FromObject(str2);
11791    if (str2 == NULL) {
11792        Py_DECREF(str1);
11793        return NULL;
11794    }
11795    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11796        result = NULL;
11797    else
11798        result = replace(self, str1, str2, maxcount);
11799
11800    Py_DECREF(str1);
11801    Py_DECREF(str2);
11802    return result;
11803}
11804
11805static PyObject *
11806unicode_repr(PyObject *unicode)
11807{
11808    PyObject *repr;
11809    Py_ssize_t isize;
11810    Py_ssize_t osize, squote, dquote, i, o;
11811    Py_UCS4 max, quote;
11812    int ikind, okind;
11813    void *idata, *odata;
11814
11815    if (PyUnicode_READY(unicode) == -1)
11816        return NULL;
11817
11818    isize = PyUnicode_GET_LENGTH(unicode);
11819    idata = PyUnicode_DATA(unicode);
11820
11821    /* Compute length of output, quote characters, and
11822       maximum character */
11823    osize = 2; /* quotes */
11824    max = 127;
11825    squote = dquote = 0;
11826    ikind = PyUnicode_KIND(unicode);
11827    for (i = 0; i < isize; i++) {
11828        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11829        switch (ch) {
11830        case '\'': squote++; osize++; break;
11831        case '"':  dquote++; osize++; break;
11832        case '\\': case '\t': case '\r': case '\n':
11833            osize += 2; break;
11834        default:
11835            /* Fast-path ASCII */
11836            if (ch < ' ' || ch == 0x7f)
11837                osize += 4; /* \xHH */
11838            else if (ch < 0x7f)
11839                osize++;
11840            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11841                osize++;
11842                max = ch > max ? ch : max;
11843            }
11844            else if (ch < 0x100)
11845                osize += 4; /* \xHH */
11846            else if (ch < 0x10000)
11847                osize += 6; /* \uHHHH */
11848            else
11849                osize += 10; /* \uHHHHHHHH */
11850        }
11851    }
11852
11853    quote = '\'';
11854    if (squote) {
11855        if (dquote)
11856            /* Both squote and dquote present. Use squote,
11857               and escape them */
11858            osize += squote;
11859        else
11860            quote = '"';
11861    }
11862
11863    repr = PyUnicode_New(osize, max);
11864    if (repr == NULL)
11865        return NULL;
11866    okind = PyUnicode_KIND(repr);
11867    odata = PyUnicode_DATA(repr);
11868
11869    PyUnicode_WRITE(okind, odata, 0, quote);
11870    PyUnicode_WRITE(okind, odata, osize-1, quote);
11871
11872    for (i = 0, o = 1; i < isize; i++) {
11873        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11874
11875        /* Escape quotes and backslashes */
11876        if ((ch == quote) || (ch == '\\')) {
11877            PyUnicode_WRITE(okind, odata, o++, '\\');
11878            PyUnicode_WRITE(okind, odata, o++, ch);
11879            continue;
11880        }
11881
11882        /* Map special whitespace to '\t', \n', '\r' */
11883        if (ch == '\t') {
11884            PyUnicode_WRITE(okind, odata, o++, '\\');
11885            PyUnicode_WRITE(okind, odata, o++, 't');
11886        }
11887        else if (ch == '\n') {
11888            PyUnicode_WRITE(okind, odata, o++, '\\');
11889            PyUnicode_WRITE(okind, odata, o++, 'n');
11890        }
11891        else if (ch == '\r') {
11892            PyUnicode_WRITE(okind, odata, o++, '\\');
11893            PyUnicode_WRITE(okind, odata, o++, 'r');
11894        }
11895
11896        /* Map non-printable US ASCII to '\xhh' */
11897        else if (ch < ' ' || ch == 0x7F) {
11898            PyUnicode_WRITE(okind, odata, o++, '\\');
11899            PyUnicode_WRITE(okind, odata, o++, 'x');
11900            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11901            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
11902        }
11903
11904        /* Copy ASCII characters as-is */
11905        else if (ch < 0x7F) {
11906            PyUnicode_WRITE(okind, odata, o++, ch);
11907        }
11908
11909        /* Non-ASCII characters */
11910        else {
11911            /* Map Unicode whitespace and control characters
11912               (categories Z* and C* except ASCII space)
11913            */
11914            if (!Py_UNICODE_ISPRINTABLE(ch)) {
11915                PyUnicode_WRITE(okind, odata, o++, '\\');
11916                /* Map 8-bit characters to '\xhh' */
11917                if (ch <= 0xff) {
11918                    PyUnicode_WRITE(okind, odata, o++, 'x');
11919                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11920                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
11921                }
11922                /* Map 16-bit characters to '\uxxxx' */
11923                else if (ch <= 0xffff) {
11924                    PyUnicode_WRITE(okind, odata, o++, 'u');
11925                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11926                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11927                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11928                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
11929                }
11930                /* Map 21-bit characters to '\U00xxxxxx' */
11931                else {
11932                    PyUnicode_WRITE(okind, odata, o++, 'U');
11933                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11934                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11935                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11936                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11937                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11938                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11939                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11940                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
11941                }
11942            }
11943            /* Copy characters as-is */
11944            else {
11945                PyUnicode_WRITE(okind, odata, o++, ch);
11946            }
11947        }
11948    }
11949    /* Closing quote already added at the beginning */
11950    assert(_PyUnicode_CheckConsistency(repr, 1));
11951    return repr;
11952}
11953
11954PyDoc_STRVAR(rfind__doc__,
11955             "S.rfind(sub[, start[, end]]) -> int\n\
11956\n\
11957Return the highest index in S where substring sub is found,\n\
11958such that sub is contained within S[start:end].  Optional\n\
11959arguments start and end are interpreted as in slice notation.\n\
11960\n\
11961Return -1 on failure.");
11962
11963static PyObject *
11964unicode_rfind(PyObject *self, PyObject *args)
11965{
11966    PyObject *substring;
11967    Py_ssize_t start;
11968    Py_ssize_t end;
11969    Py_ssize_t result;
11970
11971    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11972                                            &start, &end))
11973        return NULL;
11974
11975    if (PyUnicode_READY(self) == -1)
11976        return NULL;
11977    if (PyUnicode_READY(substring) == -1)
11978        return NULL;
11979
11980    result = any_find_slice(-1, self, substring, start, end);
11981
11982    Py_DECREF(substring);
11983
11984    if (result == -2)
11985        return NULL;
11986
11987    return PyLong_FromSsize_t(result);
11988}
11989
11990PyDoc_STRVAR(rindex__doc__,
11991             "S.rindex(sub[, start[, end]]) -> int\n\
11992\n\
11993Like S.rfind() but raise ValueError when the substring is not found.");
11994
11995static PyObject *
11996unicode_rindex(PyObject *self, PyObject *args)
11997{
11998    PyObject *substring;
11999    Py_ssize_t start;
12000    Py_ssize_t end;
12001    Py_ssize_t result;
12002
12003    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12004                                            &start, &end))
12005        return NULL;
12006
12007    if (PyUnicode_READY(self) == -1)
12008        return NULL;
12009    if (PyUnicode_READY(substring) == -1)
12010        return NULL;
12011
12012    result = any_find_slice(-1, self, substring, start, end);
12013
12014    Py_DECREF(substring);
12015
12016    if (result == -2)
12017        return NULL;
12018
12019    if (result < 0) {
12020        PyErr_SetString(PyExc_ValueError, "substring not found");
12021        return NULL;
12022    }
12023
12024    return PyLong_FromSsize_t(result);
12025}
12026
12027PyDoc_STRVAR(rjust__doc__,
12028             "S.rjust(width[, fillchar]) -> str\n\
12029\n\
12030Return S right-justified in a string of length width. Padding is\n\
12031done using the specified fill character (default is a space).");
12032
12033static PyObject *
12034unicode_rjust(PyObject *self, PyObject *args)
12035{
12036    Py_ssize_t width;
12037    Py_UCS4 fillchar = ' ';
12038
12039    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12040        return NULL;
12041
12042    if (PyUnicode_READY(self) == -1)
12043        return NULL;
12044
12045    if (PyUnicode_GET_LENGTH(self) >= width)
12046        return unicode_result_unchanged(self);
12047
12048    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12049}
12050
12051PyObject *
12052PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12053{
12054    PyObject *result;
12055
12056    s = PyUnicode_FromObject(s);
12057    if (s == NULL)
12058        return NULL;
12059    if (sep != NULL) {
12060        sep = PyUnicode_FromObject(sep);
12061        if (sep == NULL) {
12062            Py_DECREF(s);
12063            return NULL;
12064        }
12065    }
12066
12067    result = split(s, sep, maxsplit);
12068
12069    Py_DECREF(s);
12070    Py_XDECREF(sep);
12071    return result;
12072}
12073
12074PyDoc_STRVAR(split__doc__,
12075             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12076\n\
12077Return a list of the words in S, using sep as the\n\
12078delimiter string.  If maxsplit is given, at most maxsplit\n\
12079splits are done. If sep is not specified or is None, any\n\
12080whitespace string is a separator and empty strings are\n\
12081removed from the result.");
12082
12083static PyObject*
12084unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12085{
12086    static char *kwlist[] = {"sep", "maxsplit", 0};
12087    PyObject *substring = Py_None;
12088    Py_ssize_t maxcount = -1;
12089
12090    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12091                                     kwlist, &substring, &maxcount))
12092        return NULL;
12093
12094    if (substring == Py_None)
12095        return split(self, NULL, maxcount);
12096    else if (PyUnicode_Check(substring))
12097        return split(self, substring, maxcount);
12098    else
12099        return PyUnicode_Split(self, substring, maxcount);
12100}
12101
12102PyObject *
12103PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12104{
12105    PyObject* str_obj;
12106    PyObject* sep_obj;
12107    PyObject* out;
12108    int kind1, kind2, kind;
12109    void *buf1 = NULL, *buf2 = NULL;
12110    Py_ssize_t len1, len2;
12111
12112    str_obj = PyUnicode_FromObject(str_in);
12113    if (!str_obj)
12114        return NULL;
12115    sep_obj = PyUnicode_FromObject(sep_in);
12116    if (!sep_obj) {
12117        Py_DECREF(str_obj);
12118        return NULL;
12119    }
12120    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12121        Py_DECREF(sep_obj);
12122        Py_DECREF(str_obj);
12123        return NULL;
12124    }
12125
12126    kind1 = PyUnicode_KIND(str_obj);
12127    kind2 = PyUnicode_KIND(sep_obj);
12128    kind = Py_MAX(kind1, kind2);
12129    buf1 = PyUnicode_DATA(str_obj);
12130    if (kind1 != kind)
12131        buf1 = _PyUnicode_AsKind(str_obj, kind);
12132    if (!buf1)
12133        goto onError;
12134    buf2 = PyUnicode_DATA(sep_obj);
12135    if (kind2 != kind)
12136        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12137    if (!buf2)
12138        goto onError;
12139    len1 = PyUnicode_GET_LENGTH(str_obj);
12140    len2 = PyUnicode_GET_LENGTH(sep_obj);
12141
12142    switch (PyUnicode_KIND(str_obj)) {
12143    case PyUnicode_1BYTE_KIND:
12144        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12145            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12146        else
12147            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12148        break;
12149    case PyUnicode_2BYTE_KIND:
12150        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12151        break;
12152    case PyUnicode_4BYTE_KIND:
12153        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12154        break;
12155    default:
12156        assert(0);
12157        out = 0;
12158    }
12159
12160    Py_DECREF(sep_obj);
12161    Py_DECREF(str_obj);
12162    if (kind1 != kind)
12163        PyMem_Free(buf1);
12164    if (kind2 != kind)
12165        PyMem_Free(buf2);
12166
12167    return out;
12168  onError:
12169    Py_DECREF(sep_obj);
12170    Py_DECREF(str_obj);
12171    if (kind1 != kind && buf1)
12172        PyMem_Free(buf1);
12173    if (kind2 != kind && buf2)
12174        PyMem_Free(buf2);
12175    return NULL;
12176}
12177
12178
12179PyObject *
12180PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12181{
12182    PyObject* str_obj;
12183    PyObject* sep_obj;
12184    PyObject* out;
12185    int kind1, kind2, kind;
12186    void *buf1 = NULL, *buf2 = NULL;
12187    Py_ssize_t len1, len2;
12188
12189    str_obj = PyUnicode_FromObject(str_in);
12190    if (!str_obj)
12191        return NULL;
12192    sep_obj = PyUnicode_FromObject(sep_in);
12193    if (!sep_obj) {
12194        Py_DECREF(str_obj);
12195        return NULL;
12196    }
12197
12198    kind1 = PyUnicode_KIND(str_in);
12199    kind2 = PyUnicode_KIND(sep_obj);
12200    kind = Py_MAX(kind1, kind2);
12201    buf1 = PyUnicode_DATA(str_in);
12202    if (kind1 != kind)
12203        buf1 = _PyUnicode_AsKind(str_in, kind);
12204    if (!buf1)
12205        goto onError;
12206    buf2 = PyUnicode_DATA(sep_obj);
12207    if (kind2 != kind)
12208        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12209    if (!buf2)
12210        goto onError;
12211    len1 = PyUnicode_GET_LENGTH(str_obj);
12212    len2 = PyUnicode_GET_LENGTH(sep_obj);
12213
12214    switch (PyUnicode_KIND(str_in)) {
12215    case PyUnicode_1BYTE_KIND:
12216        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12217            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12218        else
12219            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12220        break;
12221    case PyUnicode_2BYTE_KIND:
12222        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12223        break;
12224    case PyUnicode_4BYTE_KIND:
12225        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12226        break;
12227    default:
12228        assert(0);
12229        out = 0;
12230    }
12231
12232    Py_DECREF(sep_obj);
12233    Py_DECREF(str_obj);
12234    if (kind1 != kind)
12235        PyMem_Free(buf1);
12236    if (kind2 != kind)
12237        PyMem_Free(buf2);
12238
12239    return out;
12240  onError:
12241    Py_DECREF(sep_obj);
12242    Py_DECREF(str_obj);
12243    if (kind1 != kind && buf1)
12244        PyMem_Free(buf1);
12245    if (kind2 != kind && buf2)
12246        PyMem_Free(buf2);
12247    return NULL;
12248}
12249
12250PyDoc_STRVAR(partition__doc__,
12251             "S.partition(sep) -> (head, sep, tail)\n\
12252\n\
12253Search for the separator sep in S, and return the part before it,\n\
12254the separator itself, and the part after it.  If the separator is not\n\
12255found, return S and two empty strings.");
12256
12257static PyObject*
12258unicode_partition(PyObject *self, PyObject *separator)
12259{
12260    return PyUnicode_Partition(self, separator);
12261}
12262
12263PyDoc_STRVAR(rpartition__doc__,
12264             "S.rpartition(sep) -> (head, sep, tail)\n\
12265\n\
12266Search for the separator sep in S, starting at the end of S, and return\n\
12267the part before it, the separator itself, and the part after it.  If the\n\
12268separator is not found, return two empty strings and S.");
12269
12270static PyObject*
12271unicode_rpartition(PyObject *self, PyObject *separator)
12272{
12273    return PyUnicode_RPartition(self, separator);
12274}
12275
12276PyObject *
12277PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12278{
12279    PyObject *result;
12280
12281    s = PyUnicode_FromObject(s);
12282    if (s == NULL)
12283        return NULL;
12284    if (sep != NULL) {
12285        sep = PyUnicode_FromObject(sep);
12286        if (sep == NULL) {
12287            Py_DECREF(s);
12288            return NULL;
12289        }
12290    }
12291
12292    result = rsplit(s, sep, maxsplit);
12293
12294    Py_DECREF(s);
12295    Py_XDECREF(sep);
12296    return result;
12297}
12298
12299PyDoc_STRVAR(rsplit__doc__,
12300             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12301\n\
12302Return a list of the words in S, using sep as the\n\
12303delimiter string, starting at the end of the string and\n\
12304working to the front.  If maxsplit is given, at most maxsplit\n\
12305splits are done. If sep is not specified, any whitespace string\n\
12306is a separator.");
12307
12308static PyObject*
12309unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12310{
12311    static char *kwlist[] = {"sep", "maxsplit", 0};
12312    PyObject *substring = Py_None;
12313    Py_ssize_t maxcount = -1;
12314
12315    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12316                                     kwlist, &substring, &maxcount))
12317        return NULL;
12318
12319    if (substring == Py_None)
12320        return rsplit(self, NULL, maxcount);
12321    else if (PyUnicode_Check(substring))
12322        return rsplit(self, substring, maxcount);
12323    else
12324        return PyUnicode_RSplit(self, substring, maxcount);
12325}
12326
12327PyDoc_STRVAR(splitlines__doc__,
12328             "S.splitlines([keepends]) -> list of strings\n\
12329\n\
12330Return a list of the lines in S, breaking at line boundaries.\n\
12331Line breaks are not included in the resulting list unless keepends\n\
12332is given and true.");
12333
12334static PyObject*
12335unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12336{
12337    static char *kwlist[] = {"keepends", 0};
12338    int keepends = 0;
12339
12340    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12341                                     kwlist, &keepends))
12342        return NULL;
12343
12344    return PyUnicode_Splitlines(self, keepends);
12345}
12346
12347static
12348PyObject *unicode_str(PyObject *self)
12349{
12350    return unicode_result_unchanged(self);
12351}
12352
12353PyDoc_STRVAR(swapcase__doc__,
12354             "S.swapcase() -> str\n\
12355\n\
12356Return a copy of S with uppercase characters converted to lowercase\n\
12357and vice versa.");
12358
12359static PyObject*
12360unicode_swapcase(PyObject *self)
12361{
12362    if (PyUnicode_READY(self) == -1)
12363        return NULL;
12364    return case_operation(self, do_swapcase);
12365}
12366
12367PyDoc_STRVAR(maketrans__doc__,
12368             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12369\n\
12370Return a translation table usable for str.translate().\n\
12371If there is only one argument, it must be a dictionary mapping Unicode\n\
12372ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12373Character keys will be then converted to ordinals.\n\
12374If there are two arguments, they must be strings of equal length, and\n\
12375in the resulting dictionary, each character in x will be mapped to the\n\
12376character at the same position in y. If there is a third argument, it\n\
12377must be a string, whose characters will be mapped to None in the result.");
12378
12379static PyObject*
12380unicode_maketrans(PyObject *null, PyObject *args)
12381{
12382    PyObject *x, *y = NULL, *z = NULL;
12383    PyObject *new = NULL, *key, *value;
12384    Py_ssize_t i = 0;
12385    int res;
12386
12387    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12388        return NULL;
12389    new = PyDict_New();
12390    if (!new)
12391        return NULL;
12392    if (y != NULL) {
12393        int x_kind, y_kind, z_kind;
12394        void *x_data, *y_data, *z_data;
12395
12396        /* x must be a string too, of equal length */
12397        if (!PyUnicode_Check(x)) {
12398            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12399                            "be a string if there is a second argument");
12400            goto err;
12401        }
12402        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12403            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12404                            "arguments must have equal length");
12405            goto err;
12406        }
12407        /* create entries for translating chars in x to those in y */
12408        x_kind = PyUnicode_KIND(x);
12409        y_kind = PyUnicode_KIND(y);
12410        x_data = PyUnicode_DATA(x);
12411        y_data = PyUnicode_DATA(y);
12412        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12413            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12414            if (!key)
12415                goto err;
12416            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12417            if (!value) {
12418                Py_DECREF(key);
12419                goto err;
12420            }
12421            res = PyDict_SetItem(new, key, value);
12422            Py_DECREF(key);
12423            Py_DECREF(value);
12424            if (res < 0)
12425                goto err;
12426        }
12427        /* create entries for deleting chars in z */
12428        if (z != NULL) {
12429            z_kind = PyUnicode_KIND(z);
12430            z_data = PyUnicode_DATA(z);
12431            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12432                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12433                if (!key)
12434                    goto err;
12435                res = PyDict_SetItem(new, key, Py_None);
12436                Py_DECREF(key);
12437                if (res < 0)
12438                    goto err;
12439            }
12440        }
12441    } else {
12442        int kind;
12443        void *data;
12444
12445        /* x must be a dict */
12446        if (!PyDict_CheckExact(x)) {
12447            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12448                            "to maketrans it must be a dict");
12449            goto err;
12450        }
12451        /* copy entries into the new dict, converting string keys to int keys */
12452        while (PyDict_Next(x, &i, &key, &value)) {
12453            if (PyUnicode_Check(key)) {
12454                /* convert string keys to integer keys */
12455                PyObject *newkey;
12456                if (PyUnicode_GET_LENGTH(key) != 1) {
12457                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12458                                    "table must be of length 1");
12459                    goto err;
12460                }
12461                kind = PyUnicode_KIND(key);
12462                data = PyUnicode_DATA(key);
12463                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12464                if (!newkey)
12465                    goto err;
12466                res = PyDict_SetItem(new, newkey, value);
12467                Py_DECREF(newkey);
12468                if (res < 0)
12469                    goto err;
12470            } else if (PyLong_Check(key)) {
12471                /* just keep integer keys */
12472                if (PyDict_SetItem(new, key, value) < 0)
12473                    goto err;
12474            } else {
12475                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12476                                "be strings or integers");
12477                goto err;
12478            }
12479        }
12480    }
12481    return new;
12482  err:
12483    Py_DECREF(new);
12484    return NULL;
12485}
12486
12487PyDoc_STRVAR(translate__doc__,
12488             "S.translate(table) -> str\n\
12489\n\
12490Return a copy of the string S, where all characters have been mapped\n\
12491through the given translation table, which must be a mapping of\n\
12492Unicode ordinals to Unicode ordinals, strings, or None.\n\
12493Unmapped characters are left untouched. Characters mapped to None\n\
12494are deleted.");
12495
12496static PyObject*
12497unicode_translate(PyObject *self, PyObject *table)
12498{
12499    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12500}
12501
12502PyDoc_STRVAR(upper__doc__,
12503             "S.upper() -> str\n\
12504\n\
12505Return a copy of S converted to uppercase.");
12506
12507static PyObject*
12508unicode_upper(PyObject *self)
12509{
12510    if (PyUnicode_READY(self) == -1)
12511        return NULL;
12512    if (PyUnicode_IS_ASCII(self))
12513        return ascii_upper_or_lower(self, 0);
12514    return case_operation(self, do_upper);
12515}
12516
12517PyDoc_STRVAR(zfill__doc__,
12518             "S.zfill(width) -> str\n\
12519\n\
12520Pad a numeric string S with zeros on the left, to fill a field\n\
12521of the specified width. The string S is never truncated.");
12522
12523static PyObject *
12524unicode_zfill(PyObject *self, PyObject *args)
12525{
12526    Py_ssize_t fill;
12527    PyObject *u;
12528    Py_ssize_t width;
12529    int kind;
12530    void *data;
12531    Py_UCS4 chr;
12532
12533    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12534        return NULL;
12535
12536    if (PyUnicode_READY(self) == -1)
12537        return NULL;
12538
12539    if (PyUnicode_GET_LENGTH(self) >= width)
12540        return unicode_result_unchanged(self);
12541
12542    fill = width - PyUnicode_GET_LENGTH(self);
12543
12544    u = pad(self, fill, 0, '0');
12545
12546    if (u == NULL)
12547        return NULL;
12548
12549    kind = PyUnicode_KIND(u);
12550    data = PyUnicode_DATA(u);
12551    chr = PyUnicode_READ(kind, data, fill);
12552
12553    if (chr == '+' || chr == '-') {
12554        /* move sign to beginning of string */
12555        PyUnicode_WRITE(kind, data, 0, chr);
12556        PyUnicode_WRITE(kind, data, fill, '0');
12557    }
12558
12559    assert(_PyUnicode_CheckConsistency(u, 1));
12560    return u;
12561}
12562
#if 0
/* Compiled out.  Wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(); its (also compiled out)
   entry in unicode_methods is labelled as being for debugging the
   implementation only. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12570
12571PyDoc_STRVAR(startswith__doc__,
12572             "S.startswith(prefix[, start[, end]]) -> bool\n\
12573\n\
12574Return True if S starts with the specified prefix, False otherwise.\n\
12575With optional start, test S beginning at that position.\n\
12576With optional end, stop comparing S at that position.\n\
12577prefix can also be a tuple of strings to try.");
12578
12579static PyObject *
12580unicode_startswith(PyObject *self,
12581                   PyObject *args)
12582{
12583    PyObject *subobj;
12584    PyObject *substring;
12585    Py_ssize_t start = 0;
12586    Py_ssize_t end = PY_SSIZE_T_MAX;
12587    int result;
12588
12589    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12590        return NULL;
12591    if (PyTuple_Check(subobj)) {
12592        Py_ssize_t i;
12593        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12594            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12595            if (substring == NULL)
12596                return NULL;
12597            result = tailmatch(self, substring, start, end, -1);
12598            Py_DECREF(substring);
12599            if (result) {
12600                Py_RETURN_TRUE;
12601            }
12602        }
12603        /* nothing matched */
12604        Py_RETURN_FALSE;
12605    }
12606    substring = PyUnicode_FromObject(subobj);
12607    if (substring == NULL) {
12608        if (PyErr_ExceptionMatches(PyExc_TypeError))
12609            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12610                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12611        return NULL;
12612    }
12613    result = tailmatch(self, substring, start, end, -1);
12614    Py_DECREF(substring);
12615    return PyBool_FromLong(result);
12616}
12617
12618
12619PyDoc_STRVAR(endswith__doc__,
12620             "S.endswith(suffix[, start[, end]]) -> bool\n\
12621\n\
12622Return True if S ends with the specified suffix, False otherwise.\n\
12623With optional start, test S beginning at that position.\n\
12624With optional end, stop comparing S at that position.\n\
12625suffix can also be a tuple of strings to try.");
12626
12627static PyObject *
12628unicode_endswith(PyObject *self,
12629                 PyObject *args)
12630{
12631    PyObject *subobj;
12632    PyObject *substring;
12633    Py_ssize_t start = 0;
12634    Py_ssize_t end = PY_SSIZE_T_MAX;
12635    int result;
12636
12637    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12638        return NULL;
12639    if (PyTuple_Check(subobj)) {
12640        Py_ssize_t i;
12641        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12642            substring = PyUnicode_FromObject(
12643                PyTuple_GET_ITEM(subobj, i));
12644            if (substring == NULL)
12645                return NULL;
12646            result = tailmatch(self, substring, start, end, +1);
12647            Py_DECREF(substring);
12648            if (result) {
12649                Py_RETURN_TRUE;
12650            }
12651        }
12652        Py_RETURN_FALSE;
12653    }
12654    substring = PyUnicode_FromObject(subobj);
12655    if (substring == NULL) {
12656        if (PyErr_ExceptionMatches(PyExc_TypeError))
12657            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12658                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12659        return NULL;
12660    }
12661    result = tailmatch(self, substring, start, end, +1);
12662    Py_DECREF(substring);
12663    return PyBool_FromLong(result);
12664}
12665
12666Py_LOCAL_INLINE(void)
12667_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
12668{
12669    writer->size = PyUnicode_GET_LENGTH(writer->buffer);
12670    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12671    writer->data = PyUnicode_DATA(writer->buffer);
12672    writer->kind = PyUnicode_KIND(writer->buffer);
12673}
12674
12675void
12676_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
12677{
12678    memset(writer, 0, sizeof(*writer));
12679#ifdef Py_DEBUG
12680    writer->kind = 5;    /* invalid kind */
12681#endif
12682    writer->min_length = Py_MAX(min_length, 100);
12683    writer->overallocate = (min_length > 0);
12684}
12685
12686int
12687_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12688                                 Py_ssize_t length, Py_UCS4 maxchar)
12689{
12690    Py_ssize_t newlen;
12691    PyObject *newbuffer;
12692
12693    assert(length > 0);
12694
12695    if (length > PY_SSIZE_T_MAX - writer->pos) {
12696        PyErr_NoMemory();
12697        return -1;
12698    }
12699    newlen = writer->pos + length;
12700
12701    if (writer->buffer == NULL) {
12702        if (writer->overallocate) {
12703            /* overallocate 25% to limit the number of resize */
12704            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12705                newlen += newlen / 4;
12706            if (newlen < writer->min_length)
12707                newlen = writer->min_length;
12708        }
12709        writer->buffer = PyUnicode_New(newlen, maxchar);
12710        if (writer->buffer == NULL)
12711            return -1;
12712        _PyUnicodeWriter_Update(writer);
12713        return 0;
12714    }
12715
12716    if (newlen > writer->size) {
12717        if (writer->overallocate) {
12718            /* overallocate 25% to limit the number of resize */
12719            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12720                newlen += newlen / 4;
12721            if (newlen < writer->min_length)
12722                newlen = writer->min_length;
12723        }
12724
12725        if (maxchar > writer->maxchar || writer->readonly) {
12726            /* resize + widen */
12727            newbuffer = PyUnicode_New(newlen, maxchar);
12728            if (newbuffer == NULL)
12729                return -1;
12730            _PyUnicode_FastCopyCharacters(newbuffer, 0,
12731                                          writer->buffer, 0, writer->pos);
12732            Py_DECREF(writer->buffer);
12733            writer->readonly = 0;
12734        }
12735        else {
12736            newbuffer = resize_compact(writer->buffer, newlen);
12737            if (newbuffer == NULL)
12738                return -1;
12739        }
12740        writer->buffer = newbuffer;
12741        _PyUnicodeWriter_Update(writer);
12742    }
12743    else if (maxchar > writer->maxchar) {
12744        assert(!writer->readonly);
12745        newbuffer = PyUnicode_New(writer->size, maxchar);
12746        if (newbuffer == NULL)
12747            return -1;
12748        _PyUnicode_FastCopyCharacters(newbuffer, 0,
12749                                      writer->buffer, 0, writer->pos);
12750        Py_DECREF(writer->buffer);
12751        writer->buffer = newbuffer;
12752        _PyUnicodeWriter_Update(writer);
12753    }
12754    return 0;
12755}
12756
12757int
12758_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12759{
12760    Py_UCS4 maxchar;
12761    Py_ssize_t len;
12762
12763    if (PyUnicode_READY(str) == -1)
12764        return -1;
12765    len = PyUnicode_GET_LENGTH(str);
12766    if (len == 0)
12767        return 0;
12768    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12769    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
12770        if (writer->buffer == NULL && !writer->overallocate) {
12771            Py_INCREF(str);
12772            writer->buffer = str;
12773            _PyUnicodeWriter_Update(writer);
12774            writer->readonly = 1;
12775            writer->size = 0;
12776            writer->pos += len;
12777            return 0;
12778        }
12779        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12780            return -1;
12781    }
12782    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12783                                  str, 0, len);
12784    writer->pos += len;
12785    return 0;
12786}
12787
12788int
12789_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12790{
12791    Py_UCS4 maxchar;
12792
12793    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12794    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12795        return -1;
12796    unicode_write_cstr(writer->buffer, writer->pos, str, len);
12797    writer->pos += len;
12798    return 0;
12799}
12800
12801PyObject *
12802_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
12803{
12804    if (writer->pos == 0) {
12805        Py_XDECREF(writer->buffer);
12806        Py_INCREF(unicode_empty);
12807        return unicode_empty;
12808    }
12809    if (writer->readonly) {
12810        assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12811        return writer->buffer;
12812    }
12813    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12814        PyObject *newbuffer;
12815        newbuffer = resize_compact(writer->buffer, writer->pos);
12816        if (newbuffer == NULL) {
12817            Py_DECREF(writer->buffer);
12818            return NULL;
12819        }
12820        writer->buffer = newbuffer;
12821    }
12822    assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
12823    return writer->buffer;
12824}
12825
12826void
12827_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
12828{
12829    Py_CLEAR(writer->buffer);
12830}
12831
12832#include "stringlib/unicode_format.h"
12833
/* Docstrings for the "format" and "format_map" entries of unicode_methods,
   which are bound to do_string_format and do_string_format_map. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
12845
12846static PyObject *
12847unicode__format__(PyObject* self, PyObject* args)
12848{
12849    PyObject *format_spec;
12850    _PyUnicodeWriter writer;
12851    int ret;
12852
12853    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12854        return NULL;
12855
12856    if (PyUnicode_READY(self) == -1)
12857        return NULL;
12858    _PyUnicodeWriter_Init(&writer, 0);
12859    ret = _PyUnicode_FormatAdvancedWriter(&writer,
12860                                          self, format_spec, 0,
12861                                          PyUnicode_GET_LENGTH(format_spec));
12862    if (ret == -1) {
12863        _PyUnicodeWriter_Dealloc(&writer);
12864        return NULL;
12865    }
12866    return _PyUnicodeWriter_Finish(&writer);
12867}
12868
12869PyDoc_STRVAR(p_format__doc__,
12870             "S.__format__(format_spec) -> str\n\
12871\n\
12872Return a formatted version of S as described by format_spec.");
12873
12874static PyObject *
12875unicode__sizeof__(PyObject *v)
12876{
12877    Py_ssize_t size;
12878
12879    /* If it's a compact object, account for base structure +
12880       character data. */
12881    if (PyUnicode_IS_COMPACT_ASCII(v))
12882        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12883    else if (PyUnicode_IS_COMPACT(v))
12884        size = sizeof(PyCompactUnicodeObject) +
12885            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
12886    else {
12887        /* If it is a two-block object, account for base object, and
12888           for character block if present. */
12889        size = sizeof(PyUnicodeObject);
12890        if (_PyUnicode_DATA_ANY(v))
12891            size += (PyUnicode_GET_LENGTH(v) + 1) *
12892                PyUnicode_KIND(v);
12893    }
12894    /* If the wstr pointer is present, account for it unless it is shared
12895       with the data pointer. Check if the data is not shared. */
12896    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12897        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12898    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12899        size += PyUnicode_UTF8_LENGTH(v) + 1;
12900
12901    return PyLong_FromSsize_t(size);
12902}
12903
12904PyDoc_STRVAR(sizeof__doc__,
12905             "S.__sizeof__() -> size of S in memory, in bytes");
12906
12907static PyObject *
12908unicode_getnewargs(PyObject *v)
12909{
12910    PyObject *copy = _PyUnicode_Copy(v);
12911    if (!copy)
12912        return NULL;
12913    return Py_BuildValue("(N)", copy);
12914}
12915
/* Method table for the str type.  Each entry gives the Python-visible
   name, the C implementation, the calling-convention flags and the
   docstring; the table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* format/format_map are implemented by do_string_format and
       do_string_format_map, which are not defined in this part of the file */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only static method: it receives no instance */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* no docstring is supplied for __getnewargs__ */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
12972
12973static PyObject *
12974unicode_mod(PyObject *v, PyObject *w)
12975{
12976    if (!PyUnicode_Check(v))
12977        Py_RETURN_NOTIMPLEMENTED;
12978    return PyUnicode_Format(v, w);
12979}
12980
/* Number protocol table: only nb_remainder is filled in (unicode_mod,
   i.e. the % operator).  The slots not listed here are left
   zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
12987
/* Sequence protocol table: length, concatenation, repetition, item access
   and containment are provided; the slice and item-assignment slots are
   left empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
12998
12999static PyObject*
13000unicode_subscript(PyObject* self, PyObject* item)
13001{
13002    if (PyUnicode_READY(self) == -1)
13003        return NULL;
13004
13005    if (PyIndex_Check(item)) {
13006        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13007        if (i == -1 && PyErr_Occurred())
13008            return NULL;
13009        if (i < 0)
13010            i += PyUnicode_GET_LENGTH(self);
13011        return unicode_getitem(self, i);
13012    } else if (PySlice_Check(item)) {
13013        Py_ssize_t start, stop, step, slicelength, cur, i;
13014        PyObject *result;
13015        void *src_data, *dest_data;
13016        int src_kind, dest_kind;
13017        Py_UCS4 ch, max_char, kind_limit;
13018
13019        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13020                                 &start, &stop, &step, &slicelength) < 0) {
13021            return NULL;
13022        }
13023
13024        if (slicelength <= 0) {
13025            Py_INCREF(unicode_empty);
13026            return unicode_empty;
13027        } else if (start == 0 && step == 1 &&
13028                   slicelength == PyUnicode_GET_LENGTH(self)) {
13029            return unicode_result_unchanged(self);
13030        } else if (step == 1) {
13031            return PyUnicode_Substring(self,
13032                                       start, start + slicelength);
13033        }
13034        /* General case */
13035        src_kind = PyUnicode_KIND(self);
13036        src_data = PyUnicode_DATA(self);
13037        if (!PyUnicode_IS_ASCII(self)) {
13038            kind_limit = kind_maxchar_limit(src_kind);
13039            max_char = 0;
13040            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13041                ch = PyUnicode_READ(src_kind, src_data, cur);
13042                if (ch > max_char) {
13043                    max_char = ch;
13044                    if (max_char >= kind_limit)
13045                        break;
13046                }
13047            }
13048        }
13049        else
13050            max_char = 127;
13051        result = PyUnicode_New(slicelength, max_char);
13052        if (result == NULL)
13053            return NULL;
13054        dest_kind = PyUnicode_KIND(result);
13055        dest_data = PyUnicode_DATA(result);
13056
13057        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13058            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13059            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13060        }
13061        assert(_PyUnicode_CheckConsistency(result, 1));
13062        return result;
13063    } else {
13064        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13065        return NULL;
13066    }
13067}
13068
/* Mapping protocol table: length and subscript (index or slice) lookup
   are provided; the subscript-assignment slot is empty. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13074
13075
13076/* Helpers for PyUnicode_Format() */
13077
/* State shared by the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Argument source read by unicode_format_getnextarg(): when arglen is
       negative, args itself is returned as the argument; otherwise args is
       indexed as a tuple at argidx, and argidx is advanced on each fetch. */
    PyObject *args;
    int args_owned;     /* presumably: whether a reference to args is held
                           here -- TODO confirm against PyUnicode_Format() */
    Py_ssize_t arglen, argidx;
    PyObject *dict;     /* presumably the mapping used for %(name)s lookups
                           -- TODO confirm */

    /* NOTE(review): the fmt* fields look like the format string object, its
       kind/data, and the scan position/remaining count; their use is not
       visible in this part of the file -- confirm. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    _PyUnicodeWriter writer;    /* writer embedded in the formatter state */
};
13091
/* One parsed conversion specification, as consumed by formatfloat() and
   formatlong(). */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; formatfloat() passes it to
                           PyOS_double_to_string() as the format code */
    int flags;          /* bitmask of F_* flags; formatfloat() tests F_ALT */
    Py_ssize_t width;   /* presumably the minimum field width -- TODO confirm */
    int prec;           /* precision; formatfloat() substitutes 6 when it is
                           negative */
    int sign;           /* NOTE(review): meaning not visible in this part of
                           the file -- confirm */
};
13099
13100static PyObject *
13101unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13102{
13103    Py_ssize_t argidx = ctx->argidx;
13104
13105    if (argidx < ctx->arglen) {
13106        ctx->argidx++;
13107        if (ctx->arglen < 0)
13108            return ctx->args;
13109        else
13110            return PyTuple_GetItem(ctx->args, argidx);
13111    }
13112    PyErr_SetString(PyExc_TypeError,
13113                    "not enough arguments for format string");
13114    return NULL;
13115}
13116
13117/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13118
13119/* Format a float into the writer if the writer is not NULL, or into *p_output
13120   otherwise.
13121
13122   Return 0 on success, raise an exception and return -1 on error. */
13123static int
13124formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13125            PyObject **p_output,
13126            _PyUnicodeWriter *writer)
13127{
13128    char *p;
13129    double x;
13130    Py_ssize_t len;
13131    int prec;
13132    int dtoa_flags;
13133
13134    x = PyFloat_AsDouble(v);
13135    if (x == -1.0 && PyErr_Occurred())
13136        return -1;
13137
13138    prec = arg->prec;
13139    if (prec < 0)
13140        prec = 6;
13141
13142    if (arg->flags & F_ALT)
13143        dtoa_flags = Py_DTSF_ALT;
13144    else
13145        dtoa_flags = 0;
13146    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13147    if (p == NULL)
13148        return -1;
13149    len = strlen(p);
13150    if (writer) {
13151        if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13152            PyMem_Free(p);
13153            return -1;
13154        }
13155        unicode_write_cstr(writer->buffer, writer->pos, p, len);
13156        writer->pos += len;
13157    }
13158    else
13159        *p_output = _PyUnicode_FromASCII(p, len);
13160    PyMem_Free(p);
13161    return 0;
13162}
13163
13164/* formatlong() emulates the format codes d, u, o, x and X, and
13165 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13166 * Python's regular ints.
13167 * Return value:  a new PyUnicodeObject*, or NULL if error.
13168 *     The output string is of the form
13169 *         "-"? ("0x" | "0X")? digit+
13170 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13171 *         set in flags.  The case of hex digits will be correct,
13172 *     There will be at least prec digits, zero-filled on the left if
13173 *         necessary to get that many.
13174 * val          object to be converted
13175 * flags        bitmask of format flags; only F_ALT is looked at
13176 * prec         minimum number of digits; 0-fill on left if needed
13177 * type         a character in [duoxX]; u acts the same as d
13178 *
13179 * CAUTION:  o, x and X conversions on regular ints can never
13180 * produce a '-' sign, but can for Python's unbounded ints.
13181 */
13182static PyObject*
13183formatlong(PyObject *val, struct unicode_format_arg_t *arg)
13184{
13185    PyObject *result = NULL;
13186    char *buf;
13187    Py_ssize_t i;
13188    int sign;           /* 1 if '-', else 0 */
13189    int len;            /* number of characters */
13190    Py_ssize_t llen;
13191    int numdigits;      /* len == numnondigits + numdigits */
13192    int numnondigits = 0;
13193    int prec = arg->prec;
13194    int type = arg->ch;
13195
13196    /* Avoid exceeding SSIZE_T_MAX */
13197    if (prec > INT_MAX-3) {
13198        PyErr_SetString(PyExc_OverflowError,
13199                        "precision too large");
13200        return NULL;
13201    }
13202
13203    assert(PyLong_Check(val));
13204
13205    switch (type) {
13206    default:
13207        assert(!"'type' not in [diuoxX]");
13208    case 'd':
13209    case 'i':
13210    case 'u':
13211        /* Special-case boolean: we want 0/1 */
13212        if (PyBool_Check(val))
13213            result = PyNumber_ToBase(val, 10);
13214        else
13215            result = Py_TYPE(val)->tp_str(val);
13216        break;
13217    case 'o':
13218        numnondigits = 2;
13219        result = PyNumber_ToBase(val, 8);
13220        break;
13221    case 'x':
13222    case 'X':
13223        numnondigits = 2;
13224        result = PyNumber_ToBase(val, 16);
13225        break;
13226    }
13227    if (!result)
13228        return NULL;
13229
13230    assert(unicode_modifiable(result));
13231    assert(PyUnicode_IS_READY(result));
13232    assert(PyUnicode_IS_ASCII(result));
13233
13234    /* To modify the string in-place, there can only be one reference. */
13235    if (Py_REFCNT(result) != 1) {
13236        PyErr_BadInternalCall();
13237        return NULL;
13238    }
13239    buf = PyUnicode_DATA(result);
13240    llen = PyUnicode_GET_LENGTH(result);
13241    if (llen > INT_MAX) {
13242        PyErr_SetString(PyExc_ValueError,
13243                        "string too large in _PyBytes_FormatLong");
13244        return NULL;
13245    }
13246    len = (int)llen;
13247    sign = buf[0] == '-';
13248    numnondigits += sign;
13249    numdigits = len - numnondigits;
13250    assert(numdigits > 0);
13251
13252    /* Get rid of base marker unless F_ALT */
13253    if (((arg->flags & F_ALT) == 0 &&
13254        (type == 'o' || type == 'x' || type == 'X'))) {
13255        assert(buf[sign] == '0');
13256        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13257               buf[sign+1] == 'o');
13258        numnondigits -= 2;
13259        buf += 2;
13260        len -= 2;
13261        if (sign)
13262            buf[0] = '-';
13263        assert(len == numnondigits + numdigits);
13264        assert(numdigits > 0);
13265    }
13266
13267    /* Fill with leading zeroes to meet minimum width. */
13268    if (prec > numdigits) {
13269        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13270                                numnondigits + prec);
13271        char *b1;
13272        if (!r1) {
13273            Py_DECREF(result);
13274            return NULL;
13275        }
13276        b1 = PyBytes_AS_STRING(r1);
13277        for (i = 0; i < numnondigits; ++i)
13278            *b1++ = *buf++;
13279        for (i = 0; i < prec - numdigits; i++)
13280            *b1++ = '0';
13281        for (i = 0; i < numdigits; i++)
13282            *b1++ = *buf++;
13283        *b1 = '\0';
13284        Py_DECREF(result);
13285        result = r1;
13286        buf = PyBytes_AS_STRING(result);
13287        len = numnondigits + prec;
13288    }
13289
13290    /* Fix up case for hex conversions. */
13291    if (type == 'X') {
13292        /* Need to convert all lower case letters to upper case.
13293           and need to convert 0x to 0X (and -0x to -0X). */
13294        for (i = 0; i < len; i++)
13295            if (buf[i] >= 'a' && buf[i] <= 'x')
13296                buf[i] -= 'a'-'A';
13297    }
13298    if (!PyUnicode_Check(result)
13299        || buf != PyUnicode_DATA(result)) {
13300        PyObject *unicode;
13301        unicode = _PyUnicode_FromASCII(buf, len);
13302        Py_DECREF(result);
13303        result = unicode;
13304    }
13305    else if (len != PyUnicode_GET_LENGTH(result)) {
13306        if (PyUnicode_Resize(&result, len) < 0)
13307            Py_CLEAR(result);
13308    }
13309    return result;
13310}
13311
13312/* Format an integer.
13313 * Return 1 if the number has been formatted into the writer,
13314 *        0 if the number has been formatted into *p_output
13315 *       -1 and raise an exception on error */
13316static int
13317mainformatlong(PyObject *v,
13318               struct unicode_format_arg_t *arg,
13319               PyObject **p_output,
13320               _PyUnicodeWriter *writer)
13321{
13322    PyObject *iobj, *res;
13323    char type = (char)arg->ch;
13324
13325    if (!PyNumber_Check(v))
13326        goto wrongtype;
13327
13328    if (!PyLong_Check(v)) {
13329        iobj = PyNumber_Long(v);
13330        if (iobj == NULL) {
13331            if (PyErr_ExceptionMatches(PyExc_TypeError))
13332                goto wrongtype;
13333            return -1;
13334        }
13335        assert(PyLong_Check(iobj));
13336    }
13337    else {
13338        iobj = v;
13339        Py_INCREF(iobj);
13340    }
13341
13342    if (PyLong_CheckExact(v)
13343        && arg->width == -1 && arg->prec == -1
13344        && !(arg->flags & (F_SIGN | F_BLANK))
13345        && type != 'X')
13346    {
13347        /* Fast path */
13348        int alternate = arg->flags & F_ALT;
13349        int base;
13350
13351        switch(type)
13352        {
13353            default:
13354                assert(0 && "'type' not in [diuoxX]");
13355            case 'd':
13356            case 'i':
13357            case 'u':
13358                base = 10;
13359                break;
13360            case 'o':
13361                base = 8;
13362                break;
13363            case 'x':
13364            case 'X':
13365                base = 16;
13366                break;
13367        }
13368
13369        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13370            Py_DECREF(iobj);
13371            return -1;
13372        }
13373        Py_DECREF(iobj);
13374        return 1;
13375    }
13376
13377    res = formatlong(iobj, arg);
13378    Py_DECREF(iobj);
13379    if (res == NULL)
13380        return -1;
13381    *p_output = res;
13382    return 0;
13383
13384wrongtype:
13385    PyErr_Format(PyExc_TypeError,
13386            "%%%c format: a number is required, "
13387            "not %.200s",
13388            type, Py_TYPE(v)->tp_name);
13389    return -1;
13390}
13391
13392static Py_UCS4
13393formatchar(PyObject *v)
13394{
13395    /* presume that the buffer is at least 3 characters long */
13396    if (PyUnicode_Check(v)) {
13397        if (PyUnicode_GET_LENGTH(v) == 1) {
13398            return PyUnicode_READ_CHAR(v, 0);
13399        }
13400        goto onError;
13401    }
13402    else {
13403        /* Integer input truncated to a character */
13404        long x;
13405        x = PyLong_AsLong(v);
13406        if (x == -1 && PyErr_Occurred())
13407            goto onError;
13408
13409        if (x < 0 || x > MAX_UNICODE) {
13410            PyErr_SetString(PyExc_OverflowError,
13411                            "%c arg not in range(0x110000)");
13412            return (Py_UCS4) -1;
13413        }
13414
13415        return (Py_UCS4) x;
13416    }
13417
13418  onError:
13419    PyErr_SetString(PyExc_TypeError,
13420                    "%c requires int or char");
13421    return (Py_UCS4) -1;
13422}
13423
13424/* Parse options of an argument: flags, width, precision.
13425   Handle also "%(name)" syntax.
13426
13427   Return 0 if the argument has been formatted into arg->str.
13428   Return 1 if the argument has been written into ctx->writer,
13429   Raise an exception and return -1 on error. */
13430static int
13431unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13432                         struct unicode_format_arg_t *arg)
13433{
13434#define FORMAT_READ(ctx) \
13435        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13436
13437    PyObject *v;
13438
13439    arg->ch = FORMAT_READ(ctx);
13440    if (arg->ch == '(') {
13441        /* Get argument value from a dictionary. Example: "%(name)s". */
13442        Py_ssize_t keystart;
13443        Py_ssize_t keylen;
13444        PyObject *key;
13445        int pcount = 1;
13446
13447        if (ctx->dict == NULL) {
13448            PyErr_SetString(PyExc_TypeError,
13449                            "format requires a mapping");
13450            return -1;
13451        }
13452        ++ctx->fmtpos;
13453        --ctx->fmtcnt;
13454        keystart = ctx->fmtpos;
13455        /* Skip over balanced parentheses */
13456        while (pcount > 0 && --ctx->fmtcnt >= 0) {
13457            arg->ch = FORMAT_READ(ctx);
13458            if (arg->ch == ')')
13459                --pcount;
13460            else if (arg->ch == '(')
13461                ++pcount;
13462            ctx->fmtpos++;
13463        }
13464        keylen = ctx->fmtpos - keystart - 1;
13465        if (ctx->fmtcnt < 0 || pcount > 0) {
13466            PyErr_SetString(PyExc_ValueError,
13467                            "incomplete format key");
13468            return -1;
13469        }
13470        key = PyUnicode_Substring(ctx->fmtstr,
13471                                  keystart, keystart + keylen);
13472        if (key == NULL)
13473            return -1;
13474        if (ctx->args_owned) {
13475            Py_DECREF(ctx->args);
13476            ctx->args_owned = 0;
13477        }
13478        ctx->args = PyObject_GetItem(ctx->dict, key);
13479        Py_DECREF(key);
13480        if (ctx->args == NULL)
13481            return -1;
13482        ctx->args_owned = 1;
13483        ctx->arglen = -1;
13484        ctx->argidx = -2;
13485    }
13486
13487    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13488    arg->flags = 0;
13489    while (--ctx->fmtcnt >= 0) {
13490        arg->ch = FORMAT_READ(ctx);
13491        ctx->fmtpos++;
13492        switch (arg->ch) {
13493        case '-': arg->flags |= F_LJUST; continue;
13494        case '+': arg->flags |= F_SIGN; continue;
13495        case ' ': arg->flags |= F_BLANK; continue;
13496        case '#': arg->flags |= F_ALT; continue;
13497        case '0': arg->flags |= F_ZERO; continue;
13498        }
13499        break;
13500    }
13501
13502    /* Parse width. Example: "%10s" => width=10 */
13503    arg->width = -1;
13504    if (arg->ch == '*') {
13505        v = unicode_format_getnextarg(ctx);
13506        if (v == NULL)
13507            return -1;
13508        if (!PyLong_Check(v)) {
13509            PyErr_SetString(PyExc_TypeError,
13510                            "* wants int");
13511            return -1;
13512        }
13513        arg->width = PyLong_AsLong(v);
13514        if (arg->width == -1 && PyErr_Occurred())
13515            return -1;
13516        if (arg->width < 0) {
13517            arg->flags |= F_LJUST;
13518            arg->width = -arg->width;
13519        }
13520        if (--ctx->fmtcnt >= 0) {
13521            arg->ch = FORMAT_READ(ctx);
13522            ctx->fmtpos++;
13523        }
13524    }
13525    else if (arg->ch >= '0' && arg->ch <= '9') {
13526        arg->width = arg->ch - '0';
13527        while (--ctx->fmtcnt >= 0) {
13528            arg->ch = FORMAT_READ(ctx);
13529            ctx->fmtpos++;
13530            if (arg->ch < '0' || arg->ch > '9')
13531                break;
13532            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13533               mixing signed and unsigned comparison. Since arg->ch is between
13534               '0' and '9', casting to int is safe. */
13535            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13536                PyErr_SetString(PyExc_ValueError,
13537                                "width too big");
13538                return -1;
13539            }
13540            arg->width = arg->width*10 + (arg->ch - '0');
13541        }
13542    }
13543
13544    /* Parse precision. Example: "%.3f" => prec=3 */
13545    arg->prec = -1;
13546    if (arg->ch == '.') {
13547        arg->prec = 0;
13548        if (--ctx->fmtcnt >= 0) {
13549            arg->ch = FORMAT_READ(ctx);
13550            ctx->fmtpos++;
13551        }
13552        if (arg->ch == '*') {
13553            v = unicode_format_getnextarg(ctx);
13554            if (v == NULL)
13555                return -1;
13556            if (!PyLong_Check(v)) {
13557                PyErr_SetString(PyExc_TypeError,
13558                                "* wants int");
13559                return -1;
13560            }
13561            arg->prec = PyLong_AsLong(v);
13562            if (arg->prec == -1 && PyErr_Occurred())
13563                return -1;
13564            if (arg->prec < 0)
13565                arg->prec = 0;
13566            if (--ctx->fmtcnt >= 0) {
13567                arg->ch = FORMAT_READ(ctx);
13568                ctx->fmtpos++;
13569            }
13570        }
13571        else if (arg->ch >= '0' && arg->ch <= '9') {
13572            arg->prec = arg->ch - '0';
13573            while (--ctx->fmtcnt >= 0) {
13574                arg->ch = FORMAT_READ(ctx);
13575                ctx->fmtpos++;
13576                if (arg->ch < '0' || arg->ch > '9')
13577                    break;
13578                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13579                    PyErr_SetString(PyExc_ValueError,
13580                                    "precision too big");
13581                    return -1;
13582                }
13583                arg->prec = arg->prec*10 + (arg->ch - '0');
13584            }
13585        }
13586    }
13587
13588    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13589    if (ctx->fmtcnt >= 0) {
13590        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13591            if (--ctx->fmtcnt >= 0) {
13592                arg->ch = FORMAT_READ(ctx);
13593                ctx->fmtpos++;
13594            }
13595        }
13596    }
13597    if (ctx->fmtcnt < 0) {
13598        PyErr_SetString(PyExc_ValueError,
13599                        "incomplete format");
13600        return -1;
13601    }
13602    return 0;
13603
13604#undef FORMAT_READ
13605}
13606
13607/* Format one argument. Supported conversion specifiers:
13608
13609   - "s", "r", "a": any type
13610   - "i", "d", "u", "o", "x", "X": int
13611   - "e", "E", "f", "F", "g", "G": float
13612   - "c": int or str (1 character)
13613
13614   Return 0 if the argument has been formatted into *p_str,
13615          1 if the argument has been written into ctx->writer,
13616          -1 on error. */
13617static int
13618unicode_format_arg_format(struct unicode_formatter_t *ctx,
13619                          struct unicode_format_arg_t *arg,
13620                          PyObject **p_str)
13621{
13622    PyObject *v;
13623    _PyUnicodeWriter *writer = &ctx->writer;
13624
13625    if (ctx->fmtcnt == 0)
13626        ctx->writer.overallocate = 0;
13627
13628    if (arg->ch == '%') {
13629        if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13630            return -1;
13631        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13632        writer->pos += 1;
13633        return 1;
13634    }
13635
13636    v = unicode_format_getnextarg(ctx);
13637    if (v == NULL)
13638        return -1;
13639
13640    arg->sign = 0;
13641
13642    switch (arg->ch) {
13643
13644    case 's':
13645    case 'r':
13646    case 'a':
13647        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13648            /* Fast path */
13649            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13650                return -1;
13651            return 1;
13652        }
13653
13654        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13655            *p_str = v;
13656            Py_INCREF(*p_str);
13657        }
13658        else {
13659            if (arg->ch == 's')
13660                *p_str = PyObject_Str(v);
13661            else if (arg->ch == 'r')
13662                *p_str = PyObject_Repr(v);
13663            else
13664                *p_str = PyObject_ASCII(v);
13665        }
13666        break;
13667
13668    case 'i':
13669    case 'd':
13670    case 'u':
13671    case 'o':
13672    case 'x':
13673    case 'X':
13674    {
13675        int ret = mainformatlong(v, arg, p_str, writer);
13676        if (ret != 0)
13677            return ret;
13678        arg->sign = 1;
13679        break;
13680    }
13681
13682    case 'e':
13683    case 'E':
13684    case 'f':
13685    case 'F':
13686    case 'g':
13687    case 'G':
13688        if (arg->width == -1 && arg->prec == -1
13689            && !(arg->flags & (F_SIGN | F_BLANK)))
13690        {
13691            /* Fast path */
13692            if (formatfloat(v, arg, NULL, writer) == -1)
13693                return -1;
13694            return 1;
13695        }
13696
13697        arg->sign = 1;
13698        if (formatfloat(v, arg, p_str, NULL) == -1)
13699            return -1;
13700        break;
13701
13702    case 'c':
13703    {
13704        Py_UCS4 ch = formatchar(v);
13705        if (ch == (Py_UCS4) -1)
13706            return -1;
13707        if (arg->width == -1 && arg->prec == -1) {
13708            /* Fast path */
13709            if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13710                return -1;
13711            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13712            writer->pos += 1;
13713            return 1;
13714        }
13715        *p_str = PyUnicode_FromOrdinal(ch);
13716        break;
13717    }
13718
13719    default:
13720        PyErr_Format(PyExc_ValueError,
13721                     "unsupported format character '%c' (0x%x) "
13722                     "at index %zd",
13723                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13724                     (int)arg->ch,
13725                     ctx->fmtpos - 1);
13726        return -1;
13727    }
13728    if (*p_str == NULL)
13729        return -1;
13730    assert (PyUnicode_Check(*p_str));
13731    return 0;
13732}
13733
13734static int
13735unicode_format_arg_output(struct unicode_formatter_t *ctx,
13736                          struct unicode_format_arg_t *arg,
13737                          PyObject *str)
13738{
13739    Py_ssize_t len;
13740    enum PyUnicode_Kind kind;
13741    void *pbuf;
13742    Py_ssize_t pindex;
13743    Py_UCS4 signchar;
13744    Py_ssize_t buflen;
13745    Py_UCS4 maxchar, bufmaxchar;
13746    Py_ssize_t sublen;
13747    _PyUnicodeWriter *writer = &ctx->writer;
13748    Py_UCS4 fill;
13749
13750    fill = ' ';
13751    if (arg->sign && arg->flags & F_ZERO)
13752        fill = '0';
13753
13754    if (PyUnicode_READY(str) == -1)
13755        return -1;
13756
13757    len = PyUnicode_GET_LENGTH(str);
13758    if ((arg->width == -1 || arg->width <= len)
13759        && (arg->prec == -1 || arg->prec >= len)
13760        && !(arg->flags & (F_SIGN | F_BLANK)))
13761    {
13762        /* Fast path */
13763        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13764            return -1;
13765        return 0;
13766    }
13767
13768    /* Truncate the string for "s", "r" and "a" formats
13769       if the precision is set */
13770    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13771        if (arg->prec >= 0 && len > arg->prec)
13772            len = arg->prec;
13773    }
13774
13775    /* Adjust sign and width */
13776    kind = PyUnicode_KIND(str);
13777    pbuf = PyUnicode_DATA(str);
13778    pindex = 0;
13779    signchar = '\0';
13780    if (arg->sign) {
13781        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13782        if (ch == '-' || ch == '+') {
13783            signchar = ch;
13784            len--;
13785            pindex++;
13786        }
13787        else if (arg->flags & F_SIGN)
13788            signchar = '+';
13789        else if (arg->flags & F_BLANK)
13790            signchar = ' ';
13791        else
13792            arg->sign = 0;
13793    }
13794    if (arg->width < len)
13795        arg->width = len;
13796
13797    /* Prepare the writer */
13798    bufmaxchar = 127;
13799    if (!(arg->flags & F_LJUST)) {
13800        if (arg->sign) {
13801            if ((arg->width-1) > len)
13802                bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13803        }
13804        else {
13805            if (arg->width > len)
13806                bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13807        }
13808    }
13809    maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13810    bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13811    buflen = arg->width;
13812    if (arg->sign && len == arg->width)
13813        buflen++;
13814    if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13815        return -1;
13816
13817    /* Write the sign if needed */
13818    if (arg->sign) {
13819        if (fill != ' ') {
13820            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13821            writer->pos += 1;
13822        }
13823        if (arg->width > len)
13824            arg->width--;
13825    }
13826
13827    /* Write the numeric prefix for "x", "X" and "o" formats
13828       if the alternate form is used.
13829       For example, write "0x" for the "%#x" format. */
13830    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13831        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13832        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13833        if (fill != ' ') {
13834            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13835            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13836            writer->pos += 2;
13837            pindex += 2;
13838        }
13839        arg->width -= 2;
13840        if (arg->width < 0)
13841            arg->width = 0;
13842        len -= 2;
13843    }
13844
13845    /* Pad left with the fill character if needed */
13846    if (arg->width > len && !(arg->flags & F_LJUST)) {
13847        sublen = arg->width - len;
13848        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13849        writer->pos += sublen;
13850        arg->width = len;
13851    }
13852
13853    /* If padding with spaces: write sign if needed and/or numeric prefix if
13854       the alternate form is used */
13855    if (fill == ' ') {
13856        if (arg->sign) {
13857            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13858            writer->pos += 1;
13859        }
13860        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13861            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13862            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13863            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13864            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13865            writer->pos += 2;
13866            pindex += 2;
13867        }
13868    }
13869
13870    /* Write characters */
13871    if (len) {
13872        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13873                                      str, pindex, len);
13874        writer->pos += len;
13875    }
13876
13877    /* Pad right with the fill character if needed */
13878    if (arg->width > len) {
13879        sublen = arg->width - len;
13880        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13881        writer->pos += sublen;
13882    }
13883    return 0;
13884}
13885
13886/* Helper of PyUnicode_Format(): format one arg.
13887   Return 0 on success, raise an exception and return -1 on error. */
13888static int
13889unicode_format_arg(struct unicode_formatter_t *ctx)
13890{
13891    struct unicode_format_arg_t arg;
13892    PyObject *str;
13893    int ret;
13894
13895    ret = unicode_format_arg_parse(ctx, &arg);
13896    if (ret == -1)
13897        return -1;
13898
13899    ret = unicode_format_arg_format(ctx, &arg, &str);
13900    if (ret == -1)
13901        return -1;
13902
13903    if (ret != 1) {
13904        ret = unicode_format_arg_output(ctx, &arg, str);
13905        Py_DECREF(str);
13906        if (ret == -1)
13907            return -1;
13908    }
13909
13910    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13911        PyErr_SetString(PyExc_TypeError,
13912                        "not all arguments converted during string formatting");
13913        return -1;
13914    }
13915    return 0;
13916}
13917
13918PyObject *
13919PyUnicode_Format(PyObject *format, PyObject *args)
13920{
13921    struct unicode_formatter_t ctx;
13922
13923    if (format == NULL || args == NULL) {
13924        PyErr_BadInternalCall();
13925        return NULL;
13926    }
13927
13928    ctx.fmtstr = PyUnicode_FromObject(format);
13929    if (ctx.fmtstr == NULL)
13930        return NULL;
13931    if (PyUnicode_READY(ctx.fmtstr) == -1) {
13932        Py_DECREF(ctx.fmtstr);
13933        return NULL;
13934    }
13935    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13936    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13937    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13938    ctx.fmtpos = 0;
13939
13940    _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
13941
13942    if (PyTuple_Check(args)) {
13943        ctx.arglen = PyTuple_Size(args);
13944        ctx.argidx = 0;
13945    }
13946    else {
13947        ctx.arglen = -1;
13948        ctx.argidx = -2;
13949    }
13950    ctx.args_owned = 0;
13951    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
13952        ctx.dict = args;
13953    else
13954        ctx.dict = NULL;
13955    ctx.args = args;
13956
13957    while (--ctx.fmtcnt >= 0) {
13958        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13959            Py_ssize_t nonfmtpos, sublen;
13960            Py_UCS4 maxchar;
13961
13962            nonfmtpos = ctx.fmtpos++;
13963            while (ctx.fmtcnt >= 0 &&
13964                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13965                ctx.fmtpos++;
13966                ctx.fmtcnt--;
13967            }
13968            if (ctx.fmtcnt < 0) {
13969                ctx.fmtpos--;
13970                ctx.writer.overallocate = 0;
13971            }
13972            sublen = ctx.fmtpos - nonfmtpos;
13973            maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
13974                                             nonfmtpos, nonfmtpos + sublen);
13975            if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
13976                goto onError;
13977
13978            _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
13979                                          ctx.fmtstr, nonfmtpos, sublen);
13980            ctx.writer.pos += sublen;
13981        }
13982        else {
13983            ctx.fmtpos++;
13984            if (unicode_format_arg(&ctx) == -1)
13985                goto onError;
13986        }
13987    }
13988
13989    if (ctx.argidx < ctx.arglen && !ctx.dict) {
13990        PyErr_SetString(PyExc_TypeError,
13991                        "not all arguments converted during string formatting");
13992        goto onError;
13993    }
13994
13995    if (ctx.args_owned) {
13996        Py_DECREF(ctx.args);
13997    }
13998    Py_DECREF(ctx.fmtstr);
13999    return _PyUnicodeWriter_Finish(&ctx.writer);
14000
14001  onError:
14002    Py_DECREF(ctx.fmtstr);
14003    _PyUnicodeWriter_Dealloc(&ctx.writer);
14004    if (ctx.args_owned) {
14005        Py_DECREF(ctx.args);
14006    }
14007    return NULL;
14008}
14009
14010static PyObject *
14011unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14012
14013static PyObject *
14014unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14015{
14016    PyObject *x = NULL;
14017    static char *kwlist[] = {"object", "encoding", "errors", 0};
14018    char *encoding = NULL;
14019    char *errors = NULL;
14020
14021    if (type != &PyUnicode_Type)
14022        return unicode_subtype_new(type, args, kwds);
14023    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14024                                     kwlist, &x, &encoding, &errors))
14025        return NULL;
14026    if (x == NULL) {
14027        Py_INCREF(unicode_empty);
14028        return unicode_empty;
14029    }
14030    if (encoding == NULL && errors == NULL)
14031        return PyObject_Str(x);
14032    else
14033        return PyUnicode_FromEncodedObject(x, encoding, errors);
14034}
14035
14036static PyObject *
14037unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14038{
14039    PyObject *unicode, *self;
14040    Py_ssize_t length, char_size;
14041    int share_wstr, share_utf8;
14042    unsigned int kind;
14043    void *data;
14044
14045    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14046
14047    unicode = unicode_new(&PyUnicode_Type, args, kwds);
14048    if (unicode == NULL)
14049        return NULL;
14050    assert(_PyUnicode_CHECK(unicode));
14051    if (PyUnicode_READY(unicode) == -1) {
14052        Py_DECREF(unicode);
14053        return NULL;
14054    }
14055
14056    self = type->tp_alloc(type, 0);
14057    if (self == NULL) {
14058        Py_DECREF(unicode);
14059        return NULL;
14060    }
14061    kind = PyUnicode_KIND(unicode);
14062    length = PyUnicode_GET_LENGTH(unicode);
14063
14064    _PyUnicode_LENGTH(self) = length;
14065#ifdef Py_DEBUG
14066    _PyUnicode_HASH(self) = -1;
14067#else
14068    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14069#endif
14070    _PyUnicode_STATE(self).interned = 0;
14071    _PyUnicode_STATE(self).kind = kind;
14072    _PyUnicode_STATE(self).compact = 0;
14073    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14074    _PyUnicode_STATE(self).ready = 1;
14075    _PyUnicode_WSTR(self) = NULL;
14076    _PyUnicode_UTF8_LENGTH(self) = 0;
14077    _PyUnicode_UTF8(self) = NULL;
14078    _PyUnicode_WSTR_LENGTH(self) = 0;
14079    _PyUnicode_DATA_ANY(self) = NULL;
14080
14081    share_utf8 = 0;
14082    share_wstr = 0;
14083    if (kind == PyUnicode_1BYTE_KIND) {
14084        char_size = 1;
14085        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14086            share_utf8 = 1;
14087    }
14088    else if (kind == PyUnicode_2BYTE_KIND) {
14089        char_size = 2;
14090        if (sizeof(wchar_t) == 2)
14091            share_wstr = 1;
14092    }
14093    else {
14094        assert(kind == PyUnicode_4BYTE_KIND);
14095        char_size = 4;
14096        if (sizeof(wchar_t) == 4)
14097            share_wstr = 1;
14098    }
14099
14100    /* Ensure we won't overflow the length. */
14101    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14102        PyErr_NoMemory();
14103        goto onError;
14104    }
14105    data = PyObject_MALLOC((length + 1) * char_size);
14106    if (data == NULL) {
14107        PyErr_NoMemory();
14108        goto onError;
14109    }
14110
14111    _PyUnicode_DATA_ANY(self) = data;
14112    if (share_utf8) {
14113        _PyUnicode_UTF8_LENGTH(self) = length;
14114        _PyUnicode_UTF8(self) = data;
14115    }
14116    if (share_wstr) {
14117        _PyUnicode_WSTR_LENGTH(self) = length;
14118        _PyUnicode_WSTR(self) = (wchar_t *)data;
14119    }
14120
14121    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14122              kind * (length + 1));
14123    assert(_PyUnicode_CheckConsistency(self, 1));
14124#ifdef Py_DEBUG
14125    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14126#endif
14127    Py_DECREF(unicode);
14128    return self;
14129
14130onError:
14131    Py_DECREF(unicode);
14132    Py_DECREF(self);
14133    return NULL;
14134}
14135
14136PyDoc_STRVAR(unicode_doc,
14137"str(object='') -> str\n\
14138str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14139\n\
14140Create a new string object from the given object. If encoding or\n\
14141errors is specified, then the object must expose a data buffer\n\
14142that will be decoded using the given encoding and error handler.\n\
14143Otherwise, returns the result of object.__str__() (if defined)\n\
14144or repr(object).\n\
14145encoding defaults to sys.getdefaultencoding().\n\
14146errors defaults to 'strict'.");
14147
14148static PyObject *unicode_iter(PyObject *seq);
14149
14150PyTypeObject PyUnicode_Type = {
14151    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14152    "str",              /* tp_name */
14153    sizeof(PyUnicodeObject),        /* tp_size */
14154    0,                  /* tp_itemsize */
14155    /* Slots */
14156    (destructor)unicode_dealloc,    /* tp_dealloc */
14157    0,                  /* tp_print */
14158    0,                  /* tp_getattr */
14159    0,                  /* tp_setattr */
14160    0,                  /* tp_reserved */
14161    unicode_repr,           /* tp_repr */
14162    &unicode_as_number,         /* tp_as_number */
14163    &unicode_as_sequence,       /* tp_as_sequence */
14164    &unicode_as_mapping,        /* tp_as_mapping */
14165    (hashfunc) unicode_hash,        /* tp_hash*/
14166    0,                  /* tp_call*/
14167    (reprfunc) unicode_str,     /* tp_str */
14168    PyObject_GenericGetAttr,        /* tp_getattro */
14169    0,                  /* tp_setattro */
14170    0,                  /* tp_as_buffer */
14171    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14172    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
14173    unicode_doc,            /* tp_doc */
14174    0,                  /* tp_traverse */
14175    0,                  /* tp_clear */
14176    PyUnicode_RichCompare,      /* tp_richcompare */
14177    0,                  /* tp_weaklistoffset */
14178    unicode_iter,           /* tp_iter */
14179    0,                  /* tp_iternext */
14180    unicode_methods,            /* tp_methods */
14181    0,                  /* tp_members */
14182    0,                  /* tp_getset */
14183    &PyBaseObject_Type,         /* tp_base */
14184    0,                  /* tp_dict */
14185    0,                  /* tp_descr_get */
14186    0,                  /* tp_descr_set */
14187    0,                  /* tp_dictoffset */
14188    0,                  /* tp_init */
14189    0,                  /* tp_alloc */
14190    unicode_new,            /* tp_new */
14191    PyObject_Del,           /* tp_free */
14192};
14193
14194/* Initialize the Unicode implementation */
14195
14196int _PyUnicode_Init(void)
14197{
14198    int i;
14199
14200    /* XXX - move this array to unicodectype.c ? */
14201    Py_UCS2 linebreak[] = {
14202        0x000A, /* LINE FEED */
14203        0x000D, /* CARRIAGE RETURN */
14204        0x001C, /* FILE SEPARATOR */
14205        0x001D, /* GROUP SEPARATOR */
14206        0x001E, /* RECORD SEPARATOR */
14207        0x0085, /* NEXT LINE */
14208        0x2028, /* LINE SEPARATOR */
14209        0x2029, /* PARAGRAPH SEPARATOR */
14210    };
14211
14212    /* Init the implementation */
14213    unicode_empty = PyUnicode_New(0, 0);
14214    if (!unicode_empty)
14215        Py_FatalError("Can't create empty string");
14216    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
14217
14218    for (i = 0; i < 256; i++)
14219        unicode_latin1[i] = NULL;
14220    if (PyType_Ready(&PyUnicode_Type) < 0)
14221        Py_FatalError("Can't initialize 'unicode'");
14222
14223    /* initialize the linebreak bloom filter */
14224    bloom_linebreak = make_bloom_mask(
14225        PyUnicode_2BYTE_KIND, linebreak,
14226        Py_ARRAY_LENGTH(linebreak));
14227
14228    PyType_Ready(&EncodingMapType);
14229
14230#ifdef HAVE_MBCS
14231    winver.dwOSVersionInfoSize = sizeof(winver);
14232    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14233        PyErr_SetFromWindowsErr(0);
14234        return -1;
14235    }
14236#endif
14237    return 0;
14238}
14239
14240/* Finalize the Unicode implementation */
14241
/* Nothing is released here; the function always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14247
14248void
14249_PyUnicode_Fini(void)
14250{
14251    int i;
14252
14253    Py_XDECREF(unicode_empty);
14254    unicode_empty = NULL;
14255
14256    for (i = 0; i < 256; i++) {
14257        if (unicode_latin1[i]) {
14258            Py_DECREF(unicode_latin1[i]);
14259            unicode_latin1[i] = NULL;
14260        }
14261    }
14262    _PyUnicode_ClearStaticStrings();
14263    (void)PyUnicode_ClearFreeList();
14264}
14265
14266void
14267PyUnicode_InternInPlace(PyObject **p)
14268{
14269    register PyObject *s = *p;
14270    PyObject *t;
14271#ifdef Py_DEBUG
14272    assert(s != NULL);
14273    assert(_PyUnicode_CHECK(s));
14274#else
14275    if (s == NULL || !PyUnicode_Check(s))
14276        return;
14277#endif
14278    /* If it's a subclass, we don't really know what putting
14279       it in the interned dict might do. */
14280    if (!PyUnicode_CheckExact(s))
14281        return;
14282    if (PyUnicode_CHECK_INTERNED(s))
14283        return;
14284    if (interned == NULL) {
14285        interned = PyDict_New();
14286        if (interned == NULL) {
14287            PyErr_Clear(); /* Don't leave an exception */
14288            return;
14289        }
14290    }
14291    /* It might be that the GetItem call fails even
14292       though the key is present in the dictionary,
14293       namely when this happens during a stack overflow. */
14294    Py_ALLOW_RECURSION
14295    t = PyDict_GetItem(interned, s);
14296    Py_END_ALLOW_RECURSION
14297
14298        if (t) {
14299            Py_INCREF(t);
14300            Py_DECREF(*p);
14301            *p = t;
14302            return;
14303        }
14304
14305    PyThreadState_GET()->recursion_critical = 1;
14306    if (PyDict_SetItem(interned, s, s) < 0) {
14307        PyErr_Clear();
14308        PyThreadState_GET()->recursion_critical = 0;
14309        return;
14310    }
14311    PyThreadState_GET()->recursion_critical = 0;
14312    /* The two references in interned are not counted by refcnt.
14313       The deallocator will take care of this */
14314    Py_REFCNT(s) -= 2;
14315    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
14316}
14317
14318void
14319PyUnicode_InternImmortal(PyObject **p)
14320{
14321    PyUnicode_InternInPlace(p);
14322    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
14323        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
14324        Py_INCREF(*p);
14325    }
14326}
14327
14328PyObject *
14329PyUnicode_InternFromString(const char *cp)
14330{
14331    PyObject *s = PyUnicode_FromString(cp);
14332    if (s == NULL)
14333        return NULL;
14334    PyUnicode_InternInPlace(&s);
14335    return s;
14336}
14337
14338void
14339_Py_ReleaseInternedUnicodeStrings(void)
14340{
14341    PyObject *keys;
14342    PyObject *s;
14343    Py_ssize_t i, n;
14344    Py_ssize_t immortal_size = 0, mortal_size = 0;
14345
14346    if (interned == NULL || !PyDict_Check(interned))
14347        return;
14348    keys = PyDict_Keys(interned);
14349    if (keys == NULL || !PyList_Check(keys)) {
14350        PyErr_Clear();
14351        return;
14352    }
14353
14354    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14355       detector, interned unicode strings are not forcibly deallocated;
14356       rather, we give them their stolen references back, and then clear
14357       and DECREF the interned dict. */
14358
14359    n = PyList_GET_SIZE(keys);
14360    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
14361            n);
14362    for (i = 0; i < n; i++) {
14363        s = PyList_GET_ITEM(keys, i);
14364        if (PyUnicode_READY(s) == -1) {
14365            assert(0 && "could not ready string");
14366            fprintf(stderr, "could not ready string\n");
14367        }
14368        switch (PyUnicode_CHECK_INTERNED(s)) {
14369        case SSTATE_NOT_INTERNED:
14370            /* XXX Shouldn't happen */
14371            break;
14372        case SSTATE_INTERNED_IMMORTAL:
14373            Py_REFCNT(s) += 1;
14374            immortal_size += PyUnicode_GET_LENGTH(s);
14375            break;
14376        case SSTATE_INTERNED_MORTAL:
14377            Py_REFCNT(s) += 2;
14378            mortal_size += PyUnicode_GET_LENGTH(s);
14379            break;
14380        default:
14381            Py_FatalError("Inconsistent interned string state.");
14382        }
14383        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
14384    }
14385    fprintf(stderr, "total size of all interned strings: "
14386            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14387            "mortal/immortal\n", mortal_size, immortal_size);
14388    Py_DECREF(keys);
14389    PyDict_Clear(interned);
14390    Py_DECREF(interned);
14391    interned = NULL;
14392}
14393
14394
14395/********************* Unicode Iterator **************************/
14396
14397typedef struct {
14398    PyObject_HEAD
14399    Py_ssize_t it_index;
14400    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14401} unicodeiterobject;
14402
14403static void
14404unicodeiter_dealloc(unicodeiterobject *it)
14405{
14406    _PyObject_GC_UNTRACK(it);
14407    Py_XDECREF(it->it_seq);
14408    PyObject_GC_Del(it);
14409}
14410
14411static int
14412unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14413{
14414    Py_VISIT(it->it_seq);
14415    return 0;
14416}
14417
14418static PyObject *
14419unicodeiter_next(unicodeiterobject *it)
14420{
14421    PyObject *seq, *item;
14422
14423    assert(it != NULL);
14424    seq = it->it_seq;
14425    if (seq == NULL)
14426        return NULL;
14427    assert(_PyUnicode_CHECK(seq));
14428
14429    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14430        int kind = PyUnicode_KIND(seq);
14431        void *data = PyUnicode_DATA(seq);
14432        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14433        item = PyUnicode_FromOrdinal(chr);
14434        if (item != NULL)
14435            ++it->it_index;
14436        return item;
14437    }
14438
14439    Py_DECREF(seq);
14440    it->it_seq = NULL;
14441    return NULL;
14442}
14443
14444static PyObject *
14445unicodeiter_len(unicodeiterobject *it)
14446{
14447    Py_ssize_t len = 0;
14448    if (it->it_seq)
14449        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14450    return PyLong_FromSsize_t(len);
14451}
14452
14453PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14454
14455static PyObject *
14456unicodeiter_reduce(unicodeiterobject *it)
14457{
14458    if (it->it_seq != NULL) {
14459        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
14460                             it->it_seq, it->it_index);
14461    } else {
14462        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14463        if (u == NULL)
14464            return NULL;
14465        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
14466    }
14467}
14468
14469PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14470
14471static PyObject *
14472unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14473{
14474    Py_ssize_t index = PyLong_AsSsize_t(state);
14475    if (index == -1 && PyErr_Occurred())
14476        return NULL;
14477    if (index < 0)
14478        index = 0;
14479    it->it_index = index;
14480    Py_RETURN_NONE;
14481}
14482
14483PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14484
14485static PyMethodDef unicodeiter_methods[] = {
14486    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
14487     length_hint_doc},
14488    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14489     reduce_doc},
14490    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
14491     setstate_doc},
14492    {NULL,      NULL}       /* sentinel */
14493};
14494
14495PyTypeObject PyUnicodeIter_Type = {
14496    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14497    "str_iterator",         /* tp_name */
14498    sizeof(unicodeiterobject),      /* tp_basicsize */
14499    0,                  /* tp_itemsize */
14500    /* methods */
14501    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
14502    0,                  /* tp_print */
14503    0,                  /* tp_getattr */
14504    0,                  /* tp_setattr */
14505    0,                  /* tp_reserved */
14506    0,                  /* tp_repr */
14507    0,                  /* tp_as_number */
14508    0,                  /* tp_as_sequence */
14509    0,                  /* tp_as_mapping */
14510    0,                  /* tp_hash */
14511    0,                  /* tp_call */
14512    0,                  /* tp_str */
14513    PyObject_GenericGetAttr,        /* tp_getattro */
14514    0,                  /* tp_setattro */
14515    0,                  /* tp_as_buffer */
14516    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14517    0,                  /* tp_doc */
14518    (traverseproc)unicodeiter_traverse, /* tp_traverse */
14519    0,                  /* tp_clear */
14520    0,                  /* tp_richcompare */
14521    0,                  /* tp_weaklistoffset */
14522    PyObject_SelfIter,          /* tp_iter */
14523    (iternextfunc)unicodeiter_next,     /* tp_iternext */
14524    unicodeiter_methods,            /* tp_methods */
14525    0,
14526};
14527
14528static PyObject *
14529unicode_iter(PyObject *seq)
14530{
14531    unicodeiterobject *it;
14532
14533    if (!PyUnicode_Check(seq)) {
14534        PyErr_BadInternalCall();
14535        return NULL;
14536    }
14537    if (PyUnicode_READY(seq) == -1)
14538        return NULL;
14539    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14540    if (it == NULL)
14541        return NULL;
14542    it->it_index = 0;
14543    Py_INCREF(seq);
14544    it->it_seq = seq;
14545    _PyObject_GC_TRACK(it);
14546    return (PyObject *)it;
14547}
14548
14549
/* Return the number of elements that precede the terminating zero in u.
   The count is kept in a size_t, matching the return type, so it cannot
   overflow the way an int counter would on very long inputs. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = 0;

    while (u[length] != 0)
        length++;
    return length;
}
14558
/* Copy s2, including its terminating zero, into s1 and return s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    do {
        *dst = *s2++;
    } while (*dst++ != 0);
    return s1;
}
14566
/* Copy at most n elements from s2 into s1 and return s1.

   Copying stops after the terminating zero has been copied or after n
   elements have been written, whichever comes first.  Nothing is written
   when n is 0, and s1 is not zero-terminated when s2 has n or more
   elements before its terminator.  Unlike C's strncpy(), the rest of s1
   is not padded with zeros.

   The previous loop tested the counter only after storing an element and
   therefore wrote up to n + 1 elements. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *dst = s1;

    while (n > 0) {
        n--;
        if ((*dst++ = *s2++) == 0)
            break;
    }
    return s1;
}
14576
/* Append s2 (with its terminating zero) at the end of s1 and return s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* The copy starts at the current terminator of s1. */
    Py_UNICODE_strcpy(s1 + Py_UNICODE_strlen(s1), s2);
    return s1;
}
14585
/* Compare two zero-terminated Py_UNICODE strings.

   Returns -1, 0 or +1.  When one string is a proper prefix of the other,
   the shorter string is the smaller one. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0)
        return (*s2 == 0) ? 0 : -1;
    if (*s2 == 0)
        return 1;
    /* Both strings continue and differ at this position. */
    return (*s1 < *s2) ? -1 : +1;
}
14599
/* Compare at most n elements of two zero-terminated Py_UNICODE strings.

   Returns -1 or +1 at the first position where the strings differ, and 0
   when they agree up to a terminating zero or over the first n elements. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t pos;

    for (pos = 0; pos < n; pos++) {
        const Py_UNICODE left = s1[pos];
        const Py_UNICODE right = s2[pos];

        if (left != right)
            return (left < right) ? -1 : +1;
        if (left == '\0')
            return 0;
    }
    return 0;
}
14616
/* Return a pointer to the first occurrence of c in s, or NULL when c does
   not occur before the terminating zero.  The terminator itself is never
   matched. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14626
/* Return a pointer to the last occurrence of c in s, or NULL when c does
   not occur before the terminating zero. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor;

    /* Walk backwards from the terminator towards the start. */
    for (cursor = s + Py_UNICODE_strlen(s); cursor != s; ) {
        --cursor;
        if (*cursor == c)
            return (Py_UNICODE*)cursor;
    }
    return NULL;
}
14639
14640Py_UNICODE*
14641PyUnicode_AsUnicodeCopy(PyObject *unicode)
14642{
14643    Py_UNICODE *u, *copy;
14644    Py_ssize_t len, size;
14645
14646    if (!PyUnicode_Check(unicode)) {
14647        PyErr_BadArgument();
14648        return NULL;
14649    }
14650    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14651    if (u == NULL)
14652        return NULL;
14653    /* Ensure we won't overflow the size. */
14654    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14655        PyErr_NoMemory();
14656        return NULL;
14657    }
14658    size = len + 1; /* copy the null character */
14659    size *= sizeof(Py_UNICODE);
14660    copy = PyMem_Malloc(size);
14661    if (copy == NULL) {
14662        PyErr_NoMemory();
14663        return NULL;
14664    }
14665    memcpy(copy, u, size);
14666    return copy;
14667}
14668
14669/* A _string module, to export formatter_parser and formatter_field_name_split
14670   to the string.Formatter class implemented in Python. */
14671
14672static PyMethodDef _string_methods[] = {
14673    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14674     METH_O, PyDoc_STR("split the argument as a field name")},
14675    {"formatter_parser", (PyCFunction) formatter_parser,
14676     METH_O, PyDoc_STR("parse the argument as a format string")},
14677    {NULL, NULL}
14678};
14679
14680static struct PyModuleDef _string_module = {
14681    PyModuleDef_HEAD_INIT,
14682    "_string",
14683    PyDoc_STR("string helper module"),
14684    0,
14685    _string_methods,
14686    NULL,
14687    NULL,
14688    NULL,
14689    NULL
14690};
14691
14692PyMODINIT_FUNC
14693PyInit__string(void)
14694{
14695    return PyModule_Create(&_string_module);
14696}
14697
14698
14699#ifdef __cplusplus
14700}
14701#endif
14702