unicodeobject.c revision 1f7951711c16cca5f041288b81d01cb3021d0b7e
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44
45#ifdef MS_WINDOWS
46#include <windows.h>
47#endif
48
49#ifdef Py_DEBUG
50#  define DONT_MAKE_RESULT_READY
51#endif
52
53/* Endianness switches; defaults to little endian */
54
55#ifdef WORDS_BIGENDIAN
56# define BYTEORDER_IS_BIG_ENDIAN
57#else
58# define BYTEORDER_IS_LITTLE_ENDIAN
59#endif
60
61/* --- Globals ------------------------------------------------------------
62
63   The globals are initialized by the _PyUnicode_Init() API and should
64   not be used before calling that API.
65
66*/
67
68
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Sanity check used inside assert() by the accessor macros in this file.
   In debug builds it runs the full structural consistency check (without
   scanning the characters, since check_content is 0); in release builds
   it only verifies the object type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
78
/* Field accessors for the Unicode object structs.

   The macros prefixed with an underscore are raw field accesses: they
   expand to an lvalue (so they can be assigned to) and, unless noted,
   perform no checks.  The un-prefixed PyUnicode_UTF8 /
   PyUnicode_UTF8_LENGTH variants assert that the object is a ready
   Unicode object and special-case compact ASCII strings. */

/* Raw 'utf8' field of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   character data stored right after the PyASCIIObject header, otherwise
   the 'utf8' field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw 'utf8_length' field of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 buffer of a ready string: for compact ASCII strings
   this is the string length itself, otherwise the 'utf8_length' field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw 'wstr' field (wchar_t representation) of the PyASCIIObject header. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw 'wstr_length' field of the PyCompactUnicodeObject header. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw 'length' field; no checks, usable as an lvalue. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw 'state' bit-field struct (interned, kind, compact, ascii, ready). */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw cached hash field; -1 is stored when no hash is cached
   (see _PyUnicode_DIRTY). */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Kind stored in the state, checked with _PyUnicode_CHECK only (unlike
   PyUnicode_UTF8 above, there is no readiness assertion here). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Length field, checked with _PyUnicode_CHECK only. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw 'data.any' pointer of a PyUnicodeObject, i.e. the separately
   referenced character buffer used by non-compact objects. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
113
/* File-local replacement for the public PyUnicode_READY macro: it asserts
   with _PyUnicode_CHECK (the full consistency check in debug builds).
   Evaluates to 0 if the string is already ready, otherwise to the
   result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
120
/* Make the string referenced through p_obj (a pointer to an object
   pointer) ready.  Evaluates to 0 if *p_obj is already ready, otherwise
   to the result of _PyUnicode_ReadyReplace(), which receives the pointer
   itself.
   The argument is parenthesized before being dereferenced so that
   expressions such as "array + 1" select the intended element. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
125
/* true if the 'utf8' pointer of the Unicode object is the very same
   pointer as its character data (PyUnicode_DATA), i.e. the UTF-8
   representation is not a separate buffer.  Must not be used on compact
   ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the 'wstr' pointer of the Unicode object is the very same
   pointer as its character data (PyUnicode_DATA), i.e. the wchar_t
   representation is not a separate buffer.
   Both sides of the comparison use the macro argument; the macro does not
   depend on the name of any variable at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its
   'utf8' pointer is set, and that pointer differs from the character
   data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the 'wstr' pointer is set and either the
   string is not ready yet (PyUnicode_DATA is not consulted in that case)
   or 'wstr' differs from the character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source character is converted with a plain cast, so values that
   do not fit into to_type are truncated; callers are expected to pick a
   wide enough to_type.  The main loop is unrolled four characters at a
   time and a tail loop copies the remaining 0-3 characters.

   Every argument is evaluated exactly once and is parenthesized before
   use, so "to" may be an arbitrary pointer expression (e.g. "buf + 1");
   all helper locals carry a leading underscore to stay out of the
   caller's namespace. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
173
/* The Unicode string has been modified: reset the hash.
   Stores -1 in the hash field, the value used throughout this file for
   "no hash cached". */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
176
177/* This dictionary holds all interned unicode strings.  Note that references
178   to strings in this dictionary are *not* counted in the string's ob_refcnt.
179   When the interned string reaches a refcnt of 0 the string deallocation
180   function will delete the reference from this dictionary.
181
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
184*/
185static PyObject *interned;
186
187/* The empty Unicode object is shared to improve performance. */
188static PyObject *unicode_empty;
189
190/* List of static strings. */
191static _Py_Identifier *static_strings;
192
193/* Single character Unicode strings in the Latin-1 range are being
194   shared as well. */
195static PyObject *unicode_latin1[256];
196
/* Fast detection of the most frequent whitespace characters.
   Lookup table with 128 entries, one per code point in the ASCII range
   (0x00-0x7F); an entry is 1 if that code point is treated as
   whitespace and 0 otherwise.  The comments below name the code points
   whose entries are set. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
227
228/* forward */
229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
230static PyObject* get_latin1_char(unsigned char ch);
231static void copy_characters(
232    PyObject *to, Py_ssize_t to_start,
233    PyObject *from, Py_ssize_t from_start,
234    Py_ssize_t how_many);
235#ifdef Py_DEBUG
236static int unicode_is_singleton(PyObject *unicode);
237#endif
238
239static PyObject *
240unicode_fromascii(const unsigned char *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
243static PyObject *
244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
245static PyObject *
246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
247
248static PyObject *
249unicode_encode_call_errorhandler(const char *errors,
250       PyObject **errorHandler,const char *encoding, const char *reason,
251       PyObject *unicode, PyObject **exceptionObject,
252       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
254static void
255raise_encode_exception(PyObject **exceptionObject,
256                       const char *encoding,
257                       PyObject *unicode,
258                       Py_ssize_t startpos, Py_ssize_t endpos,
259                       const char *reason);
260
/* Fast detection of line break characters in the ASCII range.
   Lookup table with 128 entries, one per code point 0x00-0x7F; an entry
   is 1 if that code point is a line break and 0 otherwise.  It is
   consulted by BLOOM_LINEBREAK() for characters below 128.  The comments
   below name the code points whose entries are set. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
288
289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290   This function is kept for backward compatibility with the old API. */
291Py_UNICODE
292PyUnicode_GetMax(void)
293{
294#ifdef Py_UNICODE_WIDE
295    return 0x10FFFF;
296#else
297    /* This is actually an illegal character, so it should
298       not be passed to unichr. */
299    return 0xFFFF;
300#endif
301}
302
303#ifdef Py_DEBUG
/* Debug-only structural self-check of a Unicode object.

   op            - object to check; must be a Unicode object (asserted).
   check_content - if non-zero, additionally scan the characters to verify
                   that the narrowest possible kind is used, and assert
                   that no hash is cached unless the object is one of the
                   shared singletons.

   Every violated invariant trips an assert(); if the function returns at
   all it returns 1, which lets callers write
   assert(_PyUnicode_CheckConsistency(op, ...)). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Case 1: compact ASCII -- only the PyASCIIObject header is inspected. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        /* Case 2: compact non-ASCII -- the character data directly follows
           the PyCompactUnicodeObject header. */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            /* Case 3: legacy string that is not ready yet -- only the wstr
               buffer exists and every other field is in its initial state. */
            if (kind == PyUnicode_WCHAR_KIND) {
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            /* Case 4: legacy string that is ready -- it has a separate
               data buffer; utf8 aliases that buffer exactly when the
               string is ASCII. */
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* For ready strings, wstr must alias the data buffer exactly when
           the kind has the same width as wchar_t. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cached representation must have a zero length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        /* Find the largest character actually stored in the string. */
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND)
            assert(maxchar >= 0x100);
        else
            assert(maxchar >= 0x10000);
    }
    /* With content checking enabled, only singletons may carry a hash. */
    if (check_content && !unicode_is_singleton(op))
        assert(ascii->hash == -1);
    return 1;
}
409#endif
410
#ifdef HAVE_MBCS
/* Windows version information (OSVERSIONINFOEX), only declared when
   HAVE_MBCS is defined.
   NOTE(review): presumably filled in during Unicode initialization and
   consulted by the MBCS codec code -- confirm against the rest of the
   file. */
static OSVERSIONINFOEX winver;
#endif
414
415/* --- Bloom Filters ----------------------------------------------------- */
416
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on the width of a long)
   of each unicode character as the bit index. */
420
421/* the linebreak mask is set up by Unicode_Init below */
422
/* Number of bits in the bloom mask: the largest of 128, 64 or 32 that
   does not exceed the width of a long (LONG_BIT).  Platforms with a long
   narrower than 32 bits are rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit of 'mask' selected by the low
   bits of 'ch' (ch modulo BLOOM_WIDTH).  'mask' must be a plain lvalue of
   type BLOOM_MASK.  A zero result from BLOOM means the character was
   never added; a non-zero result means it may have been added (distinct
   characters can map to the same bit). */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero if 'ch' is a line break: ASCII characters are answered by the
   ascii_linebreak table; other characters are first screened with the
   bloom_linebreak mask and only then checked with the exact
   Py_UNICODE_ISLINEBREAK() test.  'ch' is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
443
444Py_LOCAL_INLINE(BLOOM_MASK)
445make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
446{
447    /* calculate simple bloom-style bitmask for a given unicode string */
448
449    BLOOM_MASK mask;
450    Py_ssize_t i;
451
452    mask = 0;
453    for (i = 0; i < len; i++)
454        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
455
456    return mask;
457}
458
/* Non-zero if character 'chr' occurs in the Unicode object 'str'.
   'mask' is a bloom mask to screen with (see make_bloom_mask): when the
   bit for 'chr' is not set the search is skipped entirely, otherwise the
   whole string is searched with PyUnicode_FindChar(). */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
462
463/* Compilation of templated routines */
464
465#include "stringlib/asciilib.h"
466#include "stringlib/fastsearch.h"
467#include "stringlib/partition.h"
468#include "stringlib/split.h"
469#include "stringlib/count.h"
470#include "stringlib/find.h"
471#include "stringlib/find_max_char.h"
472#include "stringlib/localeutil.h"
473#include "stringlib/undef.h"
474
475#include "stringlib/ucs1lib.h"
476#include "stringlib/fastsearch.h"
477#include "stringlib/partition.h"
478#include "stringlib/split.h"
479#include "stringlib/count.h"
480#include "stringlib/find.h"
481#include "stringlib/find_max_char.h"
482#include "stringlib/localeutil.h"
483#include "stringlib/undef.h"
484
485#include "stringlib/ucs2lib.h"
486#include "stringlib/fastsearch.h"
487#include "stringlib/partition.h"
488#include "stringlib/split.h"
489#include "stringlib/count.h"
490#include "stringlib/find.h"
491#include "stringlib/find_max_char.h"
492#include "stringlib/localeutil.h"
493#include "stringlib/undef.h"
494
495#include "stringlib/ucs4lib.h"
496#include "stringlib/fastsearch.h"
497#include "stringlib/partition.h"
498#include "stringlib/split.h"
499#include "stringlib/count.h"
500#include "stringlib/find.h"
501#include "stringlib/find_max_char.h"
502#include "stringlib/localeutil.h"
503#include "stringlib/undef.h"
504
505#include "stringlib/unicodedefs.h"
506#include "stringlib/fastsearch.h"
507#include "stringlib/count.h"
508#include "stringlib/find.h"
509
510/* --- Unicode Object ----------------------------------------------------- */
511
512static PyObject *
513fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
514
515Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
516                                     Py_ssize_t size, Py_UCS4 ch,
517                                     int direction)
518{
519    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
520
521    switch (kind) {
522    case PyUnicode_1BYTE_KIND:
523        {
524            Py_UCS1 ch1 = (Py_UCS1) ch;
525            if (ch1 == ch)
526                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
527            else
528                return -1;
529        }
530    case PyUnicode_2BYTE_KIND:
531        {
532            Py_UCS2 ch2 = (Py_UCS2) ch;
533            if (ch2 == ch)
534                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
535            else
536                return -1;
537        }
538    case PyUnicode_4BYTE_KIND:
539        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
540    default:
541        assert(0);
542        return -1;
543    }
544}
545
546static PyObject*
547resize_compact(PyObject *unicode, Py_ssize_t length)
548{
549    Py_ssize_t char_size;
550    Py_ssize_t struct_size;
551    Py_ssize_t new_size;
552    int share_wstr;
553
554    assert(PyUnicode_IS_READY(unicode));
555    char_size = PyUnicode_KIND(unicode);
556    if (PyUnicode_IS_COMPACT_ASCII(unicode))
557        struct_size = sizeof(PyASCIIObject);
558    else
559        struct_size = sizeof(PyCompactUnicodeObject);
560    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
561
562    _Py_DEC_REFTOTAL;
563    _Py_ForgetReference(unicode);
564
565    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
566        PyErr_NoMemory();
567        return NULL;
568    }
569    new_size = (struct_size + (length + 1) * char_size);
570
571    unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
572    if (unicode == NULL) {
573        PyObject_Del(unicode);
574        PyErr_NoMemory();
575        return NULL;
576    }
577    _Py_NewReference(unicode);
578    _PyUnicode_LENGTH(unicode) = length;
579    if (share_wstr) {
580        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
581        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
582            _PyUnicode_WSTR_LENGTH(unicode) = length;
583    }
584    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
585                    length, 0);
586    return unicode;
587}
588
589static int
590resize_inplace(PyObject *unicode, Py_ssize_t length)
591{
592    wchar_t *wstr;
593    assert(!PyUnicode_IS_COMPACT(unicode));
594    assert(Py_REFCNT(unicode) == 1);
595
596    _PyUnicode_DIRTY(unicode);
597
598    if (PyUnicode_IS_READY(unicode)) {
599        Py_ssize_t char_size;
600        Py_ssize_t new_size;
601        int share_wstr, share_utf8;
602        void *data;
603
604        data = _PyUnicode_DATA_ANY(unicode);
605        assert(data != NULL);
606        char_size = PyUnicode_KIND(unicode);
607        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
608        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
609        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
610        {
611            PyObject_DEL(_PyUnicode_UTF8(unicode));
612            _PyUnicode_UTF8(unicode) = NULL;
613            _PyUnicode_UTF8_LENGTH(unicode) = 0;
614        }
615
616        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
617            PyErr_NoMemory();
618            return -1;
619        }
620        new_size = (length + 1) * char_size;
621
622        data = (PyObject *)PyObject_REALLOC(data, new_size);
623        if (data == NULL) {
624            PyErr_NoMemory();
625            return -1;
626        }
627        _PyUnicode_DATA_ANY(unicode) = data;
628        if (share_wstr) {
629            _PyUnicode_WSTR(unicode) = data;
630            _PyUnicode_WSTR_LENGTH(unicode) = length;
631        }
632        if (share_utf8) {
633            _PyUnicode_UTF8(unicode) = data;
634            _PyUnicode_UTF8_LENGTH(unicode) = length;
635        }
636        _PyUnicode_LENGTH(unicode) = length;
637        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
638        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
639            assert(_PyUnicode_CheckConsistency(unicode, 0));
640            return 0;
641        }
642    }
643    assert(_PyUnicode_WSTR(unicode) != NULL);
644
645    /* check for integer overflow */
646    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
647        PyErr_NoMemory();
648        return -1;
649    }
650    wstr =  _PyUnicode_WSTR(unicode);
651    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
652    if (!wstr) {
653        PyErr_NoMemory();
654        return -1;
655    }
656    _PyUnicode_WSTR(unicode) = wstr;
657    _PyUnicode_WSTR(unicode)[length] = 0;
658    _PyUnicode_WSTR_LENGTH(unicode) = length;
659    assert(_PyUnicode_CheckConsistency(unicode, 0));
660    return 0;
661}
662
663static PyObject*
664resize_copy(PyObject *unicode, Py_ssize_t length)
665{
666    Py_ssize_t copy_length;
667    if (PyUnicode_IS_COMPACT(unicode)) {
668        PyObject *copy;
669        assert(PyUnicode_IS_READY(unicode));
670
671        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
672        if (copy == NULL)
673            return NULL;
674
675        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
676        copy_characters(copy, 0, unicode, 0, copy_length);
677        return copy;
678    }
679    else {
680        PyObject *w;
681        assert(_PyUnicode_WSTR(unicode) != NULL);
682        assert(_PyUnicode_DATA_ANY(unicode) == NULL);
683        w = (PyObject*)_PyUnicode_New(length);
684        if (w == NULL)
685            return NULL;
686        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
687        copy_length = Py_MIN(copy_length, length);
688        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
689                        copy_length);
690        return w;
691    }
692}
693
/* We allocate one more character to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
697
698   XXX This allocator could further be enhanced by assuring that the
699   free list never reduces its size below 1.
700
701*/
702
703#ifdef Py_DEBUG
704static int unicode_old_new_calls = 0;
705#endif
706
707static PyUnicodeObject *
708_PyUnicode_New(Py_ssize_t length)
709{
710    register PyUnicodeObject *unicode;
711    size_t new_size;
712
713    /* Optimization for empty strings */
714    if (length == 0 && unicode_empty != NULL) {
715        Py_INCREF(unicode_empty);
716        return (PyUnicodeObject*)unicode_empty;
717    }
718
719    /* Ensure we won't overflow the size. */
720    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
721        return (PyUnicodeObject *)PyErr_NoMemory();
722    }
723    if (length < 0) {
724        PyErr_SetString(PyExc_SystemError,
725                        "Negative size passed to _PyUnicode_New");
726        return NULL;
727    }
728
729#ifdef Py_DEBUG
730    ++unicode_old_new_calls;
731#endif
732
733    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
734    if (unicode == NULL)
735        return NULL;
736    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
737    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
738    if (!_PyUnicode_WSTR(unicode)) {
739        PyErr_NoMemory();
740        goto onError;
741    }
742
743    /* Initialize the first element to guard against cases where
744     * the caller fails before initializing str -- unicode_resize()
745     * reads str[0], and the Keep-Alive optimization can keep memory
746     * allocated for str alive across a call to unicode_dealloc(unicode).
747     * We don't want unicode_resize to read uninitialized memory in
748     * that case.
749     */
750    _PyUnicode_WSTR(unicode)[0] = 0;
751    _PyUnicode_WSTR(unicode)[length] = 0;
752    _PyUnicode_WSTR_LENGTH(unicode) = length;
753    _PyUnicode_HASH(unicode) = -1;
754    _PyUnicode_STATE(unicode).interned = 0;
755    _PyUnicode_STATE(unicode).kind = 0;
756    _PyUnicode_STATE(unicode).compact = 0;
757    _PyUnicode_STATE(unicode).ready = 0;
758    _PyUnicode_STATE(unicode).ascii = 0;
759    _PyUnicode_DATA_ANY(unicode) = NULL;
760    _PyUnicode_LENGTH(unicode) = 0;
761    _PyUnicode_UTF8(unicode) = NULL;
762    _PyUnicode_UTF8_LENGTH(unicode) = 0;
763    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
764    return unicode;
765
766  onError:
767    /* XXX UNREF/NEWREF interface should be more symmetrical */
768    _Py_DEC_REFTOTAL;
769    _Py_ForgetReference((PyObject *)unicode);
770    PyObject_Del(unicode);
771    return NULL;
772}
773
774static const char*
775unicode_kind_name(PyObject *unicode)
776{
777    /* don't check consistency: unicode_kind_name() is called from
778       _PyUnicode_Dump() */
779    if (!PyUnicode_IS_COMPACT(unicode))
780    {
781        if (!PyUnicode_IS_READY(unicode))
782            return "wstr";
783        switch(PyUnicode_KIND(unicode))
784        {
785        case PyUnicode_1BYTE_KIND:
786            if (PyUnicode_IS_ASCII(unicode))
787                return "legacy ascii";
788            else
789                return "legacy latin1";
790        case PyUnicode_2BYTE_KIND:
791            return "legacy UCS2";
792        case PyUnicode_4BYTE_KIND:
793            return "legacy UCS4";
794        default:
795            return "<legacy invalid kind>";
796        }
797    }
798    assert(PyUnicode_IS_READY(unicode));
799    switch(PyUnicode_KIND(unicode))
800    {
801    case PyUnicode_1BYTE_KIND:
802        if (PyUnicode_IS_ASCII(unicode))
803            return "ascii";
804        else
805            return "latin1";
806    case PyUnicode_2BYTE_KIND:
807        return "UCS2";
808    case PyUnicode_4BYTE_KIND:
809        return "UCS4";
810    default:
811        return "<invalid compact kind>";
812    }
813}
814
815#ifdef Py_DEBUG
816static int unicode_new_new_calls = 0;
817
818/* Functions wrapping macros for use in debugger */
819char *_PyUnicode_utf8(void *unicode){
820    return PyUnicode_UTF8(unicode);
821}
822
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
826void *_PyUnicode_data(void *unicode){
827    printf("obj %p\n", unicode);
828    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
829    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
830    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
831    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
832    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
833    return PyUnicode_DATA(unicode);
834}
835
836void
837_PyUnicode_Dump(PyObject *op)
838{
839    PyASCIIObject *ascii = (PyASCIIObject *)op;
840    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
841    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
842    void *data;
843
844    if (ascii->state.compact)
845    {
846        if (ascii->state.ascii)
847            data = (ascii + 1);
848        else
849            data = (compact + 1);
850    }
851    else
852        data = unicode->data.any;
853    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
854
855    if (ascii->wstr == data)
856        printf("shared ");
857    printf("wstr=%p", ascii->wstr);
858
859    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
860        printf(" (%zu), ", compact->wstr_length);
861        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
862            printf("shared ");
863        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
864    }
865    printf(", data=%p\n", data);
866}
867#endif
868
869PyObject *
870PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
871{
872    PyObject *obj;
873    PyCompactUnicodeObject *unicode;
874    void *data;
875    int kind_state;
876    int is_sharing, is_ascii;
877    Py_ssize_t char_size;
878    Py_ssize_t struct_size;
879
880    /* Optimization for empty strings */
881    if (size == 0 && unicode_empty != NULL) {
882        Py_INCREF(unicode_empty);
883        return unicode_empty;
884    }
885
886#ifdef Py_DEBUG
887    ++unicode_new_new_calls;
888#endif
889
890    is_ascii = 0;
891    is_sharing = 0;
892    struct_size = sizeof(PyCompactUnicodeObject);
893    if (maxchar < 128) {
894        kind_state = PyUnicode_1BYTE_KIND;
895        char_size = 1;
896        is_ascii = 1;
897        struct_size = sizeof(PyASCIIObject);
898    }
899    else if (maxchar < 256) {
900        kind_state = PyUnicode_1BYTE_KIND;
901        char_size = 1;
902    }
903    else if (maxchar < 65536) {
904        kind_state = PyUnicode_2BYTE_KIND;
905        char_size = 2;
906        if (sizeof(wchar_t) == 2)
907            is_sharing = 1;
908    }
909    else {
910        kind_state = PyUnicode_4BYTE_KIND;
911        char_size = 4;
912        if (sizeof(wchar_t) == 4)
913            is_sharing = 1;
914    }
915
916    /* Ensure we won't overflow the size. */
917    if (size < 0) {
918        PyErr_SetString(PyExc_SystemError,
919                        "Negative size passed to PyUnicode_New");
920        return NULL;
921    }
922    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
923        return PyErr_NoMemory();
924
925    /* Duplicated allocation code from _PyObject_New() instead of a call to
926     * PyObject_New() so we are able to allocate space for the object and
927     * it's data buffer.
928     */
929    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
930    if (obj == NULL)
931        return PyErr_NoMemory();
932    obj = PyObject_INIT(obj, &PyUnicode_Type);
933    if (obj == NULL)
934        return NULL;
935
936    unicode = (PyCompactUnicodeObject *)obj;
937    if (is_ascii)
938        data = ((PyASCIIObject*)obj) + 1;
939    else
940        data = unicode + 1;
941    _PyUnicode_LENGTH(unicode) = size;
942    _PyUnicode_HASH(unicode) = -1;
943    _PyUnicode_STATE(unicode).interned = 0;
944    _PyUnicode_STATE(unicode).kind = kind_state;
945    _PyUnicode_STATE(unicode).compact = 1;
946    _PyUnicode_STATE(unicode).ready = 1;
947    _PyUnicode_STATE(unicode).ascii = is_ascii;
948    if (is_ascii) {
949        ((char*)data)[size] = 0;
950        _PyUnicode_WSTR(unicode) = NULL;
951    }
952    else if (kind_state == PyUnicode_1BYTE_KIND) {
953        ((char*)data)[size] = 0;
954        _PyUnicode_WSTR(unicode) = NULL;
955        _PyUnicode_WSTR_LENGTH(unicode) = 0;
956        unicode->utf8 = NULL;
957        unicode->utf8_length = 0;
958        }
959    else {
960        unicode->utf8 = NULL;
961        unicode->utf8_length = 0;
962        if (kind_state == PyUnicode_2BYTE_KIND)
963            ((Py_UCS2*)data)[size] = 0;
964        else /* kind_state == PyUnicode_4BYTE_KIND */
965            ((Py_UCS4*)data)[size] = 0;
966        if (is_sharing) {
967            _PyUnicode_WSTR_LENGTH(unicode) = size;
968            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
969        }
970        else {
971            _PyUnicode_WSTR_LENGTH(unicode) = 0;
972            _PyUnicode_WSTR(unicode) = NULL;
973        }
974    }
975    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
976    return obj;
977}
978
979#if SIZEOF_WCHAR_T == 2
980/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
981   will decode surrogate pairs, the other conversions are implemented as macros
982   for efficiency.
983
984   This function assumes that unicode can hold one more code point than wstr
985   characters for a terminating null character. */
986static void
987unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
988                              PyObject *unicode)
989{
990    const wchar_t *iter;
991    Py_UCS4 *ucs4_out;
992
993    assert(unicode != NULL);
994    assert(_PyUnicode_CHECK(unicode));
995    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
996    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
997
998    for (iter = begin; iter < end; ) {
999        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1000                           _PyUnicode_GET_LENGTH(unicode)));
1001        if (*iter >= 0xD800 && *iter <= 0xDBFF
1002            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1003        {
1004            *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
1005            iter += 2;
1006        }
1007        else {
1008            *ucs4_out++ = *iter;
1009            iter++;
1010        }
1011    }
1012    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1013                        _PyUnicode_GET_LENGTH(unicode)));
1014
1015}
1016#endif
1017
1018static int
1019_PyUnicode_Dirty(PyObject *unicode)
1020{
1021    assert(_PyUnicode_CHECK(unicode));
1022    if (Py_REFCNT(unicode) != 1) {
1023        PyErr_SetString(PyExc_SystemError,
1024                        "Cannot modify a string having more than 1 reference");
1025        return -1;
1026    }
1027    _PyUnicode_DIRTY(unicode);
1028    return 0;
1029}
1030
1031static int
1032_copy_characters(PyObject *to, Py_ssize_t to_start,
1033                 PyObject *from, Py_ssize_t from_start,
1034                 Py_ssize_t how_many, int check_maxchar)
1035{
1036    unsigned int from_kind, to_kind;
1037    void *from_data, *to_data;
1038    int fast;
1039
1040    assert(PyUnicode_Check(from));
1041    assert(PyUnicode_Check(to));
1042    assert(PyUnicode_IS_READY(from));
1043    assert(PyUnicode_IS_READY(to));
1044
1045    assert(PyUnicode_GET_LENGTH(from) >= how_many);
1046    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1047    assert(0 <= how_many);
1048
1049    if (how_many == 0)
1050        return 0;
1051
1052    from_kind = PyUnicode_KIND(from);
1053    from_data = PyUnicode_DATA(from);
1054    to_kind = PyUnicode_KIND(to);
1055    to_data = PyUnicode_DATA(to);
1056
1057#ifdef Py_DEBUG
1058    if (!check_maxchar
1059        && (from_kind > to_kind
1060            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
1061    {
1062        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1063        Py_UCS4 ch;
1064        Py_ssize_t i;
1065        for (i=0; i < how_many; i++) {
1066            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1067            assert(ch <= to_maxchar);
1068        }
1069    }
1070#endif
1071    fast = (from_kind == to_kind);
1072    if (check_maxchar
1073        && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1074    {
1075        /* deny latin1 => ascii */
1076        fast = 0;
1077    }
1078
1079    if (fast) {
1080        Py_MEMCPY((char*)to_data + to_kind * to_start,
1081                  (char*)from_data + from_kind * from_start,
1082                  to_kind * how_many);
1083    }
1084    else if (from_kind == PyUnicode_1BYTE_KIND
1085             && to_kind == PyUnicode_2BYTE_KIND)
1086    {
1087        _PyUnicode_CONVERT_BYTES(
1088            Py_UCS1, Py_UCS2,
1089            PyUnicode_1BYTE_DATA(from) + from_start,
1090            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1091            PyUnicode_2BYTE_DATA(to) + to_start
1092            );
1093    }
1094    else if (from_kind == PyUnicode_1BYTE_KIND
1095             && to_kind == PyUnicode_4BYTE_KIND)
1096    {
1097        _PyUnicode_CONVERT_BYTES(
1098            Py_UCS1, Py_UCS4,
1099            PyUnicode_1BYTE_DATA(from) + from_start,
1100            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1101            PyUnicode_4BYTE_DATA(to) + to_start
1102            );
1103    }
1104    else if (from_kind == PyUnicode_2BYTE_KIND
1105             && to_kind == PyUnicode_4BYTE_KIND)
1106    {
1107        _PyUnicode_CONVERT_BYTES(
1108            Py_UCS2, Py_UCS4,
1109            PyUnicode_2BYTE_DATA(from) + from_start,
1110            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1111            PyUnicode_4BYTE_DATA(to) + to_start
1112            );
1113    }
1114    else {
1115        /* check if max_char(from substring) <= max_char(to) */
1116        if (from_kind > to_kind
1117                /* latin1 => ascii */
1118            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1119        {
1120            /* slow path to check for character overflow */
1121            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1122            Py_UCS4 ch;
1123            Py_ssize_t i;
1124
1125#ifdef Py_DEBUG
1126            for (i=0; i < how_many; i++) {
1127                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1128                assert(ch <= to_maxchar);
1129                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1130            }
1131#else
1132            if (!check_maxchar) {
1133                for (i=0; i < how_many; i++) {
1134                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1135                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1136                }
1137            }
1138            else {
1139                for (i=0; i < how_many; i++) {
1140                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1141                    if (ch > to_maxchar)
1142                        return 1;
1143                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1144                }
1145            }
1146#endif
1147        }
1148        else {
1149            assert(0 && "inconsistent state");
1150            return 1;
1151        }
1152    }
1153    return 0;
1154}
1155
1156static void
1157copy_characters(PyObject *to, Py_ssize_t to_start,
1158                       PyObject *from, Py_ssize_t from_start,
1159                       Py_ssize_t how_many)
1160{
1161    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1162}
1163
1164Py_ssize_t
1165PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1166                         PyObject *from, Py_ssize_t from_start,
1167                         Py_ssize_t how_many)
1168{
1169    int err;
1170
1171    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1172        PyErr_BadInternalCall();
1173        return -1;
1174    }
1175
1176    if (PyUnicode_READY(from))
1177        return -1;
1178    if (PyUnicode_READY(to))
1179        return -1;
1180
1181    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1182    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1183        PyErr_Format(PyExc_SystemError,
1184                     "Cannot write %zi characters at %zi "
1185                     "in a string of %zi characters",
1186                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1187        return -1;
1188    }
1189
1190    if (how_many == 0)
1191        return 0;
1192
1193    if (_PyUnicode_Dirty(to))
1194        return -1;
1195
1196    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1197    if (err) {
1198        PyErr_Format(PyExc_SystemError,
1199                     "Cannot copy %s characters "
1200                     "into a string of %s characters",
1201                     unicode_kind_name(from),
1202                     unicode_kind_name(to));
1203        return -1;
1204    }
1205    return how_many;
1206}
1207
1208/* Find the maximum code point and count the number of surrogate pairs so a
1209   correct string length can be computed before converting a string to UCS4.
1210   This function counts single surrogates as a character and not as a pair.
1211
1212   Return 0 on success, or -1 on error. */
1213static int
1214find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1215                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1216{
1217    const wchar_t *iter;
1218
1219    assert(num_surrogates != NULL && maxchar != NULL);
1220    *num_surrogates = 0;
1221    *maxchar = 0;
1222
1223    for (iter = begin; iter < end; ) {
1224        if (*iter > *maxchar) {
1225            *maxchar = *iter;
1226#if SIZEOF_WCHAR_T != 2
1227            if (*maxchar >= 0x10000)
1228                return 0;
1229#endif
1230        }
1231#if SIZEOF_WCHAR_T == 2
1232        if (*iter >= 0xD800 && *iter <= 0xDBFF
1233            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1234        {
1235            Py_UCS4 surrogate_val;
1236            surrogate_val = (((iter[0] & 0x3FF)<<10)
1237                             | (iter[1] & 0x3FF)) + 0x10000;
1238            ++(*num_surrogates);
1239            if (surrogate_val > *maxchar)
1240                *maxchar = surrogate_val;
1241            iter += 2;
1242        }
1243        else
1244            iter++;
1245#else
1246        iter++;
1247#endif
1248    }
1249    return 0;
1250}
1251
#ifdef Py_DEBUG
/* Number of calls to unicode_ready() (debug builds only). */
static int unicode_ready_calls = 0;
#endif

/* Convert a legacy string (kind PyUnicode_WCHAR_KIND, only the wstr buffer
   set) into its canonical representation, in place.

   The wstr buffer is scanned for its maximum code point, then data.any,
   the length, the kind and the utf8 fields are filled in using a 1-, 2- or
   4-byte buffer.  Depending on sizeof(wchar_t) the wstr buffer is either
   reused as the data buffer or converted into a new buffer and freed.

   If `replace` is non-zero, an empty string or a single Latin-1 character
   is instead replaced in *p_obj by the shared empty string or the cached
   Latin-1 character, and the reference to the old object is released.
   Release builds silently disable `replace` when the object has more than
   one reference; debug builds assert on it.

   Return 0 on success, -1 on error (memory allocation failure, or failure
   to get the Latin-1 character). */
static int
unicode_ready(PyObject **p_obj, int replace)
{
    PyObject *unicode;
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(p_obj != NULL);
    unicode = *p_obj;

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Replacing the object is only possible when nobody else holds a
       reference to it. */
#ifdef Py_DEBUG
    assert(!replace || Py_REFCNT(unicode) == 1);
#else
    if (replace && Py_REFCNT(unicode) != 1)
        replace = 0;
#endif
    if (replace) {
        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
        wchar_t *wstr = _PyUnicode_WSTR(unicode);
        /* Optimization for empty strings */
        if (len == 0) {
            Py_INCREF(unicode_empty);
            Py_DECREF(*p_obj);
            *p_obj = unicode_empty;
            return 0;
        }
        /* Single Latin-1 characters are shared */
        if (len == 1 && wstr[0] < 256) {
            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
            if (latin1_char == NULL)
                return -1;
            Py_DECREF(*p_obj);
            *p_obj = latin1_char;
            return 0;
        }
    }

    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* All characters fit in one byte: convert into a new 1-byte buffer and
       free the wstr buffer. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* for ASCII the utf8 pointer shares the data buffer */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* each surrogate pair collapses into a single code point */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the wstr buffer as data */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1414
1415int
1416_PyUnicode_ReadyReplace(PyObject **op)
1417{
1418    return unicode_ready(op, 1);
1419}
1420
1421int
1422_PyUnicode_Ready(PyObject *op)
1423{
1424    return unicode_ready(&op, 0);
1425}
1426
1427static void
1428unicode_dealloc(register PyObject *unicode)
1429{
1430    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1431    case SSTATE_NOT_INTERNED:
1432        break;
1433
1434    case SSTATE_INTERNED_MORTAL:
1435        /* revive dead object temporarily for DelItem */
1436        Py_REFCNT(unicode) = 3;
1437        if (PyDict_DelItem(interned, unicode) != 0)
1438            Py_FatalError(
1439                "deletion of interned string failed");
1440        break;
1441
1442    case SSTATE_INTERNED_IMMORTAL:
1443        Py_FatalError("Immortal interned string died.");
1444
1445    default:
1446        Py_FatalError("Inconsistent interned string state.");
1447    }
1448
1449    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1450        PyObject_DEL(_PyUnicode_WSTR(unicode));
1451    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1452        PyObject_DEL(_PyUnicode_UTF8(unicode));
1453
1454    if (PyUnicode_IS_COMPACT(unicode)) {
1455        Py_TYPE(unicode)->tp_free(unicode);
1456    }
1457    else {
1458        if (_PyUnicode_DATA_ANY(unicode))
1459            PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1460        Py_TYPE(unicode)->tp_free(unicode);
1461    }
1462}
1463
1464#ifdef Py_DEBUG
1465static int
1466unicode_is_singleton(PyObject *unicode)
1467{
1468    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1469    if (unicode == unicode_empty)
1470        return 1;
1471    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1472    {
1473        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1474        if (ch < 256 && unicode_latin1[ch] == unicode)
1475            return 1;
1476    }
1477    return 0;
1478}
1479#endif
1480
1481static int
1482unicode_resizable(PyObject *unicode)
1483{
1484    if (Py_REFCNT(unicode) != 1)
1485        return 0;
1486    if (PyUnicode_CHECK_INTERNED(unicode))
1487        return 0;
1488#ifdef Py_DEBUG
1489    /* singleton refcount is greater than 1 */
1490    assert(!unicode_is_singleton(unicode));
1491#endif
1492    return 1;
1493}
1494
1495static int
1496unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1497{
1498    PyObject *unicode;
1499    Py_ssize_t old_length;
1500
1501    assert(p_unicode != NULL);
1502    unicode = *p_unicode;
1503
1504    assert(unicode != NULL);
1505    assert(PyUnicode_Check(unicode));
1506    assert(0 <= length);
1507
1508    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1509        old_length = PyUnicode_WSTR_LENGTH(unicode);
1510    else
1511        old_length = PyUnicode_GET_LENGTH(unicode);
1512    if (old_length == length)
1513        return 0;
1514
1515    if (length == 0) {
1516        Py_DECREF(*p_unicode);
1517        *p_unicode = unicode_empty;
1518        Py_INCREF(*p_unicode);
1519        return 0;
1520    }
1521
1522    if (!unicode_resizable(unicode)) {
1523        PyObject *copy = resize_copy(unicode, length);
1524        if (copy == NULL)
1525            return -1;
1526        Py_DECREF(*p_unicode);
1527        *p_unicode = copy;
1528        return 0;
1529    }
1530
1531    if (PyUnicode_IS_COMPACT(unicode)) {
1532        *p_unicode = resize_compact(unicode, length);
1533        if (*p_unicode == NULL)
1534            return -1;
1535        assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
1536        return 0;
1537    }
1538    return resize_inplace(unicode, length);
1539}
1540
1541int
1542PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1543{
1544    PyObject *unicode;
1545    if (p_unicode == NULL) {
1546        PyErr_BadInternalCall();
1547        return -1;
1548    }
1549    unicode = *p_unicode;
1550    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1551    {
1552        PyErr_BadInternalCall();
1553        return -1;
1554    }
1555    return unicode_resize(p_unicode, length);
1556}
1557
1558static int
1559unicode_widen(PyObject **p_unicode, unsigned int maxchar)
1560{
1561    PyObject *result;
1562    assert(PyUnicode_IS_READY(*p_unicode));
1563    if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1564        return 0;
1565    result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1566                           maxchar);
1567    if (result == NULL)
1568        return -1;
1569    PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1570                             PyUnicode_GET_LENGTH(*p_unicode));
1571    Py_DECREF(*p_unicode);
1572    *p_unicode = result;
1573    return 0;
1574}
1575
1576static int
1577unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1578                Py_UCS4 ch)
1579{
1580    if (unicode_widen(p_unicode, ch) < 0)
1581        return -1;
1582    PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1583                    PyUnicode_DATA(*p_unicode),
1584                    (*pos)++, ch);
1585    return 0;
1586}
1587
1588static PyObject*
1589get_latin1_char(unsigned char ch)
1590{
1591    PyObject *unicode = unicode_latin1[ch];
1592    if (!unicode) {
1593        unicode = PyUnicode_New(1, ch);
1594        if (!unicode)
1595            return NULL;
1596        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1597        assert(_PyUnicode_CheckConsistency(unicode, 1));
1598        unicode_latin1[ch] = unicode;
1599    }
1600    Py_INCREF(unicode);
1601    return unicode;
1602}
1603
1604PyObject *
1605PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1606{
1607    PyObject *unicode;
1608    Py_UCS4 maxchar = 0;
1609    Py_ssize_t num_surrogates;
1610
1611    if (u == NULL)
1612        return (PyObject*)_PyUnicode_New(size);
1613
1614    /* If the Unicode data is known at construction time, we can apply
1615       some optimizations which share commonly used objects. */
1616
1617    /* Optimization for empty strings */
1618    if (size == 0 && unicode_empty != NULL) {
1619        Py_INCREF(unicode_empty);
1620        return unicode_empty;
1621    }
1622
1623    /* Single character Unicode objects in the Latin-1 range are
1624       shared when using this constructor */
1625    if (size == 1 && *u < 256)
1626        return get_latin1_char((unsigned char)*u);
1627
1628    /* If not empty and not single character, copy the Unicode data
1629       into the new object */
1630    if (find_maxchar_surrogates(u, u + size,
1631                                &maxchar, &num_surrogates) == -1)
1632        return NULL;
1633
1634    unicode = PyUnicode_New(size - num_surrogates,
1635                                                maxchar);
1636    if (!unicode)
1637        return NULL;
1638
1639    switch (PyUnicode_KIND(unicode)) {
1640    case PyUnicode_1BYTE_KIND:
1641        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1642                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1643        break;
1644    case PyUnicode_2BYTE_KIND:
1645#if Py_UNICODE_SIZE == 2
1646        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1647#else
1648        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1649                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1650#endif
1651        break;
1652    case PyUnicode_4BYTE_KIND:
1653#if SIZEOF_WCHAR_T == 2
1654        /* This is the only case which has to process surrogates, thus
1655           a simple copy loop is not enough and we need a function. */
1656        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1657#else
1658        assert(num_surrogates == 0);
1659        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1660#endif
1661        break;
1662    default:
1663        assert(0 && "Impossible state");
1664    }
1665
1666    assert(_PyUnicode_CheckConsistency(unicode, 1));
1667    return unicode;
1668}
1669
1670PyObject *
1671PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1672{
1673    if (size < 0) {
1674        PyErr_SetString(PyExc_SystemError,
1675                        "Negative size passed to PyUnicode_FromStringAndSize");
1676        return NULL;
1677    }
1678
1679    /* If the Unicode data is known at construction time, we can apply
1680       some optimizations which share commonly used objects.
1681       Also, this means the input must be UTF-8, so fall back to the
1682       UTF-8 decoder at the end. */
1683    if (u != NULL) {
1684
1685        /* Optimization for empty strings */
1686        if (size == 0 && unicode_empty != NULL) {
1687            Py_INCREF(unicode_empty);
1688            return unicode_empty;
1689        }
1690
1691        /* Single characters are shared when using this constructor.
1692           Restrict to ASCII, since the input must be UTF-8. */
1693        if (size == 1 && (unsigned char)*u < 128)
1694            return get_latin1_char((unsigned char)*u);
1695
1696        return PyUnicode_DecodeUTF8(u, size, NULL);
1697    }
1698
1699    return (PyObject *)_PyUnicode_New(size);
1700}
1701
1702PyObject *
1703PyUnicode_FromString(const char *u)
1704{
1705    size_t size = strlen(u);
1706    if (size > PY_SSIZE_T_MAX) {
1707        PyErr_SetString(PyExc_OverflowError, "input too long");
1708        return NULL;
1709    }
1710
1711    return PyUnicode_FromStringAndSize(u, size);
1712}
1713
1714PyObject *
1715_PyUnicode_FromId(_Py_Identifier *id)
1716{
1717    if (!id->object) {
1718        id->object = PyUnicode_FromString(id->string);
1719        if (!id->object)
1720            return NULL;
1721        PyUnicode_InternInPlace(&id->object);
1722        assert(!id->next);
1723        id->next = static_strings;
1724        static_strings = id;
1725    }
1726    return id->object;
1727}
1728
1729void
1730_PyUnicode_ClearStaticStrings()
1731{
1732    _Py_Identifier *i;
1733    for (i = static_strings; i; i = i->next) {
1734        Py_DECREF(i->object);
1735        i->object = NULL;
1736        i->next = NULL;
1737    }
1738}
1739
1740static PyObject*
1741unicode_fromascii(const unsigned char* s, Py_ssize_t size)
1742{
1743    PyObject *res;
1744#ifdef Py_DEBUG
1745    const unsigned char *p;
1746    const unsigned char *end = s + size;
1747    for (p=s; p < end; p++) {
1748        assert(*p < 128);
1749    }
1750#endif
1751    if (size == 1)
1752        return get_latin1_char(s[0]);
1753    res = PyUnicode_New(size, 127);
1754    if (!res)
1755        return NULL;
1756    memcpy(PyUnicode_1BYTE_DATA(res), s, size);
1757    return res;
1758}
1759
1760static Py_UCS4
1761kind_maxchar_limit(unsigned int kind)
1762{
1763    switch(kind) {
1764    case PyUnicode_1BYTE_KIND:
1765        return 0x80;
1766    case PyUnicode_2BYTE_KIND:
1767        return 0x100;
1768    case PyUnicode_4BYTE_KIND:
1769        return 0x10000;
1770    default:
1771        assert(0 && "invalid kind");
1772        return 0x10ffff;
1773    }
1774}
1775
1776static PyObject*
1777_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1778{
1779    PyObject *res;
1780    unsigned char max_char = 127;
1781
1782    assert(size >= 0);
1783    if (size == 1)
1784        return get_latin1_char(u[0]);
1785    max_char = ucs1lib_find_max_char(u, u + size);
1786    res = PyUnicode_New(size, max_char);
1787    if (!res)
1788        return NULL;
1789    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1790    assert(_PyUnicode_CheckConsistency(res, 1));
1791    return res;
1792}
1793
1794static PyObject*
1795_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1796{
1797    PyObject *res;
1798    Py_UCS2 max_char = 0;
1799
1800    assert(size >= 0);
1801    if (size == 1 && u[0] < 256)
1802        return get_latin1_char((unsigned char)u[0]);
1803    max_char = ucs2lib_find_max_char(u, u + size);
1804    res = PyUnicode_New(size, max_char);
1805    if (!res)
1806        return NULL;
1807    if (max_char >= 256)
1808        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1809    else {
1810        _PyUnicode_CONVERT_BYTES(
1811            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1812    }
1813    assert(_PyUnicode_CheckConsistency(res, 1));
1814    return res;
1815}
1816
1817static PyObject*
1818_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1819{
1820    PyObject *res;
1821    Py_UCS4 max_char = 0;
1822
1823    assert(size >= 0);
1824    if (size == 1 && u[0] < 256)
1825        return get_latin1_char(u[0]);
1826    max_char = ucs4lib_find_max_char(u, u + size);
1827    res = PyUnicode_New(size, max_char);
1828    if (!res)
1829        return NULL;
1830    if (max_char < 256)
1831        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1832                                 PyUnicode_1BYTE_DATA(res));
1833    else if (max_char < 0x10000)
1834        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1835                                 PyUnicode_2BYTE_DATA(res));
1836    else
1837        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1838    assert(_PyUnicode_CheckConsistency(res, 1));
1839    return res;
1840}
1841
1842PyObject*
1843PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1844{
1845    switch(kind) {
1846    case PyUnicode_1BYTE_KIND:
1847        return _PyUnicode_FromUCS1(buffer, size);
1848    case PyUnicode_2BYTE_KIND:
1849        return _PyUnicode_FromUCS2(buffer, size);
1850    case PyUnicode_4BYTE_KIND:
1851        return _PyUnicode_FromUCS4(buffer, size);
1852    default:
1853        assert(0 && "invalid kind");
1854        PyErr_SetString(PyExc_SystemError, "invalid kind");
1855        return NULL;
1856    }
1857}
1858
1859/* Ensure that a string uses the most efficient storage, if it is not the
1860   case: create a new string with of the right kind. Write NULL into *p_unicode
1861   on error. */
1862static void
1863unicode_adjust_maxchar(PyObject **p_unicode)
1864{
1865    PyObject *unicode, *copy;
1866    Py_UCS4 max_char;
1867    Py_ssize_t len;
1868    unsigned int kind;
1869
1870    assert(p_unicode != NULL);
1871    unicode = *p_unicode;
1872    assert(PyUnicode_IS_READY(unicode));
1873    if (PyUnicode_IS_ASCII(unicode))
1874        return;
1875
1876    len = PyUnicode_GET_LENGTH(unicode);
1877    kind = PyUnicode_KIND(unicode);
1878    if (kind == PyUnicode_1BYTE_KIND) {
1879        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
1880        max_char = ucs1lib_find_max_char(u, u + len);
1881        if (max_char >= 128)
1882            return;
1883    }
1884    else if (kind == PyUnicode_2BYTE_KIND) {
1885        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
1886        max_char = ucs2lib_find_max_char(u, u + len);
1887        if (max_char >= 256)
1888            return;
1889    }
1890    else {
1891        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
1892        assert(kind == PyUnicode_4BYTE_KIND);
1893        max_char = ucs4lib_find_max_char(u, u + len);
1894        if (max_char >= 0x10000)
1895            return;
1896    }
1897    copy = PyUnicode_New(len, max_char);
1898    copy_characters(copy, 0, unicode, 0, len);
1899    Py_DECREF(unicode);
1900    *p_unicode = copy;
1901}
1902
1903PyObject*
1904PyUnicode_Copy(PyObject *unicode)
1905{
1906    Py_ssize_t size;
1907    PyObject *copy;
1908    void *data;
1909
1910    if (!PyUnicode_Check(unicode)) {
1911        PyErr_BadInternalCall();
1912        return NULL;
1913    }
1914    if (PyUnicode_READY(unicode))
1915        return NULL;
1916
1917    size = PyUnicode_GET_LENGTH(unicode);
1918    copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1919    if (!copy)
1920        return NULL;
1921    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1922
1923    data = PyUnicode_DATA(unicode);
1924    switch (PyUnicode_KIND(unicode))
1925    {
1926    case PyUnicode_1BYTE_KIND:
1927        memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1928        break;
1929    case PyUnicode_2BYTE_KIND:
1930        memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1931        break;
1932    case PyUnicode_4BYTE_KIND:
1933        memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1934        break;
1935    default:
1936        assert(0);
1937        break;
1938    }
1939    assert(_PyUnicode_CheckConsistency(copy, 1));
1940    return copy;
1941}
1942
1943
1944/* Widen Unicode objects to larger buffers. Don't write terminating null
1945   character. Return NULL on error. */
1946
1947void*
1948_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1949{
1950    Py_ssize_t len;
1951    void *result;
1952    unsigned int skind;
1953
1954    if (PyUnicode_READY(s))
1955        return NULL;
1956
1957    len = PyUnicode_GET_LENGTH(s);
1958    skind = PyUnicode_KIND(s);
1959    if (skind >= kind) {
1960        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
1961        return NULL;
1962    }
1963    switch(kind) {
1964    case PyUnicode_2BYTE_KIND:
1965        result = PyMem_Malloc(len * sizeof(Py_UCS2));
1966        if (!result)
1967            return PyErr_NoMemory();
1968        assert(skind == PyUnicode_1BYTE_KIND);
1969        _PyUnicode_CONVERT_BYTES(
1970            Py_UCS1, Py_UCS2,
1971            PyUnicode_1BYTE_DATA(s),
1972            PyUnicode_1BYTE_DATA(s) + len,
1973            result);
1974        return result;
1975    case PyUnicode_4BYTE_KIND:
1976        result = PyMem_Malloc(len * sizeof(Py_UCS4));
1977        if (!result)
1978            return PyErr_NoMemory();
1979        if (skind == PyUnicode_2BYTE_KIND) {
1980            _PyUnicode_CONVERT_BYTES(
1981                Py_UCS2, Py_UCS4,
1982                PyUnicode_2BYTE_DATA(s),
1983                PyUnicode_2BYTE_DATA(s) + len,
1984                result);
1985        }
1986        else {
1987            assert(skind == PyUnicode_1BYTE_KIND);
1988            _PyUnicode_CONVERT_BYTES(
1989                Py_UCS1, Py_UCS4,
1990                PyUnicode_1BYTE_DATA(s),
1991                PyUnicode_1BYTE_DATA(s) + len,
1992                result);
1993        }
1994        return result;
1995    default:
1996        break;
1997    }
1998    PyErr_SetString(PyExc_SystemError, "invalid kind");
1999    return NULL;
2000}
2001
2002static Py_UCS4*
2003as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2004        int copy_null)
2005{
2006    int kind;
2007    void *data;
2008    Py_ssize_t len, targetlen;
2009    if (PyUnicode_READY(string) == -1)
2010        return NULL;
2011    kind = PyUnicode_KIND(string);
2012    data = PyUnicode_DATA(string);
2013    len = PyUnicode_GET_LENGTH(string);
2014    targetlen = len;
2015    if (copy_null)
2016        targetlen++;
2017    if (!target) {
2018        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2019            PyErr_NoMemory();
2020            return NULL;
2021        }
2022        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2023        if (!target) {
2024            PyErr_NoMemory();
2025            return NULL;
2026        }
2027    }
2028    else {
2029        if (targetsize < targetlen) {
2030            PyErr_Format(PyExc_SystemError,
2031                         "string is longer than the buffer");
2032            if (copy_null && 0 < targetsize)
2033                target[0] = 0;
2034            return NULL;
2035        }
2036    }
2037    if (kind == PyUnicode_1BYTE_KIND) {
2038        Py_UCS1 *start = (Py_UCS1 *) data;
2039        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2040    }
2041    else if (kind == PyUnicode_2BYTE_KIND) {
2042        Py_UCS2 *start = (Py_UCS2 *) data;
2043        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2044    }
2045    else {
2046        assert(kind == PyUnicode_4BYTE_KIND);
2047        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2048    }
2049    if (copy_null)
2050        target[len] = 0;
2051    return target;
2052}
2053
2054Py_UCS4*
2055PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2056                 int copy_null)
2057{
2058    if (target == NULL || targetsize < 0) {
2059        PyErr_BadInternalCall();
2060        return NULL;
2061    }
2062    return as_ucs4(string, target, targetsize, copy_null);
2063}
2064
2065Py_UCS4*
2066PyUnicode_AsUCS4Copy(PyObject *string)
2067{
2068    return as_ucs4(string, NULL, 0, 1);
2069}
2070
2071#ifdef HAVE_WCHAR_H
2072
2073PyObject *
2074PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2075{
2076    if (w == NULL) {
2077        if (size == 0)
2078            return PyUnicode_New(0, 0);
2079        PyErr_BadInternalCall();
2080        return NULL;
2081    }
2082
2083    if (size == -1) {
2084        size = wcslen(w);
2085    }
2086
2087    return PyUnicode_FromUnicode(w, size);
2088}
2089
2090#endif /* HAVE_WCHAR_H */
2091
2092static void
2093makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2094        int zeropad, int width, int precision, char c)
2095{
2096    *fmt++ = '%';
2097    if (width) {
2098        if (zeropad)
2099            *fmt++ = '0';
2100        fmt += sprintf(fmt, "%d", width);
2101    }
2102    if (precision)
2103        fmt += sprintf(fmt, ".%d", precision);
2104    if (longflag)
2105        *fmt++ = 'l';
2106    else if (longlongflag) {
2107        /* longlongflag should only ever be nonzero on machines with
2108           HAVE_LONG_LONG defined */
2109#ifdef HAVE_LONG_LONG
2110        char *f = PY_FORMAT_LONG_LONG;
2111        while (*f)
2112            *fmt++ = *f++;
2113#else
2114        /* we shouldn't ever get here */
2115        assert(0);
2116        *fmt++ = 'l';
2117#endif
2118    }
2119    else if (size_tflag) {
2120        char *f = PY_FORMAT_SIZE_T;
2121        while (*f)
2122            *fmt++ = *f++;
2123    }
2124    *fmt++ = c;
2125    *fmt = '\0';
2126}
2127
2128/* helper for PyUnicode_FromFormatV() */
2129
2130static const char*
2131parse_format_flags(const char *f,
2132                   int *p_width, int *p_precision,
2133                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2134{
2135    int width, precision, longflag, longlongflag, size_tflag;
2136
2137    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2138    f++;
2139    width = 0;
2140    while (Py_ISDIGIT((unsigned)*f))
2141        width = (width*10) + *f++ - '0';
2142    precision = 0;
2143    if (*f == '.') {
2144        f++;
2145        while (Py_ISDIGIT((unsigned)*f))
2146            precision = (precision*10) + *f++ - '0';
2147        if (*f == '%') {
2148            /* "%.3%s" => f points to "3" */
2149            f--;
2150        }
2151    }
2152    if (*f == '\0') {
2153        /* bogus format "%.1" => go backward, f points to "1" */
2154        f--;
2155    }
2156    if (p_width != NULL)
2157        *p_width = width;
2158    if (p_precision != NULL)
2159        *p_precision = precision;
2160
2161    /* Handle %ld, %lu, %lld and %llu. */
2162    longflag = 0;
2163    longlongflag = 0;
2164    size_tflag = 0;
2165
2166    if (*f == 'l') {
2167        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2168            longflag = 1;
2169            ++f;
2170        }
2171#ifdef HAVE_LONG_LONG
2172        else if (f[1] == 'l' &&
2173                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2174            longlongflag = 1;
2175            f += 2;
2176        }
2177#endif
2178    }
2179    /* handle the size_t flag. */
2180    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2181        size_tflag = 1;
2182        ++f;
2183    }
2184    if (p_longflag != NULL)
2185        *p_longflag = longflag;
2186    if (p_longlongflag != NULL)
2187        *p_longlongflag = longlongflag;
2188    if (p_size_tflag != NULL)
2189        *p_size_tflag = size_tflag;
2190    return f;
2191}
2192
2193/* maximum number of characters required for output of %ld.  21 characters
2194   allows for 64-bit integers (in decimal) and an optional sign. */
2195#define MAX_LONG_CHARS 21
2196/* maximum number of characters required for output of %lld.
2197   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2198   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2199#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2200
2201PyObject *
2202PyUnicode_FromFormatV(const char *format, va_list vargs)
2203{
2204    va_list count;
2205    Py_ssize_t callcount = 0;
2206    PyObject **callresults = NULL;
2207    PyObject **callresult = NULL;
2208    Py_ssize_t n = 0;
2209    int width = 0;
2210    int precision = 0;
2211    int zeropad;
2212    const char* f;
2213    PyObject *string;
2214    /* used by sprintf */
2215    char fmt[61]; /* should be enough for %0width.precisionlld */
2216    Py_UCS4 maxchar = 127; /* result is ASCII by default */
2217    Py_UCS4 argmaxchar;
2218    Py_ssize_t numbersize = 0;
2219    char *numberresults = NULL;
2220    char *numberresult = NULL;
2221    Py_ssize_t i;
2222    int kind;
2223    void *data;
2224
2225    Py_VA_COPY(count, vargs);
2226    /* step 1: count the number of %S/%R/%A/%s format specifications
2227     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2228     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2229     * result in an array)
2230     * also estimate a upper bound for all the number formats in the string,
2231     * numbers will be formatted in step 3 and be kept in a '\0'-separated
2232     * buffer before putting everything together. */
2233    for (f = format; *f; f++) {
2234        if (*f == '%') {
2235            int longlongflag;
2236            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2237            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2238            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2239                ++callcount;
2240
2241            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
2242#ifdef HAVE_LONG_LONG
2243                if (longlongflag) {
2244                    if (width < MAX_LONG_LONG_CHARS)
2245                        width = MAX_LONG_LONG_CHARS;
2246                }
2247                else
2248#endif
2249                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2250                       including sign.  Decimal takes the most space.  This
2251                       isn't enough for octal.  If a width is specified we
2252                       need more (which we allocate later). */
2253                    if (width < MAX_LONG_CHARS)
2254                        width = MAX_LONG_CHARS;
2255
2256                /* account for the size + '\0' to separate numbers
2257                   inside of the numberresults buffer */
2258                numbersize += (width + 1);
2259            }
2260        }
2261        else if ((unsigned char)*f > 127) {
2262            PyErr_Format(PyExc_ValueError,
2263                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2264                "string, got a non-ASCII byte: 0x%02x",
2265                (unsigned char)*f);
2266            return NULL;
2267        }
2268    }
2269    /* step 2: allocate memory for the results of
2270     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2271    if (callcount) {
2272        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2273        if (!callresults) {
2274            PyErr_NoMemory();
2275            return NULL;
2276        }
2277        callresult = callresults;
2278    }
2279    /* step 2.5: allocate memory for the results of formating numbers */
2280    if (numbersize) {
2281        numberresults = PyObject_Malloc(numbersize);
2282        if (!numberresults) {
2283            PyErr_NoMemory();
2284            goto fail;
2285        }
2286        numberresult = numberresults;
2287    }
2288
2289    /* step 3: format numbers and figure out how large a buffer we need */
2290    for (f = format; *f; f++) {
2291        if (*f == '%') {
2292            const char* p;
2293            int longflag;
2294            int longlongflag;
2295            int size_tflag;
2296            int numprinted;
2297
2298            p = f;
2299            zeropad = (f[1] == '0');
2300            f = parse_format_flags(f, &width, &precision,
2301                                   &longflag, &longlongflag, &size_tflag);
2302            switch (*f) {
2303            case 'c':
2304            {
2305                Py_UCS4 ordinal = va_arg(count, int);
2306                maxchar = Py_MAX(maxchar, ordinal);
2307                n++;
2308                break;
2309            }
2310            case '%':
2311                n++;
2312                break;
2313            case 'i':
2314            case 'd':
2315                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2316                        width, precision, *f);
2317                if (longflag)
2318                    numprinted = sprintf(numberresult, fmt,
2319                                         va_arg(count, long));
2320#ifdef HAVE_LONG_LONG
2321                else if (longlongflag)
2322                    numprinted = sprintf(numberresult, fmt,
2323                                         va_arg(count, PY_LONG_LONG));
2324#endif
2325                else if (size_tflag)
2326                    numprinted = sprintf(numberresult, fmt,
2327                                         va_arg(count, Py_ssize_t));
2328                else
2329                    numprinted = sprintf(numberresult, fmt,
2330                                         va_arg(count, int));
2331                n += numprinted;
2332                /* advance by +1 to skip over the '\0' */
2333                numberresult += (numprinted + 1);
2334                assert(*(numberresult - 1) == '\0');
2335                assert(*(numberresult - 2) != '\0');
2336                assert(numprinted >= 0);
2337                assert(numberresult <= numberresults + numbersize);
2338                break;
2339            case 'u':
2340                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2341                        width, precision, 'u');
2342                if (longflag)
2343                    numprinted = sprintf(numberresult, fmt,
2344                                         va_arg(count, unsigned long));
2345#ifdef HAVE_LONG_LONG
2346                else if (longlongflag)
2347                    numprinted = sprintf(numberresult, fmt,
2348                                         va_arg(count, unsigned PY_LONG_LONG));
2349#endif
2350                else if (size_tflag)
2351                    numprinted = sprintf(numberresult, fmt,
2352                                         va_arg(count, size_t));
2353                else
2354                    numprinted = sprintf(numberresult, fmt,
2355                                         va_arg(count, unsigned int));
2356                n += numprinted;
2357                numberresult += (numprinted + 1);
2358                assert(*(numberresult - 1) == '\0');
2359                assert(*(numberresult - 2) != '\0');
2360                assert(numprinted >= 0);
2361                assert(numberresult <= numberresults + numbersize);
2362                break;
2363            case 'x':
2364                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2365                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2366                n += numprinted;
2367                numberresult += (numprinted + 1);
2368                assert(*(numberresult - 1) == '\0');
2369                assert(*(numberresult - 2) != '\0');
2370                assert(numprinted >= 0);
2371                assert(numberresult <= numberresults + numbersize);
2372                break;
2373            case 'p':
2374                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2375                /* %p is ill-defined:  ensure leading 0x. */
2376                if (numberresult[1] == 'X')
2377                    numberresult[1] = 'x';
2378                else if (numberresult[1] != 'x') {
2379                    memmove(numberresult + 2, numberresult,
2380                            strlen(numberresult) + 1);
2381                    numberresult[0] = '0';
2382                    numberresult[1] = 'x';
2383                    numprinted += 2;
2384                }
2385                n += numprinted;
2386                numberresult += (numprinted + 1);
2387                assert(*(numberresult - 1) == '\0');
2388                assert(*(numberresult - 2) != '\0');
2389                assert(numprinted >= 0);
2390                assert(numberresult <= numberresults + numbersize);
2391                break;
2392            case 's':
2393            {
2394                /* UTF-8 */
2395                const char *s = va_arg(count, const char*);
2396                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2397                if (!str)
2398                    goto fail;
2399                /* since PyUnicode_DecodeUTF8 returns already flexible
2400                   unicode objects, there is no need to call ready on them */
2401                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2402                maxchar = Py_MAX(maxchar, argmaxchar);
2403                n += PyUnicode_GET_LENGTH(str);
2404                /* Remember the str and switch to the next slot */
2405                *callresult++ = str;
2406                break;
2407            }
2408            case 'U':
2409            {
2410                PyObject *obj = va_arg(count, PyObject *);
2411                assert(obj && _PyUnicode_CHECK(obj));
2412                if (PyUnicode_READY(obj) == -1)
2413                    goto fail;
2414                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2415                maxchar = Py_MAX(maxchar, argmaxchar);
2416                n += PyUnicode_GET_LENGTH(obj);
2417                break;
2418            }
2419            case 'V':
2420            {
2421                PyObject *obj = va_arg(count, PyObject *);
2422                const char *str = va_arg(count, const char *);
2423                PyObject *str_obj;
2424                assert(obj || str);
2425                assert(!obj || _PyUnicode_CHECK(obj));
2426                if (obj) {
2427                    if (PyUnicode_READY(obj) == -1)
2428                        goto fail;
2429                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2430                    maxchar = Py_MAX(maxchar, argmaxchar);
2431                    n += PyUnicode_GET_LENGTH(obj);
2432                    *callresult++ = NULL;
2433                }
2434                else {
2435                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2436                    if (!str_obj)
2437                        goto fail;
2438                    if (PyUnicode_READY(str_obj)) {
2439                        Py_DECREF(str_obj);
2440                        goto fail;
2441                    }
2442                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2443                    maxchar = Py_MAX(maxchar, argmaxchar);
2444                    n += PyUnicode_GET_LENGTH(str_obj);
2445                    *callresult++ = str_obj;
2446                }
2447                break;
2448            }
2449            case 'S':
2450            {
2451                PyObject *obj = va_arg(count, PyObject *);
2452                PyObject *str;
2453                assert(obj);
2454                str = PyObject_Str(obj);
2455                if (!str || PyUnicode_READY(str) == -1)
2456                    goto fail;
2457                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2458                maxchar = Py_MAX(maxchar, argmaxchar);
2459                n += PyUnicode_GET_LENGTH(str);
2460                /* Remember the str and switch to the next slot */
2461                *callresult++ = str;
2462                break;
2463            }
2464            case 'R':
2465            {
2466                PyObject *obj = va_arg(count, PyObject *);
2467                PyObject *repr;
2468                assert(obj);
2469                repr = PyObject_Repr(obj);
2470                if (!repr || PyUnicode_READY(repr) == -1)
2471                    goto fail;
2472                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2473                maxchar = Py_MAX(maxchar, argmaxchar);
2474                n += PyUnicode_GET_LENGTH(repr);
2475                /* Remember the repr and switch to the next slot */
2476                *callresult++ = repr;
2477                break;
2478            }
2479            case 'A':
2480            {
2481                PyObject *obj = va_arg(count, PyObject *);
2482                PyObject *ascii;
2483                assert(obj);
2484                ascii = PyObject_ASCII(obj);
2485                if (!ascii || PyUnicode_READY(ascii) == -1)
2486                    goto fail;
2487                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2488                maxchar = Py_MAX(maxchar, argmaxchar);
2489                n += PyUnicode_GET_LENGTH(ascii);
2490                /* Remember the repr and switch to the next slot */
2491                *callresult++ = ascii;
2492                break;
2493            }
2494            default:
2495                /* if we stumble upon an unknown
2496                   formatting code, copy the rest of
2497                   the format string to the output
2498                   string. (we cannot just skip the
2499                   code, since there's no way to know
2500                   what's in the argument list) */
2501                n += strlen(p);
2502                goto expand;
2503            }
2504        } else
2505            n++;
2506    }
2507  expand:
2508    /* step 4: fill the buffer */
2509    /* Since we've analyzed how much space we need,
2510       we don't have to resize the string.
2511       There can be no errors beyond this point. */
2512    string = PyUnicode_New(n, maxchar);
2513    if (!string)
2514        goto fail;
2515    kind = PyUnicode_KIND(string);
2516    data = PyUnicode_DATA(string);
2517    callresult = callresults;
2518    numberresult = numberresults;
2519
2520    for (i = 0, f = format; *f; f++) {
2521        if (*f == '%') {
2522            const char* p;
2523
2524            p = f;
2525            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2526            /* checking for == because the last argument could be a empty
2527               string, which causes i to point to end, the assert at the end of
2528               the loop */
2529            assert(i <= PyUnicode_GET_LENGTH(string));
2530
2531            switch (*f) {
2532            case 'c':
2533            {
2534                const int ordinal = va_arg(vargs, int);
2535                PyUnicode_WRITE(kind, data, i++, ordinal);
2536                break;
2537            }
2538            case 'i':
2539            case 'd':
2540            case 'u':
2541            case 'x':
2542            case 'p':
2543                /* unused, since we already have the result */
2544                if (*f == 'p')
2545                    (void) va_arg(vargs, void *);
2546                else
2547                    (void) va_arg(vargs, int);
2548                /* extract the result from numberresults and append. */
2549                for (; *numberresult; ++i, ++numberresult)
2550                    PyUnicode_WRITE(kind, data, i, *numberresult);
2551                /* skip over the separating '\0' */
2552                assert(*numberresult == '\0');
2553                numberresult++;
2554                assert(numberresult <= numberresults + numbersize);
2555                break;
2556            case 's':
2557            {
2558                /* unused, since we already have the result */
2559                Py_ssize_t size;
2560                (void) va_arg(vargs, char *);
2561                size = PyUnicode_GET_LENGTH(*callresult);
2562                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2563                copy_characters(string, i, *callresult, 0, size);
2564                i += size;
2565                /* We're done with the unicode()/repr() => forget it */
2566                Py_DECREF(*callresult);
2567                /* switch to next unicode()/repr() result */
2568                ++callresult;
2569                break;
2570            }
2571            case 'U':
2572            {
2573                PyObject *obj = va_arg(vargs, PyObject *);
2574                Py_ssize_t size;
2575                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2576                size = PyUnicode_GET_LENGTH(obj);
2577                copy_characters(string, i, obj, 0, size);
2578                i += size;
2579                break;
2580            }
2581            case 'V':
2582            {
2583                Py_ssize_t size;
2584                PyObject *obj = va_arg(vargs, PyObject *);
2585                va_arg(vargs, const char *);
2586                if (obj) {
2587                    size = PyUnicode_GET_LENGTH(obj);
2588                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2589                    copy_characters(string, i, obj, 0, size);
2590                    i += size;
2591                } else {
2592                    size = PyUnicode_GET_LENGTH(*callresult);
2593                    assert(PyUnicode_KIND(*callresult) <=
2594                           PyUnicode_KIND(string));
2595                    copy_characters(string, i, *callresult, 0, size);
2596                    i += size;
2597                    Py_DECREF(*callresult);
2598                }
2599                ++callresult;
2600                break;
2601            }
2602            case 'S':
2603            case 'R':
2604            case 'A':
2605            {
2606                Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2607                /* unused, since we already have the result */
2608                (void) va_arg(vargs, PyObject *);
2609                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2610                copy_characters(string, i, *callresult, 0,  size);
2611                i += size;
2612                /* We're done with the unicode()/repr() => forget it */
2613                Py_DECREF(*callresult);
2614                /* switch to next unicode()/repr() result */
2615                ++callresult;
2616                break;
2617            }
2618            case '%':
2619                PyUnicode_WRITE(kind, data, i++, '%');
2620                break;
2621            default:
2622                for (; *p; ++p, ++i)
2623                    PyUnicode_WRITE(kind, data, i, *p);
2624                assert(i == PyUnicode_GET_LENGTH(string));
2625                goto end;
2626            }
2627        }
2628        else {
2629            assert(i < PyUnicode_GET_LENGTH(string));
2630            PyUnicode_WRITE(kind, data, i++, *f);
2631        }
2632    }
2633    assert(i == PyUnicode_GET_LENGTH(string));
2634
2635  end:
2636    if (callresults)
2637        PyObject_Free(callresults);
2638    if (numberresults)
2639        PyObject_Free(numberresults);
2640    assert(_PyUnicode_CheckConsistency(string, 1));
2641    return string;
2642  fail:
2643    if (callresults) {
2644        PyObject **callresult2 = callresults;
2645        while (callresult2 < callresult) {
2646            Py_XDECREF(*callresult2);
2647            ++callresult2;
2648        }
2649        PyObject_Free(callresults);
2650    }
2651    if (numberresults)
2652        PyObject_Free(numberresults);
2653    return NULL;
2654}
2655
2656PyObject *
2657PyUnicode_FromFormat(const char *format, ...)
2658{
2659    PyObject* ret;
2660    va_list vargs;
2661
2662#ifdef HAVE_STDARG_PROTOTYPES
2663    va_start(vargs, format);
2664#else
2665    va_start(vargs);
2666#endif
2667    ret = PyUnicode_FromFormatV(format, vargs);
2668    va_end(vargs);
2669    return ret;
2670}
2671
2672#ifdef HAVE_WCHAR_H
2673
2674/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2675   convert a Unicode object to a wide character string.
2676
2677   - If w is NULL: return the number of wide characters (including the null
2678     character) required to convert the unicode object. Ignore size argument.
2679
2680   - Otherwise: return the number of wide characters (excluding the null
2681     character) written into w. Write at most size wide characters (including
2682     the null character). */
2683static Py_ssize_t
2684unicode_aswidechar(PyObject *unicode,
2685                   wchar_t *w,
2686                   Py_ssize_t size)
2687{
2688    Py_ssize_t res;
2689    const wchar_t *wstr;
2690
2691    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2692    if (wstr == NULL)
2693        return -1;
2694
2695    if (w != NULL) {
2696        if (size > res)
2697            size = res + 1;
2698        else
2699            res = size;
2700        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2701        return res;
2702    }
2703    else
2704        return res + 1;
2705}
2706
2707Py_ssize_t
2708PyUnicode_AsWideChar(PyObject *unicode,
2709                     wchar_t *w,
2710                     Py_ssize_t size)
2711{
2712    if (unicode == NULL) {
2713        PyErr_BadInternalCall();
2714        return -1;
2715    }
2716    return unicode_aswidechar(unicode, w, size);
2717}
2718
2719wchar_t*
2720PyUnicode_AsWideCharString(PyObject *unicode,
2721                           Py_ssize_t *size)
2722{
2723    wchar_t* buffer;
2724    Py_ssize_t buflen;
2725
2726    if (unicode == NULL) {
2727        PyErr_BadInternalCall();
2728        return NULL;
2729    }
2730
2731    buflen = unicode_aswidechar(unicode, NULL, 0);
2732    if (buflen == -1)
2733        return NULL;
2734    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2735        PyErr_NoMemory();
2736        return NULL;
2737    }
2738
2739    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2740    if (buffer == NULL) {
2741        PyErr_NoMemory();
2742        return NULL;
2743    }
2744    buflen = unicode_aswidechar(unicode, buffer, buflen);
2745    if (buflen == -1)
2746        return NULL;
2747    if (size != NULL)
2748        *size = buflen;
2749    return buffer;
2750}
2751
2752#endif /* HAVE_WCHAR_H */
2753
2754PyObject *
2755PyUnicode_FromOrdinal(int ordinal)
2756{
2757    PyObject *v;
2758    if (ordinal < 0 || ordinal > 0x10ffff) {
2759        PyErr_SetString(PyExc_ValueError,
2760                        "chr() arg not in range(0x110000)");
2761        return NULL;
2762    }
2763
2764    if (ordinal < 256)
2765        return get_latin1_char(ordinal);
2766
2767    v = PyUnicode_New(1, ordinal);
2768    if (v == NULL)
2769        return NULL;
2770    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2771    assert(_PyUnicode_CheckConsistency(v, 1));
2772    return v;
2773}
2774
2775PyObject *
2776PyUnicode_FromObject(register PyObject *obj)
2777{
2778    /* XXX Perhaps we should make this API an alias of
2779       PyObject_Str() instead ?! */
2780    if (PyUnicode_CheckExact(obj)) {
2781        if (PyUnicode_READY(obj))
2782            return NULL;
2783        Py_INCREF(obj);
2784        return obj;
2785    }
2786    if (PyUnicode_Check(obj)) {
2787        /* For a Unicode subtype that's not a Unicode object,
2788           return a true Unicode object with the same data. */
2789        return PyUnicode_Copy(obj);
2790    }
2791    PyErr_Format(PyExc_TypeError,
2792                 "Can't convert '%.100s' object to str implicitly",
2793                 Py_TYPE(obj)->tp_name);
2794    return NULL;
2795}
2796
2797PyObject *
2798PyUnicode_FromEncodedObject(register PyObject *obj,
2799                            const char *encoding,
2800                            const char *errors)
2801{
2802    Py_buffer buffer;
2803    PyObject *v;
2804
2805    if (obj == NULL) {
2806        PyErr_BadInternalCall();
2807        return NULL;
2808    }
2809
2810    /* Decoding bytes objects is the most common case and should be fast */
2811    if (PyBytes_Check(obj)) {
2812        if (PyBytes_GET_SIZE(obj) == 0) {
2813            Py_INCREF(unicode_empty);
2814            v = unicode_empty;
2815        }
2816        else {
2817            v = PyUnicode_Decode(
2818                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2819                    encoding, errors);
2820        }
2821        return v;
2822    }
2823
2824    if (PyUnicode_Check(obj)) {
2825        PyErr_SetString(PyExc_TypeError,
2826                        "decoding str is not supported");
2827        return NULL;
2828    }
2829
2830    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2831    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2832        PyErr_Format(PyExc_TypeError,
2833                     "coercing to str: need bytes, bytearray "
2834                     "or buffer-like object, %.80s found",
2835                     Py_TYPE(obj)->tp_name);
2836        return NULL;
2837    }
2838
2839    if (buffer.len == 0) {
2840        Py_INCREF(unicode_empty);
2841        v = unicode_empty;
2842    }
2843    else
2844        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2845
2846    PyBuffer_Release(&buffer);
2847    return v;
2848}
2849
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   become lower case and '_' becomes '-', so that e.g. UTF_8 is recognized.
   A NULL encoding is normalized to "utf-8".

   Return 1 on success, 0 if the encoding does not fit (it is longer than
   lower_len-1 characters). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *limit;

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    dst = lower;
    /* Last usable slot; it is reserved for the terminating null. */
    limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char c;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            c = Py_TOLOWER(*src);
        else if (*src == '_')
            c = '-';
        else
            c = *src;

        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
2886
2887PyObject *
2888PyUnicode_Decode(const char *s,
2889                 Py_ssize_t size,
2890                 const char *encoding,
2891                 const char *errors)
2892{
2893    PyObject *buffer = NULL, *unicode;
2894    Py_buffer info;
2895    char lower[11];  /* Enough for any encoding shortcut */
2896
2897    /* Shortcuts for common default encodings */
2898    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2899        if ((strcmp(lower, "utf-8") == 0) ||
2900            (strcmp(lower, "utf8") == 0))
2901            return PyUnicode_DecodeUTF8(s, size, errors);
2902        else if ((strcmp(lower, "latin-1") == 0) ||
2903                 (strcmp(lower, "latin1") == 0) ||
2904                 (strcmp(lower, "iso-8859-1") == 0))
2905            return PyUnicode_DecodeLatin1(s, size, errors);
2906#ifdef HAVE_MBCS
2907        else if (strcmp(lower, "mbcs") == 0)
2908            return PyUnicode_DecodeMBCS(s, size, errors);
2909#endif
2910        else if (strcmp(lower, "ascii") == 0)
2911            return PyUnicode_DecodeASCII(s, size, errors);
2912        else if (strcmp(lower, "utf-16") == 0)
2913            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2914        else if (strcmp(lower, "utf-32") == 0)
2915            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2916    }
2917
2918    /* Decode via the codec registry */
2919    buffer = NULL;
2920    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2921        goto onError;
2922    buffer = PyMemoryView_FromBuffer(&info);
2923    if (buffer == NULL)
2924        goto onError;
2925    unicode = PyCodec_Decode(buffer, encoding, errors);
2926    if (unicode == NULL)
2927        goto onError;
2928    if (!PyUnicode_Check(unicode)) {
2929        PyErr_Format(PyExc_TypeError,
2930                     "decoder did not return a str object (type=%.400s)",
2931                     Py_TYPE(unicode)->tp_name);
2932        Py_DECREF(unicode);
2933        goto onError;
2934    }
2935    Py_DECREF(buffer);
2936#ifndef DONT_MAKE_RESULT_READY
2937    if (_PyUnicode_READY_REPLACE(&unicode)) {
2938        Py_DECREF(unicode);
2939        return NULL;
2940    }
2941#endif
2942    assert(_PyUnicode_CheckConsistency(unicode, 1));
2943    return unicode;
2944
2945  onError:
2946    Py_XDECREF(buffer);
2947    return NULL;
2948}
2949
2950PyObject *
2951PyUnicode_AsDecodedObject(PyObject *unicode,
2952                          const char *encoding,
2953                          const char *errors)
2954{
2955    PyObject *v;
2956
2957    if (!PyUnicode_Check(unicode)) {
2958        PyErr_BadArgument();
2959        goto onError;
2960    }
2961
2962    if (encoding == NULL)
2963        encoding = PyUnicode_GetDefaultEncoding();
2964
2965    /* Decode via the codec registry */
2966    v = PyCodec_Decode(unicode, encoding, errors);
2967    if (v == NULL)
2968        goto onError;
2969    assert(_PyUnicode_CheckConsistency(v, 1));
2970    return v;
2971
2972  onError:
2973    return NULL;
2974}
2975
2976PyObject *
2977PyUnicode_AsDecodedUnicode(PyObject *unicode,
2978                           const char *encoding,
2979                           const char *errors)
2980{
2981    PyObject *v;
2982
2983    if (!PyUnicode_Check(unicode)) {
2984        PyErr_BadArgument();
2985        goto onError;
2986    }
2987
2988    if (encoding == NULL)
2989        encoding = PyUnicode_GetDefaultEncoding();
2990
2991    /* Decode via the codec registry */
2992    v = PyCodec_Decode(unicode, encoding, errors);
2993    if (v == NULL)
2994        goto onError;
2995    if (!PyUnicode_Check(v)) {
2996        PyErr_Format(PyExc_TypeError,
2997                     "decoder did not return a str object (type=%.400s)",
2998                     Py_TYPE(v)->tp_name);
2999        Py_DECREF(v);
3000        goto onError;
3001    }
3002    assert(_PyUnicode_CheckConsistency(v, 1));
3003    return v;
3004
3005  onError:
3006    return NULL;
3007}
3008
3009PyObject *
3010PyUnicode_Encode(const Py_UNICODE *s,
3011                 Py_ssize_t size,
3012                 const char *encoding,
3013                 const char *errors)
3014{
3015    PyObject *v, *unicode;
3016
3017    unicode = PyUnicode_FromUnicode(s, size);
3018    if (unicode == NULL)
3019        return NULL;
3020    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3021    Py_DECREF(unicode);
3022    return v;
3023}
3024
3025PyObject *
3026PyUnicode_AsEncodedObject(PyObject *unicode,
3027                          const char *encoding,
3028                          const char *errors)
3029{
3030    PyObject *v;
3031
3032    if (!PyUnicode_Check(unicode)) {
3033        PyErr_BadArgument();
3034        goto onError;
3035    }
3036
3037    if (encoding == NULL)
3038        encoding = PyUnicode_GetDefaultEncoding();
3039
3040    /* Encode via the codec registry */
3041    v = PyCodec_Encode(unicode, encoding, errors);
3042    if (v == NULL)
3043        goto onError;
3044    return v;
3045
3046  onError:
3047    return NULL;
3048}
3049
3050PyObject *
3051PyUnicode_EncodeFSDefault(PyObject *unicode)
3052{
3053#ifdef HAVE_MBCS
3054    const Py_UNICODE *wstr;
3055    Py_ssize_t wlen;
3056
3057    wstr = PyUnicode_AsUnicodeAndSize(unicode, &wlen);
3058    if (wstr == NULL)
3059        return NULL;
3060    return PyUnicode_EncodeMBCS(wstr, wlen, NULL);
3061#elif defined(__APPLE__)
3062    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3063#else
3064    PyInterpreterState *interp = PyThreadState_GET()->interp;
3065    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3066       cannot use it to encode and decode filenames before it is loaded. Load
3067       the Python codec requires to encode at least its own filename. Use the C
3068       version of the locale codec until the codec registry is initialized and
3069       the Python codec is loaded.
3070
3071       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3072       cannot only rely on it: check also interp->fscodec_initialized for
3073       subinterpreters. */
3074    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3075        return PyUnicode_AsEncodedString(unicode,
3076                                         Py_FileSystemDefaultEncoding,
3077                                         "surrogateescape");
3078    }
3079    else {
3080        /* locale encoding with surrogateescape */
3081        wchar_t *wchar;
3082        char *bytes;
3083        PyObject *bytes_obj;
3084        size_t error_pos;
3085
3086        wchar = PyUnicode_AsWideCharString(unicode, NULL);
3087        if (wchar == NULL)
3088            return NULL;
3089        bytes = _Py_wchar2char(wchar, &error_pos);
3090        if (bytes == NULL) {
3091            if (error_pos != (size_t)-1) {
3092                char *errmsg = strerror(errno);
3093                PyObject *exc = NULL;
3094                if (errmsg == NULL)
3095                    errmsg = "Py_wchar2char() failed";
3096                raise_encode_exception(&exc,
3097                    "filesystemencoding", unicode,
3098                    error_pos, error_pos+1,
3099                    errmsg);
3100                Py_XDECREF(exc);
3101            }
3102            else
3103                PyErr_NoMemory();
3104            PyMem_Free(wchar);
3105            return NULL;
3106        }
3107        PyMem_Free(wchar);
3108
3109        bytes_obj = PyBytes_FromString(bytes);
3110        PyMem_Free(bytes);
3111        return bytes_obj;
3112    }
3113#endif
3114}
3115
3116PyObject *
3117PyUnicode_AsEncodedString(PyObject *unicode,
3118                          const char *encoding,
3119                          const char *errors)
3120{
3121    PyObject *v;
3122    char lower[11];  /* Enough for any encoding shortcut */
3123
3124    if (!PyUnicode_Check(unicode)) {
3125        PyErr_BadArgument();
3126        return NULL;
3127    }
3128
3129    /* Shortcuts for common default encodings */
3130    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3131        if ((strcmp(lower, "utf-8") == 0) ||
3132            (strcmp(lower, "utf8") == 0))
3133        {
3134            if (errors == NULL || strcmp(errors, "strict") == 0)
3135                return _PyUnicode_AsUTF8String(unicode, NULL);
3136            else
3137                return _PyUnicode_AsUTF8String(unicode, errors);
3138        }
3139        else if ((strcmp(lower, "latin-1") == 0) ||
3140                 (strcmp(lower, "latin1") == 0) ||
3141                 (strcmp(lower, "iso-8859-1") == 0))
3142            return _PyUnicode_AsLatin1String(unicode, errors);
3143#ifdef HAVE_MBCS
3144        else if (strcmp(lower, "mbcs") == 0) {
3145            const Py_UNICODE *wstr;
3146            Py_ssize_t wlen;
3147
3148            wstr = PyUnicode_AsUnicodeAndSize(unicode, &wlen);
3149            if (wstr == NULL)
3150                return NULL;
3151            return PyUnicode_EncodeMBCS(wstr, wlen, errors);
3152        }
3153#endif
3154        else if (strcmp(lower, "ascii") == 0)
3155            return _PyUnicode_AsASCIIString(unicode, errors);
3156    }
3157
3158    /* Encode via the codec registry */
3159    v = PyCodec_Encode(unicode, encoding, errors);
3160    if (v == NULL)
3161        return NULL;
3162
3163    /* The normal path */
3164    if (PyBytes_Check(v))
3165        return v;
3166
3167    /* If the codec returns a buffer, raise a warning and convert to bytes */
3168    if (PyByteArray_Check(v)) {
3169        int error;
3170        PyObject *b;
3171
3172        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3173            "encoder %s returned bytearray instead of bytes",
3174            encoding);
3175        if (error) {
3176            Py_DECREF(v);
3177            return NULL;
3178        }
3179
3180        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3181        Py_DECREF(v);
3182        return b;
3183    }
3184
3185    PyErr_Format(PyExc_TypeError,
3186                 "encoder did not return a bytes object (type=%.400s)",
3187                 Py_TYPE(v)->tp_name);
3188    Py_DECREF(v);
3189    return NULL;
3190}
3191
3192PyObject *
3193PyUnicode_AsEncodedUnicode(PyObject *unicode,
3194                           const char *encoding,
3195                           const char *errors)
3196{
3197    PyObject *v;
3198
3199    if (!PyUnicode_Check(unicode)) {
3200        PyErr_BadArgument();
3201        goto onError;
3202    }
3203
3204    if (encoding == NULL)
3205        encoding = PyUnicode_GetDefaultEncoding();
3206
3207    /* Encode via the codec registry */
3208    v = PyCodec_Encode(unicode, encoding, errors);
3209    if (v == NULL)
3210        goto onError;
3211    if (!PyUnicode_Check(v)) {
3212        PyErr_Format(PyExc_TypeError,
3213                     "encoder did not return an str object (type=%.400s)",
3214                     Py_TYPE(v)->tp_name);
3215        Py_DECREF(v);
3216        goto onError;
3217    }
3218    return v;
3219
3220  onError:
3221    return NULL;
3222}
3223
3224PyObject*
3225PyUnicode_DecodeFSDefault(const char *s) {
3226    Py_ssize_t size = (Py_ssize_t)strlen(s);
3227    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3228}
3229
3230PyObject*
3231PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3232{
3233#ifdef HAVE_MBCS
3234    return PyUnicode_DecodeMBCS(s, size, NULL);
3235#elif defined(__APPLE__)
3236    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3237#else
3238    PyInterpreterState *interp = PyThreadState_GET()->interp;
3239    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3240       cannot use it to encode and decode filenames before it is loaded. Load
3241       the Python codec requires to encode at least its own filename. Use the C
3242       version of the locale codec until the codec registry is initialized and
3243       the Python codec is loaded.
3244
3245       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3246       cannot only rely on it: check also interp->fscodec_initialized for
3247       subinterpreters. */
3248    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3249        return PyUnicode_Decode(s, size,
3250                                Py_FileSystemDefaultEncoding,
3251                                "surrogateescape");
3252    }
3253    else {
3254        /* locale encoding with surrogateescape */
3255        wchar_t *wchar;
3256        PyObject *unicode;
3257        size_t len;
3258
3259        if (s[size] != '\0' || size != strlen(s)) {
3260            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3261            return NULL;
3262        }
3263
3264        wchar = _Py_char2wchar(s, &len);
3265        if (wchar == NULL)
3266            return PyErr_NoMemory();
3267
3268        unicode = PyUnicode_FromWideChar(wchar, len);
3269        PyMem_Free(wchar);
3270        return unicode;
3271    }
3272#endif
3273}
3274
3275
3276int
3277PyUnicode_FSConverter(PyObject* arg, void* addr)
3278{
3279    PyObject *output = NULL;
3280    Py_ssize_t size;
3281    void *data;
3282    if (arg == NULL) {
3283        Py_DECREF(*(PyObject**)addr);
3284        return 1;
3285    }
3286    if (PyBytes_Check(arg)) {
3287        output = arg;
3288        Py_INCREF(output);
3289    }
3290    else {
3291        arg = PyUnicode_FromObject(arg);
3292        if (!arg)
3293            return 0;
3294        output = PyUnicode_EncodeFSDefault(arg);
3295        Py_DECREF(arg);
3296        if (!output)
3297            return 0;
3298        if (!PyBytes_Check(output)) {
3299            Py_DECREF(output);
3300            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3301            return 0;
3302        }
3303    }
3304    size = PyBytes_GET_SIZE(output);
3305    data = PyBytes_AS_STRING(output);
3306    if (size != strlen(data)) {
3307        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3308        Py_DECREF(output);
3309        return 0;
3310    }
3311    *(PyObject**)addr = output;
3312    return Py_CLEANUP_SUPPORTED;
3313}
3314
3315
3316int
3317PyUnicode_FSDecoder(PyObject* arg, void* addr)
3318{
3319    PyObject *output = NULL;
3320    if (arg == NULL) {
3321        Py_DECREF(*(PyObject**)addr);
3322        return 1;
3323    }
3324    if (PyUnicode_Check(arg)) {
3325        if (PyUnicode_READY(arg))
3326            return 0;
3327        output = arg;
3328        Py_INCREF(output);
3329    }
3330    else {
3331        arg = PyBytes_FromObject(arg);
3332        if (!arg)
3333            return 0;
3334        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3335                                                  PyBytes_GET_SIZE(arg));
3336        Py_DECREF(arg);
3337        if (!output)
3338            return 0;
3339        if (!PyUnicode_Check(output)) {
3340            Py_DECREF(output);
3341            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3342            return 0;
3343        }
3344    }
3345    if (PyUnicode_READY(output) < 0) {
3346        Py_DECREF(output);
3347        return 0;
3348    }
3349    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3350                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3351        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3352        Py_DECREF(output);
3353        return 0;
3354    }
3355    *(PyObject**)addr = output;
3356    return Py_CLEANUP_SUPPORTED;
3357}
3358
3359
3360char*
3361PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3362{
3363    PyObject *bytes;
3364
3365    if (!PyUnicode_Check(unicode)) {
3366        PyErr_BadArgument();
3367        return NULL;
3368    }
3369    if (PyUnicode_READY(unicode) == -1)
3370        return NULL;
3371
3372    if (PyUnicode_UTF8(unicode) == NULL) {
3373        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3374        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3375        if (bytes == NULL)
3376            return NULL;
3377        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3378        if (_PyUnicode_UTF8(unicode) == NULL) {
3379            Py_DECREF(bytes);
3380            return NULL;
3381        }
3382        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3383        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3384                  PyBytes_AS_STRING(bytes),
3385                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3386        Py_DECREF(bytes);
3387    }
3388
3389    if (psize)
3390        *psize = PyUnicode_UTF8_LENGTH(unicode);
3391    return PyUnicode_UTF8(unicode);
3392}
3393
3394char*
3395PyUnicode_AsUTF8(PyObject *unicode)
3396{
3397    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3398}
3399
3400#ifdef Py_DEBUG
3401static int unicode_as_unicode_calls = 0;
3402#endif
3403
3404
3405Py_UNICODE *
3406PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3407{
3408    const unsigned char *one_byte;
3409#if SIZEOF_WCHAR_T == 4
3410    const Py_UCS2 *two_bytes;
3411#else
3412    const Py_UCS4 *four_bytes;
3413    const Py_UCS4 *ucs4_end;
3414    Py_ssize_t num_surrogates;
3415#endif
3416    wchar_t *w;
3417    wchar_t *wchar_end;
3418
3419    if (!PyUnicode_Check(unicode)) {
3420        PyErr_BadArgument();
3421        return NULL;
3422    }
3423    if (_PyUnicode_WSTR(unicode) == NULL) {
3424        /* Non-ASCII compact unicode object */
3425        assert(_PyUnicode_KIND(unicode) != 0);
3426        assert(PyUnicode_IS_READY(unicode));
3427
3428#ifdef Py_DEBUG
3429        ++unicode_as_unicode_calls;
3430#endif
3431
3432        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3433#if SIZEOF_WCHAR_T == 2
3434            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3435            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3436            num_surrogates = 0;
3437
3438            for (; four_bytes < ucs4_end; ++four_bytes) {
3439                if (*four_bytes > 0xFFFF)
3440                    ++num_surrogates;
3441            }
3442
3443            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3444                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3445            if (!_PyUnicode_WSTR(unicode)) {
3446                PyErr_NoMemory();
3447                return NULL;
3448            }
3449            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3450
3451            w = _PyUnicode_WSTR(unicode);
3452            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3453            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3454            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3455                if (*four_bytes > 0xFFFF) {
3456                    /* encode surrogate pair in this case */
3457                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3458                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3459                }
3460                else
3461                    *w = *four_bytes;
3462
3463                if (w > wchar_end) {
3464                    assert(0 && "Miscalculated string end");
3465                }
3466            }
3467            *w = 0;
3468#else
3469            /* sizeof(wchar_t) == 4 */
3470            Py_FatalError("Impossible unicode object state, wstr and str "
3471                          "should share memory already.");
3472            return NULL;
3473#endif
3474        }
3475        else {
3476            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3477                                                  (_PyUnicode_LENGTH(unicode) + 1));
3478            if (!_PyUnicode_WSTR(unicode)) {
3479                PyErr_NoMemory();
3480                return NULL;
3481            }
3482            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3483                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3484            w = _PyUnicode_WSTR(unicode);
3485            wchar_end = w + _PyUnicode_LENGTH(unicode);
3486
3487            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3488                one_byte = PyUnicode_1BYTE_DATA(unicode);
3489                for (; w < wchar_end; ++one_byte, ++w)
3490                    *w = *one_byte;
3491                /* null-terminate the wstr */
3492                *w = 0;
3493            }
3494            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3495#if SIZEOF_WCHAR_T == 4
3496                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3497                for (; w < wchar_end; ++two_bytes, ++w)
3498                    *w = *two_bytes;
3499                /* null-terminate the wstr */
3500                *w = 0;
3501#else
3502                /* sizeof(wchar_t) == 2 */
3503                PyObject_FREE(_PyUnicode_WSTR(unicode));
3504                _PyUnicode_WSTR(unicode) = NULL;
3505                Py_FatalError("Impossible unicode object state, wstr "
3506                              "and str should share memory already.");
3507                return NULL;
3508#endif
3509            }
3510            else {
3511                assert(0 && "This should never happen.");
3512            }
3513        }
3514    }
3515    if (size != NULL)
3516        *size = PyUnicode_WSTR_LENGTH(unicode);
3517    return _PyUnicode_WSTR(unicode);
3518}
3519
3520Py_UNICODE *
3521PyUnicode_AsUnicode(PyObject *unicode)
3522{
3523    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3524}
3525
3526
3527Py_ssize_t
3528PyUnicode_GetSize(PyObject *unicode)
3529{
3530    if (!PyUnicode_Check(unicode)) {
3531        PyErr_BadArgument();
3532        goto onError;
3533    }
3534    return PyUnicode_GET_SIZE(unicode);
3535
3536  onError:
3537    return -1;
3538}
3539
3540Py_ssize_t
3541PyUnicode_GetLength(PyObject *unicode)
3542{
3543    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3544        PyErr_BadArgument();
3545        return -1;
3546    }
3547
3548    return PyUnicode_GET_LENGTH(unicode);
3549}
3550
3551Py_UCS4
3552PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3553{
3554    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3555        PyErr_BadArgument();
3556        return (Py_UCS4)-1;
3557    }
3558    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3559        PyErr_SetString(PyExc_IndexError, "string index out of range");
3560        return (Py_UCS4)-1;
3561    }
3562    return PyUnicode_READ_CHAR(unicode, index);
3563}
3564
3565int
3566PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3567{
3568    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3569        PyErr_BadArgument();
3570        return -1;
3571    }
3572    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3573        PyErr_SetString(PyExc_IndexError, "string index out of range");
3574        return -1;
3575    }
3576    if (_PyUnicode_Dirty(unicode))
3577        return -1;
3578    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3579                    index, ch);
3580    return 0;
3581}
3582
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3588
3589/* create or adjust a UnicodeDecodeError */
3590static void
3591make_decode_exception(PyObject **exceptionObject,
3592                      const char *encoding,
3593                      const char *input, Py_ssize_t length,
3594                      Py_ssize_t startpos, Py_ssize_t endpos,
3595                      const char *reason)
3596{
3597    if (*exceptionObject == NULL) {
3598        *exceptionObject = PyUnicodeDecodeError_Create(
3599            encoding, input, length, startpos, endpos, reason);
3600    }
3601    else {
3602        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3603            goto onError;
3604        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3605            goto onError;
3606        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3607            goto onError;
3608    }
3609    return;
3610
3611onError:
3612    Py_DECREF(*exceptionObject);
3613    *exceptionObject = NULL;
3614}
3615
3616/* error handling callback helper:
3617   build arguments, call the callback and check the arguments,
3618   if no exception occurred, copy the replacement to the output
3619   and adjust various state variables.
3620   return 0 on success, -1 on error
3621*/
3622
3623static int
3624unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3625                                 const char *encoding, const char *reason,
3626                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3627                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3628                                 PyObject **output, Py_ssize_t *outpos)
3629{
3630    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3631
3632    PyObject *restuple = NULL;
3633    PyObject *repunicode = NULL;
3634    Py_ssize_t outsize;
3635    Py_ssize_t insize;
3636    Py_ssize_t requiredsize;
3637    Py_ssize_t newpos;
3638    PyObject *inputobj = NULL;
3639    int res = -1;
3640
3641    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3642        outsize = PyUnicode_GET_LENGTH(*output);
3643    else
3644        outsize = _PyUnicode_WSTR_LENGTH(*output);
3645
3646    if (*errorHandler == NULL) {
3647        *errorHandler = PyCodec_LookupError(errors);
3648        if (*errorHandler == NULL)
3649            goto onError;
3650    }
3651
3652    make_decode_exception(exceptionObject,
3653        encoding,
3654        *input, *inend - *input,
3655        *startinpos, *endinpos,
3656        reason);
3657    if (*exceptionObject == NULL)
3658        goto onError;
3659
3660    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3661    if (restuple == NULL)
3662        goto onError;
3663    if (!PyTuple_Check(restuple)) {
3664        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3665        goto onError;
3666    }
3667    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3668        goto onError;
3669    if (PyUnicode_READY(repunicode) < 0)
3670        goto onError;
3671
3672    /* Copy back the bytes variables, which might have been modified by the
3673       callback */
3674    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3675    if (!inputobj)
3676        goto onError;
3677    if (!PyBytes_Check(inputobj)) {
3678        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3679    }
3680    *input = PyBytes_AS_STRING(inputobj);
3681    insize = PyBytes_GET_SIZE(inputobj);
3682    *inend = *input + insize;
3683    /* we can DECREF safely, as the exception has another reference,
3684       so the object won't go away. */
3685    Py_DECREF(inputobj);
3686
3687    if (newpos<0)
3688        newpos = insize+newpos;
3689    if (newpos<0 || newpos>insize) {
3690        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3691        goto onError;
3692    }
3693
3694    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3695        /* need more space? (at least enough for what we
3696           have+the replacement+the rest of the string (starting
3697           at the new input position), so we won't have to check space
3698           when there are no errors in the rest of the string) */
3699        Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3700        requiredsize = *outpos + replen + insize-newpos;
3701        if (requiredsize > outsize) {
3702            if (requiredsize<2*outsize)
3703                requiredsize = 2*outsize;
3704            if (unicode_resize(output, requiredsize) < 0)
3705                goto onError;
3706        }
3707        if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
3708            goto onError;
3709        copy_characters(*output, *outpos, repunicode, 0, replen);
3710        *outpos += replen;
3711    }
3712    else {
3713        wchar_t *repwstr;
3714        Py_ssize_t repwlen;
3715        repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3716        if (repwstr == NULL)
3717            goto onError;
3718        /* need more space? (at least enough for what we
3719           have+the replacement+the rest of the string (starting
3720           at the new input position), so we won't have to check space
3721           when there are no errors in the rest of the string) */
3722        requiredsize = *outpos + repwlen + insize-newpos;
3723        if (requiredsize > outsize) {
3724            if (requiredsize < 2*outsize)
3725                requiredsize = 2*outsize;
3726            if (unicode_resize(output, requiredsize) < 0)
3727                goto onError;
3728        }
3729        wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3730        *outpos += repwlen;
3731    }
3732    *endinpos = newpos;
3733    *inptr = *input + newpos;
3734
3735    /* we made it! */
3736    res = 0;
3737
3738  onError:
3739    Py_XDECREF(restuple);
3740    return res;
3741}
3742
3743/* --- UTF-7 Codec -------------------------------------------------------- */
3744
3745/* See RFC2152 for details.  We encode conservatively and decode liberally. */
3746
/* Three simple macros defining base-64, plus the decoder's "direct"
   test.  All of them may evaluate their argument more than once, so
   only pass expressions without side effects. */

/* Does c belong to the base-64 alphabet (letters, digits, '+', '/')? */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ((c) >= '0' && (c) <= '9') ||              \
     ((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z'))

/* Value (0..63) of the base-64 character c.  The caller is expected to
   have checked IS_BASE64(c); any character that is not a letter, a
   digit or '+' yields 63, the value of '/'. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? 52 + ((c) - '0') :                    \
     ((c) >= 'a' && (c) <= 'z') ? 26 + ((c) - 'a') :                    \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)

/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: a byte met outside a shift sequence that decodes to
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
3777
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary
 * between applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is never indexed
 * with NUL or a non-ASCII code point.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
3823
3824PyObject *
3825PyUnicode_DecodeUTF7(const char *s,
3826                     Py_ssize_t size,
3827                     const char *errors)
3828{
3829    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3830}
3831
3832/* The decoder.  The only state we preserve is our read position,
3833 * i.e. how many characters we have consumed.  So if we end in the
3834 * middle of a shift sequence we have to back off the read position
3835 * and the output to the beginning of the sequence, otherwise we lose
3836 * all the shift state (seen bits, number of bits seen, high
3837 * surrogate). */
3838
3839PyObject *
3840PyUnicode_DecodeUTF7Stateful(const char *s,
3841                             Py_ssize_t size,
3842                             const char *errors,
3843                             Py_ssize_t *consumed)
3844{
3845    const char *starts = s;
3846    Py_ssize_t startinpos;
3847    Py_ssize_t endinpos;
3848    Py_ssize_t outpos;
3849    const char *e;
3850    PyObject *unicode;
3851    const char *errmsg = "";
3852    int inShift = 0;
3853    Py_ssize_t shiftOutStart;
3854    unsigned int base64bits = 0;
3855    unsigned long base64buffer = 0;
3856    Py_UCS4 surrogate = 0;
3857    PyObject *errorHandler = NULL;
3858    PyObject *exc = NULL;
3859
3860    /* Start off assuming it's all ASCII. Widen later as necessary. */
3861    unicode = PyUnicode_New(size, 127);
3862    if (!unicode)
3863        return NULL;
3864    if (size == 0) {
3865        if (consumed)
3866            *consumed = 0;
3867        return unicode;
3868    }
3869
3870    shiftOutStart = outpos = 0;
3871    e = s + size;
3872
3873    while (s < e) {
3874        Py_UCS4 ch;
3875      restart:
3876        ch = (unsigned char) *s;
3877
3878        if (inShift) { /* in a base-64 section */
3879            if (IS_BASE64(ch)) { /* consume a base-64 character */
3880                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3881                base64bits += 6;
3882                s++;
3883                if (base64bits >= 16) {
3884                    /* we have enough bits for a UTF-16 value */
3885                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
3886                    base64bits -= 16;
3887                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3888                    if (surrogate) {
3889                        /* expecting a second surrogate */
3890                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3891                            Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3892                                           | (outCh & 0x3FF)) + 0x10000;
3893                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3894                                goto onError;
3895                            surrogate = 0;
3896                            continue;
3897                        }
3898                        else {
3899                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3900                                goto onError;
3901                            surrogate = 0;
3902                        }
3903                    }
3904                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3905                        /* first surrogate */
3906                        surrogate = outCh;
3907                    }
3908                    else {
3909                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3910                            goto onError;
3911                    }
3912                }
3913            }
3914            else { /* now leaving a base-64 section */
3915                inShift = 0;
3916                s++;
3917                if (surrogate) {
3918                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3919                        goto onError;
3920                    surrogate = 0;
3921                }
3922                if (base64bits > 0) { /* left-over bits */
3923                    if (base64bits >= 6) {
3924                        /* We've seen at least one base-64 character */
3925                        errmsg = "partial character in shift sequence";
3926                        goto utf7Error;
3927                    }
3928                    else {
3929                        /* Some bits remain; they should be zero */
3930                        if (base64buffer != 0) {
3931                            errmsg = "non-zero padding bits in shift sequence";
3932                            goto utf7Error;
3933                        }
3934                    }
3935                }
3936                if (ch != '-') {
3937                    /* '-' is absorbed; other terminating
3938                       characters are preserved */
3939                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
3940                        goto onError;
3941                }
3942            }
3943        }
3944        else if ( ch == '+' ) {
3945            startinpos = s-starts;
3946            s++; /* consume '+' */
3947            if (s < e && *s == '-') { /* '+-' encodes '+' */
3948                s++;
3949                if (unicode_putchar(&unicode, &outpos, '+') < 0)
3950                    goto onError;
3951            }
3952            else { /* begin base64-encoded section */
3953                inShift = 1;
3954                shiftOutStart = outpos;
3955                base64bits = 0;
3956            }
3957        }
3958        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
3959            if (unicode_putchar(&unicode, &outpos, ch) < 0)
3960                goto onError;
3961            s++;
3962        }
3963        else {
3964            startinpos = s-starts;
3965            s++;
3966            errmsg = "unexpected special character";
3967            goto utf7Error;
3968        }
3969        continue;
3970utf7Error:
3971        endinpos = s-starts;
3972        if (unicode_decode_call_errorhandler(
3973                errors, &errorHandler,
3974                "utf7", errmsg,
3975                &starts, &e, &startinpos, &endinpos, &exc, &s,
3976                &unicode, &outpos))
3977            goto onError;
3978    }
3979
3980    /* end of string */
3981
3982    if (inShift && !consumed) { /* in shift sequence, no more to follow */
3983        /* if we're in an inconsistent state, that's an error */
3984        if (surrogate ||
3985                (base64bits >= 6) ||
3986                (base64bits > 0 && base64buffer != 0)) {
3987            endinpos = size;
3988            if (unicode_decode_call_errorhandler(
3989                    errors, &errorHandler,
3990                    "utf7", "unterminated shift sequence",
3991                    &starts, &e, &startinpos, &endinpos, &exc, &s,
3992                    &unicode, &outpos))
3993                goto onError;
3994            if (s < e)
3995                goto restart;
3996        }
3997    }
3998
3999    /* return state */
4000    if (consumed) {
4001        if (inShift) {
4002            outpos = shiftOutStart; /* back off output */
4003            *consumed = startinpos;
4004        }
4005        else {
4006            *consumed = s-starts;
4007        }
4008    }
4009
4010    if (unicode_resize(&unicode, outpos) < 0)
4011        goto onError;
4012
4013    Py_XDECREF(errorHandler);
4014    Py_XDECREF(exc);
4015#ifndef DONT_MAKE_RESULT_READY
4016    if (_PyUnicode_READY_REPLACE(&unicode)) {
4017        Py_DECREF(unicode);
4018        return NULL;
4019    }
4020#endif
4021    assert(_PyUnicode_CheckConsistency(unicode, 1));
4022    return unicode;
4023
4024  onError:
4025    Py_XDECREF(errorHandler);
4026    Py_XDECREF(exc);
4027    Py_DECREF(unicode);
4028    return NULL;
4029}
4030
4031
4032PyObject *
4033_PyUnicode_EncodeUTF7(PyObject *str,
4034                      int base64SetO,
4035                      int base64WhiteSpace,
4036                      const char *errors)
4037{
4038    int kind;
4039    void *data;
4040    Py_ssize_t len;
4041    PyObject *v;
4042    Py_ssize_t allocated;
4043    int inShift = 0;
4044    Py_ssize_t i;
4045    unsigned int base64bits = 0;
4046    unsigned long base64buffer = 0;
4047    char * out;
4048    char * start;
4049
4050    if (PyUnicode_READY(str) < 0)
4051        return NULL;
4052    kind = PyUnicode_KIND(str);
4053    data = PyUnicode_DATA(str);
4054    len = PyUnicode_GET_LENGTH(str);
4055
4056    if (len == 0)
4057        return PyBytes_FromStringAndSize(NULL, 0);
4058
4059    /* It might be possible to tighten this worst case */
4060    allocated = 8 * len;
4061    if (allocated / 8 != len)
4062        return PyErr_NoMemory();
4063
4064    v = PyBytes_FromStringAndSize(NULL, allocated);
4065    if (v == NULL)
4066        return NULL;
4067
4068    start = out = PyBytes_AS_STRING(v);
4069    for (i = 0; i < len; ++i) {
4070        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4071
4072        if (inShift) {
4073            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4074                /* shifting out */
4075                if (base64bits) { /* output remaining bits */
4076                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4077                    base64buffer = 0;
4078                    base64bits = 0;
4079                }
4080                inShift = 0;
4081                /* Characters not in the BASE64 set implicitly unshift the sequence
4082                   so no '-' is required, except if the character is itself a '-' */
4083                if (IS_BASE64(ch) || ch == '-') {
4084                    *out++ = '-';
4085                }
4086                *out++ = (char) ch;
4087            }
4088            else {
4089                goto encode_char;
4090            }
4091        }
4092        else { /* not in a shift sequence */
4093            if (ch == '+') {
4094                *out++ = '+';
4095                        *out++ = '-';
4096            }
4097            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4098                *out++ = (char) ch;
4099            }
4100            else {
4101                *out++ = '+';
4102                inShift = 1;
4103                goto encode_char;
4104            }
4105        }
4106        continue;
4107encode_char:
4108        if (ch >= 0x10000) {
4109            /* code first surrogate */
4110            base64bits += 16;
4111            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4112            while (base64bits >= 6) {
4113                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4114                base64bits -= 6;
4115            }
4116            /* prepare second surrogate */
4117            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
4118        }
4119        base64bits += 16;
4120        base64buffer = (base64buffer << 16) | ch;
4121        while (base64bits >= 6) {
4122            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4123            base64bits -= 6;
4124        }
4125    }
4126    if (base64bits)
4127        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4128    if (inShift)
4129        *out++ = '-';
4130    if (_PyBytes_Resize(&v, out - start) < 0)
4131        return NULL;
4132    return v;
4133}
4134PyObject *
4135PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4136                     Py_ssize_t size,
4137                     int base64SetO,
4138                     int base64WhiteSpace,
4139                     const char *errors)
4140{
4141    PyObject *result;
4142    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4143    if (tmp == NULL)
4144        return NULL;
4145    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4146                                   base64WhiteSpace, errors);
4147    Py_DECREF(tmp);
4148    return result;
4149}
4150
4151#undef IS_BASE64
4152#undef FROM_BASE64
4153#undef TO_BASE64
4154#undef DECODE_DIRECT
4155#undef ENCODE_DIRECT
4156
4157/* --- UTF-8 Codec -------------------------------------------------------- */
4158
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.  The table is only
       ever read, hence const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4180
4181PyObject *
4182PyUnicode_DecodeUTF8(const char *s,
4183                     Py_ssize_t size,
4184                     const char *errors)
4185{
4186    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4187}
4188
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used (e.g. under sizeof). */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4191
4192/* Mask to quickly check whether a C 'long' contains a
4193   non-ASCII, UTF8-encoded char. */
4194#if (SIZEOF_LONG == 8)
4195# define ASCII_CHAR_MASK 0x8080808080808080L
4196#elif (SIZEOF_LONG == 4)
4197# define ASCII_CHAR_MASK 0x80808080L
4198#else
4199# error C 'long' size should be either 4 or 8!
4200#endif
4201
4202/* Scans a UTF-8 string and returns the maximum character to be expected,
4203   the size of the decoded unicode string and if any major errors were
4204   encountered.
4205
4206   This function does check basic UTF-8 sanity, it does however NOT CHECK
4207   if the string contains surrogates, and if all continuation bytes are
4208   within the correct ranges, these checks are performed in
4209   PyUnicode_DecodeUTF8Stateful.
4210
4211   If it sets has_errors to 1, it means the value of unicode_size and max_char
4212   will be bogus and you should not rely on useful information in them.
4213   */
4214static Py_UCS4
4215utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4216                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4217                                  int *has_errors)
4218{
4219    Py_ssize_t n;
4220    Py_ssize_t char_count = 0;
4221    Py_UCS4 max_char = 127, new_max;
4222    Py_UCS4 upper_bound;
4223    const unsigned char *p = (const unsigned char *)s;
4224    const unsigned char *end = p + string_size;
4225    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4226    int err = 0;
4227
4228    for (; p < end && !err; ++p, ++char_count) {
4229        /* Only check value if it's not a ASCII char... */
4230        if (*p < 0x80) {
4231            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4232               an explanation. */
4233            if (!((size_t) p & LONG_PTR_MASK)) {
4234                /* Help register allocation */
4235                register const unsigned char *_p = p;
4236                while (_p < aligned_end) {
4237                    unsigned long value = *(unsigned long *) _p;
4238                    if (value & ASCII_CHAR_MASK)
4239                        break;
4240                    _p += SIZEOF_LONG;
4241                    char_count += SIZEOF_LONG;
4242                }
4243                p = _p;
4244                if (p == end)
4245                    break;
4246            }
4247        }
4248        if (*p >= 0x80) {
4249            n = utf8_code_length[*p];
4250            new_max = max_char;
4251            switch (n) {
4252            /* invalid start byte */
4253            case 0:
4254                err = 1;
4255                break;
4256            case 2:
4257                /* Code points between 0x00FF and 0x07FF inclusive.
4258                   Approximate the upper bound of the code point,
4259                   if this flips over 255 we can be sure it will be more
4260                   than 255 and the string will need 2 bytes per code coint,
4261                   if it stays under or equal to 255, we can be sure 1 byte
4262                   is enough.
4263                   ((*p & 0b00011111) << 6) | 0b00111111 */
4264                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4265                if (max_char < upper_bound)
4266                    new_max = upper_bound;
4267                /* Ensure we track at least that we left ASCII space. */
4268                if (new_max < 128)
4269                    new_max = 128;
4270                break;
4271            case 3:
4272                /* Between 0x0FFF and 0xFFFF inclusive, so values are
4273                   always > 255 and <= 65535 and will always need 2 bytes. */
4274                if (max_char < 65535)
4275                    new_max = 65535;
4276                break;
4277            case 4:
4278                /* Code point will be above 0xFFFF for sure in this case. */
4279                new_max = 65537;
4280                break;
4281            /* Internal error, this should be caught by the first if */
4282            case 1:
4283            default:
4284                assert(0 && "Impossible case in utf8_max_char_and_size");
4285                err = 1;
4286            }
4287            /* Instead of number of overall bytes for this code point,
4288               n contains the number of following bytes: */
4289            --n;
4290            /* Check if the follow up chars are all valid continuation bytes */
4291            if (n >= 1) {
4292                const unsigned char *cont;
4293                if ((p + n) >= end) {
4294                    if (consumed == 0)
4295                        /* incomplete data, non-incremental decoding */
4296                        err = 1;
4297                    break;
4298                }
4299                for (cont = p + 1; cont <= (p + n); ++cont) {
4300                    if ((*cont & 0xc0) != 0x80) {
4301                        err = 1;
4302                        break;
4303                    }
4304                }
4305                p += n;
4306            }
4307            else
4308                err = 1;
4309            max_char = new_max;
4310        }
4311    }
4312
4313    if (unicode_size)
4314        *unicode_size = char_count;
4315    if (has_errors)
4316        *has_errors = err;
4317    return max_char;
4318}
4319
/* Store value at position index of the string under construction.
   Relies on names from the expansion site: unicode, kind, data,
   has_errors and the onError label.

   Without errors this is a plain PyUnicode_WRITE.  With errors the
   string may first be resized (when index lies beyond its current
   length) and is then written through unicode_putchar(); either step
   jumps to onError on failure.  The resize over-allocates, so the
   result has to be shrunk at the end.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (!has_errors) {                                              \
            PyUnicode_WRITE(kind, data, index, value);                  \
        }                                                               \
        else {                                                          \
            Py_ssize_t wmf_pos = index;                                 \
            if (wmf_pos > PyUnicode_GET_LENGTH(unicode)) {              \
                if (unicode_resize(&unicode,                            \
                                   wmf_pos + wmf_pos/8) < 0)            \
                    goto onError;                                       \
            }                                                           \
            if (unicode_putchar(&unicode, &wmf_pos, value) < 0)         \
                goto onError;                                           \
        }                                                               \
    } while (0)
4338
4339PyObject *
4340PyUnicode_DecodeUTF8Stateful(const char *s,
4341                             Py_ssize_t size,
4342                             const char *errors,
4343                             Py_ssize_t *consumed)
4344{
4345    const char *starts = s;
4346    int n;
4347    int k;
4348    Py_ssize_t startinpos;
4349    Py_ssize_t endinpos;
4350    const char *e, *aligned_end;
4351    PyObject *unicode;
4352    const char *errmsg = "";
4353    PyObject *errorHandler = NULL;
4354    PyObject *exc = NULL;
4355    Py_UCS4 maxchar = 0;
4356    Py_ssize_t unicode_size;
4357    Py_ssize_t i;
4358    int kind;
4359    void *data;
4360    int has_errors;
4361
4362    if (size == 0) {
4363        if (consumed)
4364            *consumed = 0;
4365        return (PyObject *)PyUnicode_New(0, 0);
4366    }
4367    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4368                                                consumed, &has_errors);
4369    if (has_errors)
4370        /* maxchar and size computation might be incorrect;
4371           code below widens and resizes as necessary. */
4372        unicode = PyUnicode_New(size, 127);
4373    else
4374        unicode = PyUnicode_New(unicode_size, maxchar);
4375    if (!unicode)
4376        return NULL;
4377    /* When the string is ASCII only, just use memcpy and return.
4378       unicode_size may be != size if there is an incomplete UTF-8
4379       sequence at the end of the ASCII block.  */
4380    if (!has_errors && maxchar < 128 && size == unicode_size) {
4381        Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4382        return unicode;
4383    }
4384    kind = PyUnicode_KIND(unicode);
4385    data = PyUnicode_DATA(unicode);
4386    /* Unpack UTF-8 encoded data */
4387    i = 0;
4388    e = s + size;
4389    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4390
4391    while (s < e) {
4392        Py_UCS4 ch = (unsigned char)*s;
4393
4394        if (ch < 0x80) {
4395            /* Fast path for runs of ASCII characters. Given that common UTF-8
4396               input will consist of an overwhelming majority of ASCII
4397               characters, we try to optimize for this case by checking
4398               as many characters as a C 'long' can contain.
4399               First, check if we can do an aligned read, as most CPUs have
4400               a penalty for unaligned reads.
4401            */
4402            if (!((size_t) s & LONG_PTR_MASK)) {
4403                /* Help register allocation */
4404                register const char *_s = s;
4405                register Py_ssize_t _i = i;
4406                while (_s < aligned_end) {
4407                    /* Read a whole long at a time (either 4 or 8 bytes),
4408                       and do a fast unrolled copy if it only contains ASCII
4409                       characters. */
4410                    unsigned long value = *(unsigned long *) _s;
4411                    if (value & ASCII_CHAR_MASK)
4412                        break;
4413                    WRITE_MAYBE_FAIL(_i+0, _s[0]);
4414                    WRITE_MAYBE_FAIL(_i+1, _s[1]);
4415                    WRITE_MAYBE_FAIL(_i+2, _s[2]);
4416                    WRITE_MAYBE_FAIL(_i+3, _s[3]);
4417#if (SIZEOF_LONG == 8)
4418                    WRITE_MAYBE_FAIL(_i+4, _s[4]);
4419                    WRITE_MAYBE_FAIL(_i+5, _s[5]);
4420                    WRITE_MAYBE_FAIL(_i+6, _s[6]);
4421                    WRITE_MAYBE_FAIL(_i+7, _s[7]);
4422#endif
4423                    _s += SIZEOF_LONG;
4424                    _i += SIZEOF_LONG;
4425                }
4426                s = _s;
4427                i = _i;
4428                if (s == e)
4429                    break;
4430                ch = (unsigned char)*s;
4431            }
4432        }
4433
4434        if (ch < 0x80) {
4435            WRITE_MAYBE_FAIL(i++, ch);
4436            s++;
4437            continue;
4438        }
4439
4440        n = utf8_code_length[ch];
4441
4442        if (s + n > e) {
4443            if (consumed)
4444                break;
4445            else {
4446                errmsg = "unexpected end of data";
4447                startinpos = s-starts;
4448                endinpos = startinpos+1;
4449                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4450                    endinpos++;
4451                goto utf8Error;
4452            }
4453        }
4454
4455        switch (n) {
4456
4457        case 0:
4458            errmsg = "invalid start byte";
4459            startinpos = s-starts;
4460            endinpos = startinpos+1;
4461            goto utf8Error;
4462
4463        case 1:
4464            errmsg = "internal error";
4465            startinpos = s-starts;
4466            endinpos = startinpos+1;
4467            goto utf8Error;
4468
4469        case 2:
4470            if ((s[1] & 0xc0) != 0x80) {
4471                errmsg = "invalid continuation byte";
4472                startinpos = s-starts;
4473                endinpos = startinpos + 1;
4474                goto utf8Error;
4475            }
4476            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4477            assert ((ch > 0x007F) && (ch <= 0x07FF));
4478            WRITE_MAYBE_FAIL(i++, ch);
4479            break;
4480
4481        case 3:
4482            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4483               will result in surrogates in range d800-dfff. Surrogates are
4484               not valid UTF-8 so they are rejected.
4485               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4486               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4487            if ((s[1] & 0xc0) != 0x80 ||
4488                (s[2] & 0xc0) != 0x80 ||
4489                ((unsigned char)s[0] == 0xE0 &&
4490                 (unsigned char)s[1] < 0xA0) ||
4491                ((unsigned char)s[0] == 0xED &&
4492                 (unsigned char)s[1] > 0x9F)) {
4493                errmsg = "invalid continuation byte";
4494                startinpos = s-starts;
4495                endinpos = startinpos + 1;
4496
4497                /* if s[1] first two bits are 1 and 0, then the invalid
4498                   continuation byte is s[2], so increment endinpos by 1,
4499                   if not, s[1] is invalid and endinpos doesn't need to
4500                   be incremented. */
4501                if ((s[1] & 0xC0) == 0x80)
4502                    endinpos++;
4503                goto utf8Error;
4504            }
4505            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4506            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4507            WRITE_MAYBE_FAIL(i++, ch);
4508            break;
4509
4510        case 4:
4511            if ((s[1] & 0xc0) != 0x80 ||
4512                (s[2] & 0xc0) != 0x80 ||
4513                (s[3] & 0xc0) != 0x80 ||
4514                ((unsigned char)s[0] == 0xF0 &&
4515                 (unsigned char)s[1] < 0x90) ||
4516                ((unsigned char)s[0] == 0xF4 &&
4517                 (unsigned char)s[1] > 0x8F)) {
4518                errmsg = "invalid continuation byte";
4519                startinpos = s-starts;
4520                endinpos = startinpos + 1;
4521                if ((s[1] & 0xC0) == 0x80) {
4522                    endinpos++;
4523                    if ((s[2] & 0xC0) == 0x80)
4524                        endinpos++;
4525                }
4526                goto utf8Error;
4527            }
4528            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4529                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4530            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4531
4532            WRITE_MAYBE_FAIL(i++, ch);
4533            break;
4534        }
4535        s += n;
4536        continue;
4537
4538      utf8Error:
4539        if (!has_errors) {
4540            PyObject *tmp;
4541            Py_ssize_t k;
4542            /* We encountered some error that wasn't detected in the original scan,
4543               e.g. an encoded surrogate character. The original maxchar computation may
4544               have been incorrect, so redo it now. */
4545            for (k = 0, maxchar = 0; k < i; k++)
4546                maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4547            tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
4548            if (tmp == NULL)
4549                goto onError;
4550            PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
4551            Py_DECREF(unicode);
4552            unicode = tmp;
4553            has_errors = 1;
4554        }
4555        if (unicode_decode_call_errorhandler(
4556                errors, &errorHandler,
4557                "utf8", errmsg,
4558                &starts, &e, &startinpos, &endinpos, &exc, &s,
4559                &unicode, &i))
4560            goto onError;
4561        /* Update data because unicode_decode_call_errorhandler might have
4562           re-created or resized the unicode object. */
4563        data = PyUnicode_DATA(unicode);
4564        kind = PyUnicode_KIND(unicode);
4565        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4566    }
4567    /* Ensure the unicode_size calculation above was correct: */
4568    assert(has_errors || i == unicode_size);
4569
4570    if (consumed)
4571        *consumed = s-starts;
4572
4573    /* Adjust length and ready string when it contained errors and
4574       is of the old resizable kind. */
4575    if (has_errors) {
4576        if (PyUnicode_Resize(&unicode, i) < 0)
4577            goto onError;
4578    }
4579
4580    Py_XDECREF(errorHandler);
4581    Py_XDECREF(exc);
4582    assert(_PyUnicode_CheckConsistency(unicode, 1));
4583    return unicode;
4584
4585  onError:
4586    Py_XDECREF(errorHandler);
4587    Py_XDECREF(exc);
4588    Py_DECREF(unicode);
4589    return NULL;
4590}
4591
4592#undef WRITE_MAYBE_FAIL
4593
4594#ifdef __APPLE__
4595
4596/* Simplified UTF-8 decoder using surrogateescape error handler,
4597   used to decode the command line arguments on Mac OS X. */
4598
4599wchar_t*
4600_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4601{
4602    int n;
4603    const char *e;
4604    wchar_t *unicode, *p;
4605
4606    /* Note: size will always be longer than the resulting Unicode
4607       character count */
4608    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4609        PyErr_NoMemory();
4610        return NULL;
4611    }
4612    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4613    if (!unicode)
4614        return NULL;
4615
4616    /* Unpack UTF-8 encoded data */
4617    p = unicode;
4618    e = s + size;
4619    while (s < e) {
4620        Py_UCS4 ch = (unsigned char)*s;
4621
4622        if (ch < 0x80) {
4623            *p++ = (wchar_t)ch;
4624            s++;
4625            continue;
4626        }
4627
4628        n = utf8_code_length[ch];
4629        if (s + n > e) {
4630            goto surrogateescape;
4631        }
4632
4633        switch (n) {
4634        case 0:
4635        case 1:
4636            goto surrogateescape;
4637
4638        case 2:
4639            if ((s[1] & 0xc0) != 0x80)
4640                goto surrogateescape;
4641            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4642            assert ((ch > 0x007F) && (ch <= 0x07FF));
4643            *p++ = (wchar_t)ch;
4644            break;
4645
4646        case 3:
4647            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4648               will result in surrogates in range d800-dfff. Surrogates are
4649               not valid UTF-8 so they are rejected.
4650               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4651               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4652            if ((s[1] & 0xc0) != 0x80 ||
4653                (s[2] & 0xc0) != 0x80 ||
4654                ((unsigned char)s[0] == 0xE0 &&
4655                 (unsigned char)s[1] < 0xA0) ||
4656                ((unsigned char)s[0] == 0xED &&
4657                 (unsigned char)s[1] > 0x9F)) {
4658
4659                goto surrogateescape;
4660            }
4661            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4662            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4663            *p++ = (wchar_t)ch;
4664            break;
4665
4666        case 4:
4667            if ((s[1] & 0xc0) != 0x80 ||
4668                (s[2] & 0xc0) != 0x80 ||
4669                (s[3] & 0xc0) != 0x80 ||
4670                ((unsigned char)s[0] == 0xF0 &&
4671                 (unsigned char)s[1] < 0x90) ||
4672                ((unsigned char)s[0] == 0xF4 &&
4673                 (unsigned char)s[1] > 0x8F)) {
4674                goto surrogateescape;
4675            }
4676            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4677                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4678            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4679
4680#if SIZEOF_WCHAR_T == 4
4681            *p++ = (wchar_t)ch;
4682#else
4683            /*  compute and append the two surrogates: */
4684
4685            /*  translate from 10000..10FFFF to 0..FFFF */
4686            ch -= 0x10000;
4687
4688            /*  high surrogate = top 10 bits added to D800 */
4689            *p++ = (wchar_t)(0xD800 + (ch >> 10));
4690
4691            /*  low surrogate = bottom 10 bits added to DC00 */
4692            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4693#endif
4694            break;
4695        }
4696        s += n;
4697        continue;
4698
4699      surrogateescape:
4700        *p++ = 0xDC00 + ch;
4701        s++;
4702    }
4703    *p = L'\0';
4704    return unicode;
4705}
4706
4707#endif /* __APPLE__ */
4708
4709/* Primary internal function which creates utf8 encoded bytes objects.
4710
4711   Allocation strategy:  if the string is short, convert into a stack buffer
4712   and allocate exactly as much space needed at the end.  Else allocate the
4713   maximum possible needed (4 result bytes per Unicode character), and return
4714   the excess memory at the end.
4715*/
4716PyObject *
4717_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4718{
4719#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
4720
4721    Py_ssize_t i;                /* index into s of next input byte */
4722    PyObject *result;            /* result string object */
4723    char *p;                     /* next free byte in output buffer */
4724    Py_ssize_t nallocated;      /* number of result bytes allocated */
4725    Py_ssize_t nneeded;            /* number of result bytes needed */
4726    char stackbuf[MAX_SHORT_UNICHARS * 4];
4727    PyObject *errorHandler = NULL;
4728    PyObject *exc = NULL;
4729    int kind;
4730    void *data;
4731    Py_ssize_t size;
4732    PyObject *rep = NULL;
4733
4734    if (!PyUnicode_Check(unicode)) {
4735        PyErr_BadArgument();
4736        return NULL;
4737    }
4738
4739    if (PyUnicode_READY(unicode) == -1)
4740        return NULL;
4741
4742    if (PyUnicode_UTF8(unicode))
4743        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4744                                         PyUnicode_UTF8_LENGTH(unicode));
4745
4746    kind = PyUnicode_KIND(unicode);
4747    data = PyUnicode_DATA(unicode);
4748    size = PyUnicode_GET_LENGTH(unicode);
4749
4750    assert(size >= 0);
4751
4752    if (size <= MAX_SHORT_UNICHARS) {
4753        /* Write into the stack buffer; nallocated can't overflow.
4754         * At the end, we'll allocate exactly as much heap space as it
4755         * turns out we need.
4756         */
4757        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4758        result = NULL;   /* will allocate after we're done */
4759        p = stackbuf;
4760    }
4761    else {
4762        /* Overallocate on the heap, and give the excess back at the end. */
4763        nallocated = size * 4;
4764        if (nallocated / 4 != size)  /* overflow! */
4765            return PyErr_NoMemory();
4766        result = PyBytes_FromStringAndSize(NULL, nallocated);
4767        if (result == NULL)
4768            return NULL;
4769        p = PyBytes_AS_STRING(result);
4770    }
4771
4772    for (i = 0; i < size;) {
4773        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4774
4775        if (ch < 0x80)
4776            /* Encode ASCII */
4777            *p++ = (char) ch;
4778
4779        else if (ch < 0x0800) {
4780            /* Encode Latin-1 */
4781            *p++ = (char)(0xc0 | (ch >> 6));
4782            *p++ = (char)(0x80 | (ch & 0x3f));
4783        } else if (0xD800 <= ch && ch <= 0xDFFF) {
4784            Py_ssize_t newpos;
4785            Py_ssize_t repsize, k, startpos;
4786            startpos = i-1;
4787            rep = unicode_encode_call_errorhandler(
4788                  errors, &errorHandler, "utf-8", "surrogates not allowed",
4789                  unicode, &exc, startpos, startpos+1, &newpos);
4790            if (!rep)
4791                goto error;
4792
4793            if (PyBytes_Check(rep))
4794                repsize = PyBytes_GET_SIZE(rep);
4795            else
4796                repsize = PyUnicode_GET_SIZE(rep);
4797
4798            if (repsize > 4) {
4799                Py_ssize_t offset;
4800
4801                if (result == NULL)
4802                    offset = p - stackbuf;
4803                else
4804                    offset = p - PyBytes_AS_STRING(result);
4805
4806                if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4807                    /* integer overflow */
4808                    PyErr_NoMemory();
4809                    goto error;
4810                }
4811                nallocated += repsize - 4;
4812                if (result != NULL) {
4813                    if (_PyBytes_Resize(&result, nallocated) < 0)
4814                        goto error;
4815                } else {
4816                    result = PyBytes_FromStringAndSize(NULL, nallocated);
4817                    if (result == NULL)
4818                        goto error;
4819                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4820                }
4821                p = PyBytes_AS_STRING(result) + offset;
4822            }
4823
4824            if (PyBytes_Check(rep)) {
4825                char *prep = PyBytes_AS_STRING(rep);
4826                for(k = repsize; k > 0; k--)
4827                    *p++ = *prep++;
4828            } else /* rep is unicode */ {
4829                enum PyUnicode_Kind repkind;
4830                void *repdata;
4831
4832                if (PyUnicode_READY(rep) < 0)
4833                    goto error;
4834                repkind = PyUnicode_KIND(rep);
4835                repdata = PyUnicode_DATA(rep);
4836
4837                for(k=0; k<repsize; k++) {
4838                    Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
4839                    if (0x80 <= c) {
4840                        raise_encode_exception(&exc, "utf-8",
4841                                               unicode,
4842                                               i-1, i,
4843                                               "surrogates not allowed");
4844                        goto error;
4845                    }
4846                    *p++ = (char)c;
4847                }
4848            }
4849            Py_CLEAR(rep);
4850        } else if (ch < 0x10000) {
4851            *p++ = (char)(0xe0 | (ch >> 12));
4852            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4853            *p++ = (char)(0x80 | (ch & 0x3f));
4854        } else /* ch >= 0x10000 */ {
4855            /* Encode UCS4 Unicode ordinals */
4856            *p++ = (char)(0xf0 | (ch >> 18));
4857            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4858            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4859            *p++ = (char)(0x80 | (ch & 0x3f));
4860        }
4861    }
4862
4863    if (result == NULL) {
4864        /* This was stack allocated. */
4865        nneeded = p - stackbuf;
4866        assert(nneeded <= nallocated);
4867        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4868    }
4869    else {
4870        /* Cut back to size actually needed. */
4871        nneeded = p - PyBytes_AS_STRING(result);
4872        assert(nneeded <= nallocated);
4873        _PyBytes_Resize(&result, nneeded);
4874    }
4875
4876    Py_XDECREF(errorHandler);
4877    Py_XDECREF(exc);
4878    return result;
4879 error:
4880    Py_XDECREF(rep);
4881    Py_XDECREF(errorHandler);
4882    Py_XDECREF(exc);
4883    Py_XDECREF(result);
4884    return NULL;
4885
4886#undef MAX_SHORT_UNICHARS
4887}
4888
4889PyObject *
4890PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4891                     Py_ssize_t size,
4892                     const char *errors)
4893{
4894    PyObject *v, *unicode;
4895
4896    unicode = PyUnicode_FromUnicode(s, size);
4897    if (unicode == NULL)
4898        return NULL;
4899    v = _PyUnicode_AsUTF8String(unicode, errors);
4900    Py_DECREF(unicode);
4901    return v;
4902}
4903
4904PyObject *
4905PyUnicode_AsUTF8String(PyObject *unicode)
4906{
4907    return _PyUnicode_AsUTF8String(unicode, NULL);
4908}
4909
4910/* --- UTF-32 Codec ------------------------------------------------------- */
4911
4912PyObject *
4913PyUnicode_DecodeUTF32(const char *s,
4914                      Py_ssize_t size,
4915                      const char *errors,
4916                      int *byteorder)
4917{
4918    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4919}
4920
4921PyObject *
4922PyUnicode_DecodeUTF32Stateful(const char *s,
4923                              Py_ssize_t size,
4924                              const char *errors,
4925                              int *byteorder,
4926                              Py_ssize_t *consumed)
4927{
4928    const char *starts = s;
4929    Py_ssize_t startinpos;
4930    Py_ssize_t endinpos;
4931    Py_ssize_t outpos;
4932    PyObject *unicode;
4933    const unsigned char *q, *e;
4934    int bo = 0;       /* assume native ordering by default */
4935    const char *errmsg = "";
4936    /* Offsets from q for retrieving bytes in the right order. */
4937#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4938    int iorder[] = {0, 1, 2, 3};
4939#else
4940    int iorder[] = {3, 2, 1, 0};
4941#endif
4942    PyObject *errorHandler = NULL;
4943    PyObject *exc = NULL;
4944
4945    q = (unsigned char *)s;
4946    e = q + size;
4947
4948    if (byteorder)
4949        bo = *byteorder;
4950
4951    /* Check for BOM marks (U+FEFF) in the input and adjust current
4952       byte order setting accordingly. In native mode, the leading BOM
4953       mark is skipped, in all other modes, it is copied to the output
4954       stream as-is (giving a ZWNBSP character). */
4955    if (bo == 0) {
4956        if (size >= 4) {
4957            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4958                (q[iorder[1]] << 8) | q[iorder[0]];
4959#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4960            if (bom == 0x0000FEFF) {
4961                q += 4;
4962                bo = -1;
4963            }
4964            else if (bom == 0xFFFE0000) {
4965                q += 4;
4966                bo = 1;
4967            }
4968#else
4969            if (bom == 0x0000FEFF) {
4970                q += 4;
4971                bo = 1;
4972            }
4973            else if (bom == 0xFFFE0000) {
4974                q += 4;
4975                bo = -1;
4976            }
4977#endif
4978        }
4979    }
4980
4981    if (bo == -1) {
4982        /* force LE */
4983        iorder[0] = 0;
4984        iorder[1] = 1;
4985        iorder[2] = 2;
4986        iorder[3] = 3;
4987    }
4988    else if (bo == 1) {
4989        /* force BE */
4990        iorder[0] = 3;
4991        iorder[1] = 2;
4992        iorder[2] = 1;
4993        iorder[3] = 0;
4994    }
4995
4996    /* This might be one to much, because of a BOM */
4997    unicode = PyUnicode_New((size+3)/4, 127);
4998    if (!unicode)
4999        return NULL;
5000    if (size == 0)
5001        return unicode;
5002    outpos = 0;
5003
5004    while (q < e) {
5005        Py_UCS4 ch;
5006        /* remaining bytes at the end? (size should be divisible by 4) */
5007        if (e-q<4) {
5008            if (consumed)
5009                break;
5010            errmsg = "truncated data";
5011            startinpos = ((const char *)q)-starts;
5012            endinpos = ((const char *)e)-starts;
5013            goto utf32Error;
5014            /* The remaining input chars are ignored if the callback
5015               chooses to skip the input */
5016        }
5017        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5018            (q[iorder[1]] << 8) | q[iorder[0]];
5019
5020        if (ch >= 0x110000)
5021        {
5022            errmsg = "codepoint not in range(0x110000)";
5023            startinpos = ((const char *)q)-starts;
5024            endinpos = startinpos+4;
5025            goto utf32Error;
5026        }
5027        if (unicode_putchar(&unicode, &outpos, ch) < 0)
5028            goto onError;
5029        q += 4;
5030        continue;
5031      utf32Error:
5032        if (unicode_decode_call_errorhandler(
5033                errors, &errorHandler,
5034                "utf32", errmsg,
5035                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5036                &unicode, &outpos))
5037            goto onError;
5038    }
5039
5040    if (byteorder)
5041        *byteorder = bo;
5042
5043    if (consumed)
5044        *consumed = (const char *)q-starts;
5045
5046    /* Adjust length */
5047    if (PyUnicode_Resize(&unicode, outpos) < 0)
5048        goto onError;
5049
5050    Py_XDECREF(errorHandler);
5051    Py_XDECREF(exc);
5052#ifndef DONT_MAKE_RESULT_READY
5053    if (_PyUnicode_READY_REPLACE(&unicode)) {
5054        Py_DECREF(unicode);
5055        return NULL;
5056    }
5057#endif
5058    assert(_PyUnicode_CheckConsistency(unicode, 1));
5059    return unicode;
5060
5061  onError:
5062    Py_DECREF(unicode);
5063    Py_XDECREF(errorHandler);
5064    Py_XDECREF(exc);
5065    return NULL;
5066}
5067
5068PyObject *
5069_PyUnicode_EncodeUTF32(PyObject *str,
5070                       const char *errors,
5071                       int byteorder)
5072{
5073    int kind;
5074    void *data;
5075    Py_ssize_t len;
5076    PyObject *v;
5077    unsigned char *p;
5078    Py_ssize_t nsize, bytesize, i;
5079    /* Offsets from p for storing byte pairs in the right order. */
5080#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5081    int iorder[] = {0, 1, 2, 3};
5082#else
5083    int iorder[] = {3, 2, 1, 0};
5084#endif
5085
5086#define STORECHAR(CH)                           \
5087    do {                                        \
5088        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5089        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5090        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5091        p[iorder[0]] = (CH) & 0xff;             \
5092        p += 4;                                 \
5093    } while(0)
5094
5095    if (!PyUnicode_Check(str)) {
5096        PyErr_BadArgument();
5097        return NULL;
5098    }
5099    if (PyUnicode_READY(str) < 0)
5100        return NULL;
5101    kind = PyUnicode_KIND(str);
5102    data = PyUnicode_DATA(str);
5103    len = PyUnicode_GET_LENGTH(str);
5104
5105    nsize = len + (byteorder == 0);
5106    bytesize = nsize * 4;
5107    if (bytesize / 4 != nsize)
5108        return PyErr_NoMemory();
5109    v = PyBytes_FromStringAndSize(NULL, bytesize);
5110    if (v == NULL)
5111        return NULL;
5112
5113    p = (unsigned char *)PyBytes_AS_STRING(v);
5114    if (byteorder == 0)
5115        STORECHAR(0xFEFF);
5116    if (len == 0)
5117        goto done;
5118
5119    if (byteorder == -1) {
5120        /* force LE */
5121        iorder[0] = 0;
5122        iorder[1] = 1;
5123        iorder[2] = 2;
5124        iorder[3] = 3;
5125    }
5126    else if (byteorder == 1) {
5127        /* force BE */
5128        iorder[0] = 3;
5129        iorder[1] = 2;
5130        iorder[2] = 1;
5131        iorder[3] = 0;
5132    }
5133
5134    for (i = 0; i < len; i++)
5135        STORECHAR(PyUnicode_READ(kind, data, i));
5136
5137  done:
5138    return v;
5139#undef STORECHAR
5140}
5141
5142PyObject *
5143PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5144                      Py_ssize_t size,
5145                      const char *errors,
5146                      int byteorder)
5147{
5148    PyObject *result;
5149    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5150    if (tmp == NULL)
5151        return NULL;
5152    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5153    Py_DECREF(tmp);
5154    return result;
5155}
5156
5157PyObject *
5158PyUnicode_AsUTF32String(PyObject *unicode)
5159{
5160    const Py_UNICODE *wstr;
5161    Py_ssize_t wlen;
5162    wstr = PyUnicode_AsUnicodeAndSize(unicode, &wlen);
5163    if (wstr == NULL)
5164        return NULL;
5165    return PyUnicode_EncodeUTF32(wstr, wlen, NULL, 0);
5166}
5167
5168/* --- UTF-16 Codec ------------------------------------------------------- */
5169
5170PyObject *
5171PyUnicode_DecodeUTF16(const char *s,
5172                      Py_ssize_t size,
5173                      const char *errors,
5174                      int *byteorder)
5175{
5176    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5177}
5178
5179/* Two masks for fast checking of whether a C 'long' may contain
5180   UTF16-encoded surrogate characters. This is an efficient heuristic,
5181   assuming that non-surrogate characters with a code point >= 0x8000 are
5182   rare in most input.
5183   FAST_CHAR_MASK is used when the input is in native byte ordering,
5184   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
5185*/
5186#if (SIZEOF_LONG == 8)
5187# define FAST_CHAR_MASK         0x8000800080008000L
5188# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5189#elif (SIZEOF_LONG == 4)
5190# define FAST_CHAR_MASK         0x80008000L
5191# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5192#else
5193# error C 'long' size should be either 4 or 8!
5194#endif
5195
5196PyObject *
5197PyUnicode_DecodeUTF16Stateful(const char *s,
5198                              Py_ssize_t size,
5199                              const char *errors,
5200                              int *byteorder,
5201                              Py_ssize_t *consumed)
5202{
5203    const char *starts = s;
5204    Py_ssize_t startinpos;
5205    Py_ssize_t endinpos;
5206    Py_ssize_t outpos;
5207    PyObject *unicode;
5208    const unsigned char *q, *e, *aligned_end;
5209    int bo = 0;       /* assume native ordering by default */
5210    int native_ordering = 0;
5211    const char *errmsg = "";
5212    /* Offsets from q for retrieving byte pairs in the right order. */
5213#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5214    int ihi = 1, ilo = 0;
5215#else
5216    int ihi = 0, ilo = 1;
5217#endif
5218    PyObject *errorHandler = NULL;
5219    PyObject *exc = NULL;
5220
5221    /* Note: size will always be longer than the resulting Unicode
5222       character count */
5223    unicode = PyUnicode_New(size, 127);
5224    if (!unicode)
5225        return NULL;
5226    if (size == 0)
5227        return unicode;
5228    outpos = 0;
5229
5230    q = (unsigned char *)s;
5231    e = q + size - 1;
5232
5233    if (byteorder)
5234        bo = *byteorder;
5235
5236    /* Check for BOM marks (U+FEFF) in the input and adjust current
5237       byte order setting accordingly. In native mode, the leading BOM
5238       mark is skipped, in all other modes, it is copied to the output
5239       stream as-is (giving a ZWNBSP character). */
5240    if (bo == 0) {
5241        if (size >= 2) {
5242            const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
5243#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5244            if (bom == 0xFEFF) {
5245                q += 2;
5246                bo = -1;
5247            }
5248            else if (bom == 0xFFFE) {
5249                q += 2;
5250                bo = 1;
5251            }
5252#else
5253            if (bom == 0xFEFF) {
5254                q += 2;
5255                bo = 1;
5256            }
5257            else if (bom == 0xFFFE) {
5258                q += 2;
5259                bo = -1;
5260            }
5261#endif
5262        }
5263    }
5264
5265    if (bo == -1) {
5266        /* force LE */
5267        ihi = 1;
5268        ilo = 0;
5269    }
5270    else if (bo == 1) {
5271        /* force BE */
5272        ihi = 0;
5273        ilo = 1;
5274    }
5275#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5276    native_ordering = ilo < ihi;
5277#else
5278    native_ordering = ilo > ihi;
5279#endif
5280
5281    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
5282    while (q < e) {
5283        Py_UCS4 ch;
5284        /* First check for possible aligned read of a C 'long'. Unaligned
5285           reads are more expensive, better to defer to another iteration. */
5286        if (!((size_t) q & LONG_PTR_MASK)) {
5287            /* Fast path for runs of non-surrogate chars. */
5288            register const unsigned char *_q = q;
5289            int kind = PyUnicode_KIND(unicode);
5290            void *data = PyUnicode_DATA(unicode);
5291            while (_q < aligned_end) {
5292                unsigned long block = * (unsigned long *) _q;
5293                unsigned short *pblock = (unsigned short*)&block;
5294                Py_UCS4 maxch;
5295                if (native_ordering) {
5296                    /* Can use buffer directly */
5297                    if (block & FAST_CHAR_MASK)
5298                        break;
5299                }
5300                else {
5301                    /* Need to byte-swap */
5302                    unsigned char *_p = (unsigned char*)pblock;
5303                    if (block & SWAPPED_FAST_CHAR_MASK)
5304                        break;
5305                    _p[0] = _q[1];
5306                    _p[1] = _q[0];
5307                    _p[2] = _q[3];
5308                    _p[3] = _q[2];
5309#if (SIZEOF_LONG == 8)
5310                    _p[4] = _q[5];
5311                    _p[5] = _q[4];
5312                    _p[6] = _q[7];
5313                    _p[7] = _q[6];
5314#endif
5315                }
5316                maxch = Py_MAX(pblock[0], pblock[1]);
5317#if SIZEOF_LONG == 8
5318                maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5319#endif
5320                if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5321                    if (unicode_widen(&unicode, maxch) < 0)
5322                        goto onError;
5323                    kind = PyUnicode_KIND(unicode);
5324                    data = PyUnicode_DATA(unicode);
5325                }
5326                PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5327                PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5328#if SIZEOF_LONG == 8
5329                PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5330                PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5331#endif
5332                _q += SIZEOF_LONG;
5333            }
5334            q = _q;
5335            if (q >= e)
5336                break;
5337        }
5338        ch = (q[ihi] << 8) | q[ilo];
5339
5340        q += 2;
5341
5342        if (ch < 0xD800 || ch > 0xDFFF) {
5343            if (unicode_putchar(&unicode, &outpos, ch) < 0)
5344                goto onError;
5345            continue;
5346        }
5347
5348        /* UTF-16 code pair: */
5349        if (q > e) {
5350            errmsg = "unexpected end of data";
5351            startinpos = (((const char *)q) - 2) - starts;
5352            endinpos = ((const char *)e) + 1 - starts;
5353            goto utf16Error;
5354        }
5355        if (0xD800 <= ch && ch <= 0xDBFF) {
5356            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5357            q += 2;
5358            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5359                if (unicode_putchar(&unicode, &outpos,
5360                                    (((ch & 0x3FF)<<10) |
5361                                     (ch2 & 0x3FF)) + 0x10000) < 0)
5362                    goto onError;
5363                continue;
5364            }
5365            else {
5366                errmsg = "illegal UTF-16 surrogate";
5367                startinpos = (((const char *)q)-4)-starts;
5368                endinpos = startinpos+2;
5369                goto utf16Error;
5370            }
5371
5372        }
5373        errmsg = "illegal encoding";
5374        startinpos = (((const char *)q)-2)-starts;
5375        endinpos = startinpos+2;
5376        /* Fall through to report the error */
5377
5378      utf16Error:
5379        if (unicode_decode_call_errorhandler(
5380                errors,
5381                &errorHandler,
5382                "utf16", errmsg,
5383                &starts,
5384                (const char **)&e,
5385                &startinpos,
5386                &endinpos,
5387                &exc,
5388                (const char **)&q,
5389                &unicode,
5390                &outpos))
5391            goto onError;
5392    }
5393    /* remaining byte at the end? (size should be even) */
5394    if (e == q) {
5395        if (!consumed) {
5396            errmsg = "truncated data";
5397            startinpos = ((const char *)q) - starts;
5398            endinpos = ((const char *)e) + 1 - starts;
5399            if (unicode_decode_call_errorhandler(
5400                    errors,
5401                    &errorHandler,
5402                    "utf16", errmsg,
5403                    &starts,
5404                    (const char **)&e,
5405                    &startinpos,
5406                    &endinpos,
5407                    &exc,
5408                    (const char **)&q,
5409                    &unicode,
5410                    &outpos))
5411                goto onError;
5412            /* The remaining input chars are ignored if the callback
5413               chooses to skip the input */
5414        }
5415    }
5416
5417    if (byteorder)
5418        *byteorder = bo;
5419
5420    if (consumed)
5421        *consumed = (const char *)q-starts;
5422
5423    /* Adjust length */
5424    if (PyUnicode_Resize(&unicode, outpos) < 0)
5425        goto onError;
5426
5427    Py_XDECREF(errorHandler);
5428    Py_XDECREF(exc);
5429    assert(_PyUnicode_CheckConsistency(unicode, 1));
5430    return unicode;
5431
5432  onError:
5433    Py_DECREF(unicode);
5434    Py_XDECREF(errorHandler);
5435    Py_XDECREF(exc);
5436    return NULL;
5437}
5438
5439#undef FAST_CHAR_MASK
5440#undef SWAPPED_FAST_CHAR_MASK
5441
5442PyObject *
5443_PyUnicode_EncodeUTF16(PyObject *str,
5444                       const char *errors,
5445                       int byteorder)
5446{
5447    int kind;
5448    void *data;
5449    Py_ssize_t len;
5450    PyObject *v;
5451    unsigned char *p;
5452    Py_ssize_t nsize, bytesize;
5453    Py_ssize_t i, pairs;
5454    /* Offsets from p for storing byte pairs in the right order. */
5455#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5456    int ihi = 1, ilo = 0;
5457#else
5458    int ihi = 0, ilo = 1;
5459#endif
5460
5461#define STORECHAR(CH)                           \
5462    do {                                        \
5463        p[ihi] = ((CH) >> 8) & 0xff;            \
5464        p[ilo] = (CH) & 0xff;                   \
5465        p += 2;                                 \
5466    } while(0)
5467
5468    if (!PyUnicode_Check(str)) {
5469        PyErr_BadArgument();
5470        return NULL;
5471    }
5472    if (PyUnicode_READY(str) < 0)
5473        return NULL;
5474    kind = PyUnicode_KIND(str);
5475    data = PyUnicode_DATA(str);
5476    len = PyUnicode_GET_LENGTH(str);
5477
5478    pairs = 0;
5479    if (kind == PyUnicode_4BYTE_KIND)
5480        for (i = 0; i < len; i++)
5481            if (PyUnicode_READ(kind, data, i) >= 0x10000)
5482                pairs++;
5483    /* 2 * (len + pairs + (byteorder == 0)) */
5484    if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5485        return PyErr_NoMemory();
5486    nsize = len + pairs + (byteorder == 0);
5487    bytesize = nsize * 2;
5488    if (bytesize / 2 != nsize)
5489        return PyErr_NoMemory();
5490    v = PyBytes_FromStringAndSize(NULL, bytesize);
5491    if (v == NULL)
5492        return NULL;
5493
5494    p = (unsigned char *)PyBytes_AS_STRING(v);
5495    if (byteorder == 0)
5496        STORECHAR(0xFEFF);
5497    if (len == 0)
5498        goto done;
5499
5500    if (byteorder == -1) {
5501        /* force LE */
5502        ihi = 1;
5503        ilo = 0;
5504    }
5505    else if (byteorder == 1) {
5506        /* force BE */
5507        ihi = 0;
5508        ilo = 1;
5509    }
5510
5511    for (i = 0; i < len; i++) {
5512        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5513        Py_UCS4 ch2 = 0;
5514        if (ch >= 0x10000) {
5515            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5516            ch  = 0xD800 | ((ch-0x10000) >> 10);
5517        }
5518        STORECHAR(ch);
5519        if (ch2)
5520            STORECHAR(ch2);
5521    }
5522
5523  done:
5524    return v;
5525#undef STORECHAR
5526}
5527
5528PyObject *
5529PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5530                      Py_ssize_t size,
5531                      const char *errors,
5532                      int byteorder)
5533{
5534    PyObject *result;
5535    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5536    if (tmp == NULL)
5537        return NULL;
5538    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5539    Py_DECREF(tmp);
5540    return result;
5541}
5542
5543PyObject *
5544PyUnicode_AsUTF16String(PyObject *unicode)
5545{
5546    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5547}
5548
5549/* --- Unicode Escape Codec ----------------------------------------------- */
5550
/* Helper function for PyUnicode_DecodeUnicodeEscape: decide whether the
   escaped byte string decodes to a pure ASCII string and, if it does,
   how many characters the result has.

   s    -- escaped input bytes (no NUL terminator is required)
   size -- number of bytes available at s

   Returns the exact length of the decoded string, or -1 when no such
   promise can be made:
     - size is negative,
     - the input contains a byte above 127,
     - a backslash is the last byte of the input,
     - an octal, x, u, U or N escape is present (these do not guarantee
       ASCII characters).
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before using it in pointer arithmetic. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* two input bytes collapse into one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5605
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.  When kind is PyUnicode_WCHAR_KIND the value
   is stored as a Py_UNICODE element of buf; for every other kind it is
   stored as a single unsigned char.

   NOTE(review): neither this macro nor WRITE_WSTR is referenced between
   here and the matching #undef lines after PyUnicode_DecodeUnicodeEscape;
   they look like leftovers -- confirm before removing. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store value as a Py_UNICODE element of buf.  The assertion reads a
   variable named `kind` that must be in scope where the macro is
   expanded; the expansion is a comma expression, not a statement. */
#define WRITE_WSTR(buf, index, value) \
    assert(kind == PyUnicode_WCHAR_KIND), \
    ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)


/* Character-name lookup API used for \N{...} escapes.  Starts out NULL;
   PyUnicode_DecodeUnicodeEscape fills it in through PyCapsule_Import the
   first time a \N escape is decoded and then reuses the pointer. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5622
5623PyObject *
5624PyUnicode_DecodeUnicodeEscape(const char *s,
5625                              Py_ssize_t size,
5626                              const char *errors)
5627{
5628    const char *starts = s;
5629    Py_ssize_t startinpos;
5630    Py_ssize_t endinpos;
5631    int j;
5632    PyObject *v;
5633    const char *end;
5634    char* message;
5635    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5636    PyObject *errorHandler = NULL;
5637    PyObject *exc = NULL;
5638    Py_ssize_t len;
5639    Py_ssize_t i;
5640
5641    len = length_of_escaped_ascii_string(s, size);
5642
5643    /* After length_of_escaped_ascii_string() there are two alternatives,
5644       either the string is pure ASCII with named escapes like \n, etc.
5645       and we determined it's exact size (common case)
5646       or it contains \x, \u, ... escape sequences.  then we create a
5647       legacy wchar string and resize it at the end of this function. */
5648    if (len >= 0) {
5649        v = PyUnicode_New(len, 127);
5650        if (!v)
5651            goto onError;
5652        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5653    }
5654    else {
5655        /* Escaped strings will always be longer than the resulting
5656           Unicode string, so we start with size here and then reduce the
5657           length after conversion to the true value.
5658           (but if the error callback returns a long replacement string
5659           we'll have to allocate more space) */
5660        v = PyUnicode_New(size, 127);
5661        if (!v)
5662            goto onError;
5663        len = size;
5664    }
5665
5666    if (size == 0)
5667        return v;
5668    i = 0;
5669    end = s + size;
5670
5671    while (s < end) {
5672        unsigned char c;
5673        Py_UCS4 x;
5674        int digits;
5675
5676        /* The only case in which i == ascii_length is a backslash
5677           followed by a newline. */
5678        assert(i <= len);
5679
5680        /* Non-escape characters are interpreted as Unicode ordinals */
5681        if (*s != '\\') {
5682            if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5683                goto onError;
5684            continue;
5685        }
5686
5687        startinpos = s-starts;
5688        /* \ - Escapes */
5689        s++;
5690        c = *s++;
5691        if (s > end)
5692            c = '\0'; /* Invalid after \ */
5693
5694        /* The only case in which i == ascii_length is a backslash
5695           followed by a newline. */
5696        assert(i < len || (i == len && c == '\n'));
5697
5698        switch (c) {
5699
5700            /* \x escapes */
5701#define WRITECHAR(ch)                                   \
5702            do {                                        \
5703                if (unicode_putchar(&v, &i, ch) < 0)    \
5704                    goto onError;                       \
5705            }while(0)
5706
5707        case '\n': break;
5708        case '\\': WRITECHAR('\\'); break;
5709        case '\'': WRITECHAR('\''); break;
5710        case '\"': WRITECHAR('\"'); break;
5711        case 'b': WRITECHAR('\b'); break;
5712        /* FF */
5713        case 'f': WRITECHAR('\014'); break;
5714        case 't': WRITECHAR('\t'); break;
5715        case 'n': WRITECHAR('\n'); break;
5716        case 'r': WRITECHAR('\r'); break;
5717        /* VT */
5718        case 'v': WRITECHAR('\013'); break;
5719        /* BEL, not classic C */
5720        case 'a': WRITECHAR('\007'); break;
5721
5722            /* \OOO (octal) escapes */
5723        case '0': case '1': case '2': case '3':
5724        case '4': case '5': case '6': case '7':
5725            x = s[-1] - '0';
5726            if (s < end && '0' <= *s && *s <= '7') {
5727                x = (x<<3) + *s++ - '0';
5728                if (s < end && '0' <= *s && *s <= '7')
5729                    x = (x<<3) + *s++ - '0';
5730            }
5731            WRITECHAR(x);
5732            break;
5733
5734            /* hex escapes */
5735            /* \xXX */
5736        case 'x':
5737            digits = 2;
5738            message = "truncated \\xXX escape";
5739            goto hexescape;
5740
5741            /* \uXXXX */
5742        case 'u':
5743            digits = 4;
5744            message = "truncated \\uXXXX escape";
5745            goto hexescape;
5746
5747            /* \UXXXXXXXX */
5748        case 'U':
5749            digits = 8;
5750            message = "truncated \\UXXXXXXXX escape";
5751        hexescape:
5752            chr = 0;
5753            if (s+digits>end) {
5754                endinpos = size;
5755                if (unicode_decode_call_errorhandler(
5756                        errors, &errorHandler,
5757                        "unicodeescape", "end of string in escape sequence",
5758                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5759                        &v, &i))
5760                    goto onError;
5761                goto nextByte;
5762            }
5763            for (j = 0; j < digits; ++j) {
5764                c = (unsigned char) s[j];
5765                if (!Py_ISXDIGIT(c)) {
5766                    endinpos = (s+j+1)-starts;
5767                    if (unicode_decode_call_errorhandler(
5768                            errors, &errorHandler,
5769                            "unicodeescape", message,
5770                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5771                            &v, &i))
5772                        goto onError;
5773                    len = PyUnicode_GET_LENGTH(v);
5774                    goto nextByte;
5775                }
5776                chr = (chr<<4) & ~0xF;
5777                if (c >= '0' && c <= '9')
5778                    chr += c - '0';
5779                else if (c >= 'a' && c <= 'f')
5780                    chr += 10 + c - 'a';
5781                else
5782                    chr += 10 + c - 'A';
5783            }
5784            s += j;
5785            if (chr == 0xffffffff && PyErr_Occurred())
5786                /* _decoding_error will have already written into the
5787                   target buffer. */
5788                break;
5789        store:
5790            /* when we get here, chr is a 32-bit unicode character */
5791            if (chr <= 0x10ffff) {
5792                WRITECHAR(chr);
5793            } else {
5794                endinpos = s-starts;
5795                if (unicode_decode_call_errorhandler(
5796                        errors, &errorHandler,
5797                        "unicodeescape", "illegal Unicode character",
5798                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5799                        &v, &i))
5800                    goto onError;
5801            }
5802            break;
5803
5804            /* \N{name} */
5805        case 'N':
5806            message = "malformed \\N character escape";
5807            if (ucnhash_CAPI == NULL) {
5808                /* load the unicode data module */
5809                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5810                                                PyUnicodeData_CAPSULE_NAME, 1);
5811                if (ucnhash_CAPI == NULL)
5812                    goto ucnhashError;
5813            }
5814            if (*s == '{') {
5815                const char *start = s+1;
5816                /* look for the closing brace */
5817                while (*s != '}' && s < end)
5818                    s++;
5819                if (s > start && s < end && *s == '}') {
5820                    /* found a name.  look it up in the unicode database */
5821                    message = "unknown Unicode character name";
5822                    s++;
5823                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5824                                              &chr, 0))
5825                        goto store;
5826                }
5827            }
5828            endinpos = s-starts;
5829            if (unicode_decode_call_errorhandler(
5830                    errors, &errorHandler,
5831                    "unicodeescape", message,
5832                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5833                    &v, &i))
5834                goto onError;
5835            break;
5836
5837        default:
5838            if (s > end) {
5839                message = "\\ at end of string";
5840                s--;
5841                endinpos = s-starts;
5842                if (unicode_decode_call_errorhandler(
5843                        errors, &errorHandler,
5844                        "unicodeescape", message,
5845                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5846                        &v, &i))
5847                    goto onError;
5848            }
5849            else {
5850                WRITECHAR('\\');
5851                WRITECHAR(s[-1]);
5852            }
5853            break;
5854        }
5855      nextByte:
5856        ;
5857    }
5858#undef WRITECHAR
5859
5860    if (PyUnicode_Resize(&v, i) < 0)
5861        goto onError;
5862    Py_XDECREF(errorHandler);
5863    Py_XDECREF(exc);
5864#ifndef DONT_MAKE_RESULT_READY
5865    if (_PyUnicode_READY_REPLACE(&v)) {
5866        Py_DECREF(v);
5867        return NULL;
5868    }
5869#endif
5870    assert(_PyUnicode_CheckConsistency(v, 1));
5871    return v;
5872
5873  ucnhashError:
5874    PyErr_SetString(
5875        PyExc_UnicodeError,
5876        "\\N escapes not supported (can't load unicodedata module)"
5877        );
5878    Py_XDECREF(v);
5879    Py_XDECREF(errorHandler);
5880    Py_XDECREF(exc);
5881    return NULL;
5882
5883  onError:
5884    Py_XDECREF(v);
5885    Py_XDECREF(errorHandler);
5886    Py_XDECREF(exc);
5887    return NULL;
5888}
5889
5890#undef WRITE_ASCII_OR_WSTR
5891#undef WRITE_WSTR
5892
5893/* Return a Unicode-Escape string version of the Unicode object.
5894
5895   If quotes is true, the string is enclosed in u"" or u'' quotes as
5896   appropriate.
5897
5898*/
5899
5900PyObject *
5901PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5902{
5903    Py_ssize_t i, len;
5904    PyObject *repr;
5905    char *p;
5906    int kind;
5907    void *data;
5908    Py_ssize_t expandsize = 0;
5909
5910    /* Initial allocation is based on the longest-possible unichr
5911       escape.
5912
5913       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5914       unichr, so in this case it's the longest unichr escape. In
5915       narrow (UTF-16) builds this is five chars per source unichr
5916       since there are two unichrs in the surrogate pair, so in narrow
5917       (UTF-16) builds it's not the longest unichr escape.
5918
5919       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5920       so in the narrow (UTF-16) build case it's the longest unichr
5921       escape.
5922    */
5923
5924    if (!PyUnicode_Check(unicode)) {
5925        PyErr_BadArgument();
5926        return NULL;
5927    }
5928    if (PyUnicode_READY(unicode) < 0)
5929        return NULL;
5930    len = PyUnicode_GET_LENGTH(unicode);
5931    kind = PyUnicode_KIND(unicode);
5932    data = PyUnicode_DATA(unicode);
5933    switch(kind) {
5934    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5935    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5936    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5937    }
5938
5939    if (len == 0)
5940        return PyBytes_FromStringAndSize(NULL, 0);
5941
5942    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5943        return PyErr_NoMemory();
5944
5945    repr = PyBytes_FromStringAndSize(NULL,
5946                                     2
5947                                     + expandsize*len
5948                                     + 1);
5949    if (repr == NULL)
5950        return NULL;
5951
5952    p = PyBytes_AS_STRING(repr);
5953
5954    for (i = 0; i < len; i++) {
5955        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5956
5957        /* Escape backslashes */
5958        if (ch == '\\') {
5959            *p++ = '\\';
5960            *p++ = (char) ch;
5961            continue;
5962        }
5963
5964        /* Map 21-bit characters to '\U00xxxxxx' */
5965        else if (ch >= 0x10000) {
5966            *p++ = '\\';
5967            *p++ = 'U';
5968            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5969            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5970            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5971            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5972            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5973            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5974            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5975            *p++ = Py_hexdigits[ch & 0x0000000F];
5976            continue;
5977        }
5978
5979        /* Map 16-bit characters to '\uxxxx' */
5980        if (ch >= 256) {
5981            *p++ = '\\';
5982            *p++ = 'u';
5983            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5984            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5985            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5986            *p++ = Py_hexdigits[ch & 0x000F];
5987        }
5988
5989        /* Map special whitespace to '\t', \n', '\r' */
5990        else if (ch == '\t') {
5991            *p++ = '\\';
5992            *p++ = 't';
5993        }
5994        else if (ch == '\n') {
5995            *p++ = '\\';
5996            *p++ = 'n';
5997        }
5998        else if (ch == '\r') {
5999            *p++ = '\\';
6000            *p++ = 'r';
6001        }
6002
6003        /* Map non-printable US ASCII to '\xhh' */
6004        else if (ch < ' ' || ch >= 0x7F) {
6005            *p++ = '\\';
6006            *p++ = 'x';
6007            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6008            *p++ = Py_hexdigits[ch & 0x000F];
6009        }
6010
6011        /* Copy everything else as-is */
6012        else
6013            *p++ = (char) ch;
6014    }
6015
6016    assert(p - PyBytes_AS_STRING(repr) > 0);
6017    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6018        return NULL;
6019    return repr;
6020}
6021
6022PyObject *
6023PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6024                              Py_ssize_t size)
6025{
6026    PyObject *result;
6027    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6028    if (tmp == NULL)
6029        return NULL;
6030    result = PyUnicode_AsUnicodeEscapeString(tmp);
6031    Py_DECREF(tmp);
6032    return result;
6033}
6034
6035/* --- Raw Unicode Escape Codec ------------------------------------------- */
6036
6037PyObject *
6038PyUnicode_DecodeRawUnicodeEscape(const char *s,
6039                                 Py_ssize_t size,
6040                                 const char *errors)
6041{
6042    const char *starts = s;
6043    Py_ssize_t startinpos;
6044    Py_ssize_t endinpos;
6045    Py_ssize_t outpos;
6046    PyObject *v;
6047    const char *end;
6048    const char *bs;
6049    PyObject *errorHandler = NULL;
6050    PyObject *exc = NULL;
6051
6052    /* Escaped strings will always be longer than the resulting
6053       Unicode string, so we start with size here and then reduce the
6054       length after conversion to the true value. (But decoding error
6055       handler might have to resize the string) */
6056    v = PyUnicode_New(size, 127);
6057    if (v == NULL)
6058        goto onError;
6059    if (size == 0)
6060        return v;
6061    outpos = 0;
6062    end = s + size;
6063    while (s < end) {
6064        unsigned char c;
6065        Py_UCS4 x;
6066        int i;
6067        int count;
6068
6069        /* Non-escape characters are interpreted as Unicode ordinals */
6070        if (*s != '\\') {
6071            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6072                goto onError;
6073            continue;
6074        }
6075        startinpos = s-starts;
6076
6077        /* \u-escapes are only interpreted iff the number of leading
6078           backslashes if odd */
6079        bs = s;
6080        for (;s < end;) {
6081            if (*s != '\\')
6082                break;
6083            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6084                goto onError;
6085        }
6086        if (((s - bs) & 1) == 0 ||
6087            s >= end ||
6088            (*s != 'u' && *s != 'U')) {
6089            continue;
6090        }
6091        outpos--;
6092        count = *s=='u' ? 4 : 8;
6093        s++;
6094
6095        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6096        for (x = 0, i = 0; i < count; ++i, ++s) {
6097            c = (unsigned char)*s;
6098            if (!Py_ISXDIGIT(c)) {
6099                endinpos = s-starts;
6100                if (unicode_decode_call_errorhandler(
6101                        errors, &errorHandler,
6102                        "rawunicodeescape", "truncated \\uXXXX",
6103                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6104                        &v, &outpos))
6105                    goto onError;
6106                goto nextByte;
6107            }
6108            x = (x<<4) & ~0xF;
6109            if (c >= '0' && c <= '9')
6110                x += c - '0';
6111            else if (c >= 'a' && c <= 'f')
6112                x += 10 + c - 'a';
6113            else
6114                x += 10 + c - 'A';
6115        }
6116        if (x <= 0x10ffff) {
6117            if (unicode_putchar(&v, &outpos, x) < 0)
6118                goto onError;
6119        } else {
6120            endinpos = s-starts;
6121            if (unicode_decode_call_errorhandler(
6122                    errors, &errorHandler,
6123                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6124                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6125                    &v, &outpos))
6126                goto onError;
6127        }
6128      nextByte:
6129        ;
6130    }
6131    if (PyUnicode_Resize(&v, outpos) < 0)
6132        goto onError;
6133    Py_XDECREF(errorHandler);
6134    Py_XDECREF(exc);
6135    assert(_PyUnicode_CheckConsistency(v, 1));
6136    return v;
6137
6138  onError:
6139    Py_XDECREF(v);
6140    Py_XDECREF(errorHandler);
6141    Py_XDECREF(exc);
6142    return NULL;
6143}
6144
6145
6146PyObject *
6147PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6148{
6149    PyObject *repr;
6150    char *p;
6151    char *q;
6152    Py_ssize_t expandsize, pos;
6153    int kind;
6154    void *data;
6155    Py_ssize_t len;
6156
6157    if (!PyUnicode_Check(unicode)) {
6158        PyErr_BadArgument();
6159        return NULL;
6160    }
6161    if (PyUnicode_READY(unicode) < 0)
6162        return NULL;
6163    kind = PyUnicode_KIND(unicode);
6164    data = PyUnicode_DATA(unicode);
6165    len = PyUnicode_GET_LENGTH(unicode);
6166
6167    switch(kind) {
6168    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6169    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6170    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6171    }
6172
6173    if (len > PY_SSIZE_T_MAX / expandsize)
6174        return PyErr_NoMemory();
6175
6176    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6177    if (repr == NULL)
6178        return NULL;
6179    if (len == 0)
6180        return repr;
6181
6182    p = q = PyBytes_AS_STRING(repr);
6183    for (pos = 0; pos < len; pos++) {
6184        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6185        /* Map 32-bit characters to '\Uxxxxxxxx' */
6186        if (ch >= 0x10000) {
6187            *p++ = '\\';
6188            *p++ = 'U';
6189            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6190            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6191            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6192            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6193            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6194            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6195            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6196            *p++ = Py_hexdigits[ch & 15];
6197        }
6198        /* Map 16-bit characters to '\uxxxx' */
6199        else if (ch >= 256) {
6200            *p++ = '\\';
6201            *p++ = 'u';
6202            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6203            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6204            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6205            *p++ = Py_hexdigits[ch & 15];
6206        }
6207        /* Copy everything else as-is */
6208        else
6209            *p++ = (char) ch;
6210    }
6211
6212    assert(p > q);
6213    if (_PyBytes_Resize(&repr, p - q) < 0)
6214        return NULL;
6215    return repr;
6216}
6217
6218PyObject *
6219PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6220                                 Py_ssize_t size)
6221{
6222    PyObject *result;
6223    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6224    if (tmp == NULL)
6225        return NULL;
6226    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6227    Py_DECREF(tmp);
6228    return result;
6229}
6230
6231/* --- Unicode Internal Codec ------------------------------------------- */
6232
6233PyObject *
6234_PyUnicode_DecodeUnicodeInternal(const char *s,
6235                                 Py_ssize_t size,
6236                                 const char *errors)
6237{
6238    const char *starts = s;
6239    Py_ssize_t startinpos;
6240    Py_ssize_t endinpos;
6241    Py_ssize_t outpos;
6242    PyObject *v;
6243    const char *end;
6244    const char *reason;
6245    PyObject *errorHandler = NULL;
6246    PyObject *exc = NULL;
6247
6248    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6249                     "unicode_internal codec has been deprecated",
6250                     1))
6251        return NULL;
6252
6253    /* XXX overflow detection missing */
6254    v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
6255    if (v == NULL)
6256        goto onError;
6257    if (PyUnicode_GET_LENGTH(v) == 0)
6258        return v;
6259    outpos = 0;
6260    end = s + size;
6261
6262    while (s < end) {
6263        Py_UNICODE uch;
6264        Py_UCS4 ch;
6265        /* We copy the raw representation one byte at a time because the
6266           pointer may be unaligned (see test_codeccallbacks). */
6267        ((char *) &uch)[0] = s[0];
6268        ((char *) &uch)[1] = s[1];
6269#ifdef Py_UNICODE_WIDE
6270        ((char *) &uch)[2] = s[2];
6271        ((char *) &uch)[3] = s[3];
6272#endif
6273        ch = uch;
6274
6275        /* We have to sanity check the raw data, otherwise doom looms for
6276           some malformed UCS-4 data. */
6277        if (
6278#ifdef Py_UNICODE_WIDE
6279            ch > 0x10ffff ||
6280#endif
6281            end-s < Py_UNICODE_SIZE
6282            )
6283        {
6284            startinpos = s - starts;
6285            if (end-s < Py_UNICODE_SIZE) {
6286                endinpos = end-starts;
6287                reason = "truncated input";
6288            }
6289            else {
6290                endinpos = s - starts + Py_UNICODE_SIZE;
6291                reason = "illegal code point (> 0x10FFFF)";
6292            }
6293            if (unicode_decode_call_errorhandler(
6294                    errors, &errorHandler,
6295                    "unicode_internal", reason,
6296                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6297                    &v, &outpos))
6298                goto onError;
6299            continue;
6300        }
6301
6302        s += Py_UNICODE_SIZE;
6303#ifndef Py_UNICODE_WIDE
6304        if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6305        {
6306            Py_UNICODE uch2;
6307            ((char *) &uch2)[0] = s[0];
6308            ((char *) &uch2)[1] = s[1];
6309            if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
6310            {
6311                ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
6312                s += Py_UNICODE_SIZE;
6313            }
6314        }
6315#endif
6316
6317        if (unicode_putchar(&v, &outpos, ch) < 0)
6318            goto onError;
6319    }
6320
6321    if (PyUnicode_Resize(&v, outpos) < 0)
6322        goto onError;
6323    Py_XDECREF(errorHandler);
6324    Py_XDECREF(exc);
6325    assert(_PyUnicode_CheckConsistency(v, 1));
6326    return v;
6327
6328  onError:
6329    Py_XDECREF(v);
6330    Py_XDECREF(errorHandler);
6331    Py_XDECREF(exc);
6332    return NULL;
6333}
6334
6335/* --- Latin-1 Codec ------------------------------------------------------ */
6336
6337PyObject *
6338PyUnicode_DecodeLatin1(const char *s,
6339                       Py_ssize_t size,
6340                       const char *errors)
6341{
6342    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6343    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6344}
6345
6346/* create or adjust a UnicodeEncodeError */
6347static void
6348make_encode_exception(PyObject **exceptionObject,
6349                      const char *encoding,
6350                      PyObject *unicode,
6351                      Py_ssize_t startpos, Py_ssize_t endpos,
6352                      const char *reason)
6353{
6354    if (*exceptionObject == NULL) {
6355        *exceptionObject = PyObject_CallFunction(
6356            PyExc_UnicodeEncodeError, "sOnns",
6357            encoding, unicode, startpos, endpos, reason);
6358    }
6359    else {
6360        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6361            goto onError;
6362        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6363            goto onError;
6364        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6365            goto onError;
6366        return;
6367      onError:
6368        Py_DECREF(*exceptionObject);
6369        *exceptionObject = NULL;
6370    }
6371}
6372
6373/* raises a UnicodeEncodeError */
6374static void
6375raise_encode_exception(PyObject **exceptionObject,
6376                       const char *encoding,
6377                       PyObject *unicode,
6378                       Py_ssize_t startpos, Py_ssize_t endpos,
6379                       const char *reason)
6380{
6381    make_encode_exception(exceptionObject,
6382                          encoding, unicode, startpos, endpos, reason);
6383    if (*exceptionObject != NULL)
6384        PyCodec_StrictErrors(*exceptionObject);
6385}
6386
6387/* error handling callback helper:
6388   build arguments, call the callback and check the arguments,
6389   put the result into newpos and return the replacement string, which
6390   has to be freed by the caller */
6391static PyObject *
6392unicode_encode_call_errorhandler(const char *errors,
6393                                 PyObject **errorHandler,
6394                                 const char *encoding, const char *reason,
6395                                 PyObject *unicode, PyObject **exceptionObject,
6396                                 Py_ssize_t startpos, Py_ssize_t endpos,
6397                                 Py_ssize_t *newpos)
6398{
6399    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6400    Py_ssize_t len;
6401    PyObject *restuple;
6402    PyObject *resunicode;
6403
6404    if (*errorHandler == NULL) {
6405        *errorHandler = PyCodec_LookupError(errors);
6406        if (*errorHandler == NULL)
6407            return NULL;
6408    }
6409
6410    if (PyUnicode_READY(unicode) < 0)
6411        return NULL;
6412    len = PyUnicode_GET_LENGTH(unicode);
6413
6414    make_encode_exception(exceptionObject,
6415                          encoding, unicode, startpos, endpos, reason);
6416    if (*exceptionObject == NULL)
6417        return NULL;
6418
6419    restuple = PyObject_CallFunctionObjArgs(
6420        *errorHandler, *exceptionObject, NULL);
6421    if (restuple == NULL)
6422        return NULL;
6423    if (!PyTuple_Check(restuple)) {
6424        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6425        Py_DECREF(restuple);
6426        return NULL;
6427    }
6428    if (!PyArg_ParseTuple(restuple, argparse,
6429                          &resunicode, newpos)) {
6430        Py_DECREF(restuple);
6431        return NULL;
6432    }
6433    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6434        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6435        Py_DECREF(restuple);
6436        return NULL;
6437    }
6438    if (*newpos<0)
6439        *newpos = len + *newpos;
6440    if (*newpos<0 || *newpos>len) {
6441        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6442        Py_DECREF(restuple);
6443        return NULL;
6444    }
6445    Py_INCREF(resunicode);
6446    Py_DECREF(restuple);
6447    return resunicode;
6448}
6449
6450static PyObject *
6451unicode_encode_ucs1(PyObject *unicode,
6452                    const char *errors,
6453                    unsigned int limit)
6454{
6455    /* input state */
6456    Py_ssize_t pos=0, size;
6457    int kind;
6458    void *data;
6459    /* output object */
6460    PyObject *res;
6461    /* pointer into the output */
6462    char *str;
6463    /* current output position */
6464    Py_ssize_t ressize;
6465    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6466    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6467    PyObject *errorHandler = NULL;
6468    PyObject *exc = NULL;
6469    /* the following variable is used for caching string comparisons
6470     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6471    int known_errorHandler = -1;
6472
6473    if (PyUnicode_READY(unicode) < 0)
6474        return NULL;
6475    size = PyUnicode_GET_LENGTH(unicode);
6476    kind = PyUnicode_KIND(unicode);
6477    data = PyUnicode_DATA(unicode);
6478    /* allocate enough for a simple encoding without
6479       replacements, if we need more, we'll resize */
6480    if (size == 0)
6481        return PyBytes_FromStringAndSize(NULL, 0);
6482    res = PyBytes_FromStringAndSize(NULL, size);
6483    if (res == NULL)
6484        return NULL;
6485    str = PyBytes_AS_STRING(res);
6486    ressize = size;
6487
6488    while (pos < size) {
6489        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6490
6491        /* can we encode this? */
6492        if (c<limit) {
6493            /* no overflow check, because we know that the space is enough */
6494            *str++ = (char)c;
6495            ++pos;
6496        }
6497        else {
6498            Py_ssize_t requiredsize;
6499            PyObject *repunicode;
6500            Py_ssize_t repsize, newpos, respos, i;
6501            /* startpos for collecting unencodable chars */
6502            Py_ssize_t collstart = pos;
6503            Py_ssize_t collend = pos;
6504            /* find all unecodable characters */
6505            while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6506                ++collend;
6507            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6508            if (known_errorHandler==-1) {
6509                if ((errors==NULL) || (!strcmp(errors, "strict")))
6510                    known_errorHandler = 1;
6511                else if (!strcmp(errors, "replace"))
6512                    known_errorHandler = 2;
6513                else if (!strcmp(errors, "ignore"))
6514                    known_errorHandler = 3;
6515                else if (!strcmp(errors, "xmlcharrefreplace"))
6516                    known_errorHandler = 4;
6517                else
6518                    known_errorHandler = 0;
6519            }
6520            switch (known_errorHandler) {
6521            case 1: /* strict */
6522                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6523                goto onError;
6524            case 2: /* replace */
6525                while (collstart++<collend)
6526                    *str++ = '?'; /* fall through */
6527            case 3: /* ignore */
6528                pos = collend;
6529                break;
6530            case 4: /* xmlcharrefreplace */
6531                respos = str - PyBytes_AS_STRING(res);
6532                /* determine replacement size */
6533                for (i = collstart, repsize = 0; i < collend; ++i) {
6534                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6535                    if (ch < 10)
6536                        repsize += 2+1+1;
6537                    else if (ch < 100)
6538                        repsize += 2+2+1;
6539                    else if (ch < 1000)
6540                        repsize += 2+3+1;
6541                    else if (ch < 10000)
6542                        repsize += 2+4+1;
6543#ifndef Py_UNICODE_WIDE
6544                    else
6545                        repsize += 2+5+1;
6546#else
6547                    else if (ch < 100000)
6548                        repsize += 2+5+1;
6549                    else if (ch < 1000000)
6550                        repsize += 2+6+1;
6551                    else
6552                        repsize += 2+7+1;
6553#endif
6554                }
6555                requiredsize = respos+repsize+(size-collend);
6556                if (requiredsize > ressize) {
6557                    if (requiredsize<2*ressize)
6558                        requiredsize = 2*ressize;
6559                    if (_PyBytes_Resize(&res, requiredsize))
6560                        goto onError;
6561                    str = PyBytes_AS_STRING(res) + respos;
6562                    ressize = requiredsize;
6563                }
6564                /* generate replacement */
6565                for (i = collstart; i < collend; ++i) {
6566                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6567                }
6568                pos = collend;
6569                break;
6570            default:
6571                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6572                                                              encoding, reason, unicode, &exc,
6573                                                              collstart, collend, &newpos);
6574                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6575                                           PyUnicode_READY(repunicode) < 0))
6576                    goto onError;
6577                if (PyBytes_Check(repunicode)) {
6578                    /* Directly copy bytes result to output. */
6579                    repsize = PyBytes_Size(repunicode);
6580                    if (repsize > 1) {
6581                        /* Make room for all additional bytes. */
6582                        respos = str - PyBytes_AS_STRING(res);
6583                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6584                            Py_DECREF(repunicode);
6585                            goto onError;
6586                        }
6587                        str = PyBytes_AS_STRING(res) + respos;
6588                        ressize += repsize-1;
6589                    }
6590                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6591                    str += repsize;
6592                    pos = newpos;
6593                    Py_DECREF(repunicode);
6594                    break;
6595                }
6596                /* need more space? (at least enough for what we
6597                   have+the replacement+the rest of the string, so
6598                   we won't have to check space for encodable characters) */
6599                respos = str - PyBytes_AS_STRING(res);
6600                repsize = PyUnicode_GET_LENGTH(repunicode);
6601                requiredsize = respos+repsize+(size-collend);
6602                if (requiredsize > ressize) {
6603                    if (requiredsize<2*ressize)
6604                        requiredsize = 2*ressize;
6605                    if (_PyBytes_Resize(&res, requiredsize)) {
6606                        Py_DECREF(repunicode);
6607                        goto onError;
6608                    }
6609                    str = PyBytes_AS_STRING(res) + respos;
6610                    ressize = requiredsize;
6611                }
6612                /* check if there is anything unencodable in the replacement
6613                   and copy it to the output */
6614                for (i = 0; repsize-->0; ++i, ++str) {
6615                    c = PyUnicode_READ_CHAR(repunicode, i);
6616                    if (c >= limit) {
6617                        raise_encode_exception(&exc, encoding, unicode,
6618                                               pos, pos+1, reason);
6619                        Py_DECREF(repunicode);
6620                        goto onError;
6621                    }
6622                    *str = (char)c;
6623                }
6624                pos = newpos;
6625                Py_DECREF(repunicode);
6626            }
6627        }
6628    }
6629    /* Resize if we allocated to much */
6630    size = str - PyBytes_AS_STRING(res);
6631    if (size < ressize) { /* If this falls res will be NULL */
6632        assert(size >= 0);
6633        if (_PyBytes_Resize(&res, size) < 0)
6634            goto onError;
6635    }
6636
6637    Py_XDECREF(errorHandler);
6638    Py_XDECREF(exc);
6639    return res;
6640
6641  onError:
6642    Py_XDECREF(res);
6643    Py_XDECREF(errorHandler);
6644    Py_XDECREF(exc);
6645    return NULL;
6646}
6647
6648/* Deprecated */
6649PyObject *
6650PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6651                       Py_ssize_t size,
6652                       const char *errors)
6653{
6654    PyObject *result;
6655    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6656    if (unicode == NULL)
6657        return NULL;
6658    result = unicode_encode_ucs1(unicode, errors, 256);
6659    Py_DECREF(unicode);
6660    return result;
6661}
6662
6663PyObject *
6664_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6665{
6666    if (!PyUnicode_Check(unicode)) {
6667        PyErr_BadArgument();
6668        return NULL;
6669    }
6670    if (PyUnicode_READY(unicode) == -1)
6671        return NULL;
6672    /* Fast path: if it is a one-byte string, construct
6673       bytes object directly. */
6674    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6675        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6676                                         PyUnicode_GET_LENGTH(unicode));
6677    /* Non-Latin-1 characters present. Defer to above function to
6678       raise the exception. */
6679    return unicode_encode_ucs1(unicode, errors, 256);
6680}
6681
6682PyObject*
6683PyUnicode_AsLatin1String(PyObject *unicode)
6684{
6685    return _PyUnicode_AsLatin1String(unicode, NULL);
6686}
6687
6688/* --- 7-bit ASCII Codec -------------------------------------------------- */
6689
6690PyObject *
6691PyUnicode_DecodeASCII(const char *s,
6692                      Py_ssize_t size,
6693                      const char *errors)
6694{
6695    const char *starts = s;
6696    PyObject *v;
6697    int kind;
6698    void *data;
6699    Py_ssize_t startinpos;
6700    Py_ssize_t endinpos;
6701    Py_ssize_t outpos;
6702    const char *e;
6703    int has_error;
6704    const unsigned char *p = (const unsigned char *)s;
6705    const unsigned char *end = p + size;
6706    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
6707    PyObject *errorHandler = NULL;
6708    PyObject *exc = NULL;
6709
6710    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6711    if (size == 1 && (unsigned char)s[0] < 128)
6712        return get_latin1_char((unsigned char)s[0]);
6713
6714    has_error = 0;
6715    while (p < end && !has_error) {
6716        /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6717           an explanation. */
6718        if (!((size_t) p & LONG_PTR_MASK)) {
6719            /* Help register allocation */
6720            register const unsigned char *_p = p;
6721            while (_p < aligned_end) {
6722                unsigned long value = *(unsigned long *) _p;
6723                if (value & ASCII_CHAR_MASK) {
6724                    has_error = 1;
6725                    break;
6726                }
6727                _p += SIZEOF_LONG;
6728            }
6729            if (_p == end)
6730                break;
6731            if (has_error)
6732                break;
6733            p = _p;
6734        }
6735        if (*p & 0x80) {
6736            has_error = 1;
6737            break;
6738        }
6739        else {
6740            ++p;
6741        }
6742    }
6743    if (!has_error)
6744        return unicode_fromascii((const unsigned char *)s, size);
6745
6746    v = PyUnicode_New(size, 127);
6747    if (v == NULL)
6748        goto onError;
6749    if (size == 0)
6750        return v;
6751    kind = PyUnicode_KIND(v);
6752    data = PyUnicode_DATA(v);
6753    outpos = 0;
6754    e = s + size;
6755    while (s < e) {
6756        register unsigned char c = (unsigned char)*s;
6757        if (c < 128) {
6758            PyUnicode_WRITE(kind, data, outpos++, c);
6759            ++s;
6760        }
6761        else {
6762            startinpos = s-starts;
6763            endinpos = startinpos + 1;
6764            if (unicode_decode_call_errorhandler(
6765                    errors, &errorHandler,
6766                    "ascii", "ordinal not in range(128)",
6767                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6768                    &v, &outpos))
6769                goto onError;
6770            kind = PyUnicode_KIND(v);
6771            data = PyUnicode_DATA(v);
6772        }
6773    }
6774    if (PyUnicode_Resize(&v, outpos) < 0)
6775        goto onError;
6776    Py_XDECREF(errorHandler);
6777    Py_XDECREF(exc);
6778    assert(_PyUnicode_CheckConsistency(v, 1));
6779    return v;
6780
6781  onError:
6782    Py_XDECREF(v);
6783    Py_XDECREF(errorHandler);
6784    Py_XDECREF(exc);
6785    return NULL;
6786}
6787
6788/* Deprecated */
6789PyObject *
6790PyUnicode_EncodeASCII(const Py_UNICODE *p,
6791                      Py_ssize_t size,
6792                      const char *errors)
6793{
6794    PyObject *result;
6795    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6796    if (unicode == NULL)
6797        return NULL;
6798    result = unicode_encode_ucs1(unicode, errors, 128);
6799    Py_DECREF(unicode);
6800    return result;
6801}
6802
6803PyObject *
6804_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6805{
6806    if (!PyUnicode_Check(unicode)) {
6807        PyErr_BadArgument();
6808        return NULL;
6809    }
6810    if (PyUnicode_READY(unicode) == -1)
6811        return NULL;
6812    /* Fast path: if it is an ASCII-only string, construct bytes object
6813       directly. Else defer to above function to raise the exception. */
6814    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6815        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6816                                         PyUnicode_GET_LENGTH(unicode));
6817    return unicode_encode_ucs1(unicode, errors, 128);
6818}
6819
6820PyObject *
6821PyUnicode_AsASCIIString(PyObject *unicode)
6822{
6823    return _PyUnicode_AsASCIIString(unicode, NULL);
6824}
6825
6826#ifdef HAVE_MBCS
6827
6828/* --- MBCS codecs for Windows -------------------------------------------- */
6829
6830#if SIZEOF_INT < SIZEOF_SIZE_T
6831#define NEED_RETRY
6832#endif
6833
6834#ifndef WC_ERR_INVALID_CHARS
6835#  define WC_ERR_INVALID_CHARS 0x0080
6836#endif
6837
6838static char*
6839code_page_name(UINT code_page, PyObject **obj)
6840{
6841    *obj = NULL;
6842    if (code_page == CP_ACP)
6843        return "mbcs";
6844    if (code_page == CP_UTF7)
6845        return "CP_UTF7";
6846    if (code_page == CP_UTF8)
6847        return "CP_UTF8";
6848
6849    *obj = PyBytes_FromFormat("cp%u", code_page);
6850    if (*obj == NULL)
6851        return NULL;
6852    return PyBytes_AS_STRING(*obj);
6853}
6854
6855static int
6856is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6857{
6858    const char *curr = s + offset;
6859    const char *prev;
6860
6861    if (!IsDBCSLeadByteEx(code_page, *curr))
6862        return 0;
6863
6864    prev = CharPrevExA(code_page, s, curr, 0);
6865    if (prev == curr)
6866        return 1;
6867    /* FIXME: This code is limited to "true" double-byte encodings,
6868       as it assumes an incomplete character consists of a single
6869       byte. */
6870    if (curr - prev == 2)
6871        return 1;
6872    if (!IsDBCSLeadByteEx(code_page, *prev))
6873        return 1;
6874    return 0;
6875}
6876
6877static DWORD
6878decode_code_page_flags(UINT code_page)
6879{
6880    if (code_page == CP_UTF7) {
6881        /* The CP_UTF7 decoder only supports flags=0 */
6882        return 0;
6883    }
6884    else
6885        return MB_ERR_INVALID_CHARS;
6886}
6887
6888/*
6889 * Decode a byte string from a Windows code page into unicode object in strict
6890 * mode.
6891 *
6892 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6893 * WindowsError and returns -1 on other error.
6894 */
6895static int
6896decode_code_page_strict(UINT code_page,
6897                        PyObject **v,
6898                        const char *in,
6899                        int insize)
6900{
6901    const DWORD flags = decode_code_page_flags(code_page);
6902    wchar_t *out;
6903    DWORD outsize;
6904
6905    /* First get the size of the result */
6906    assert(insize > 0);
6907    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6908    if (outsize <= 0)
6909        goto error;
6910
6911    if (*v == NULL) {
6912        /* Create unicode object */
6913        *v = (PyObject*)_PyUnicode_New(outsize);
6914        if (*v == NULL)
6915            return -1;
6916        out = PyUnicode_AS_UNICODE(*v);
6917    }
6918    else {
6919        /* Extend unicode object */
6920        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6921        if (PyUnicode_Resize(v, n + outsize) < 0)
6922            return -1;
6923        out = PyUnicode_AS_UNICODE(*v) + n;
6924    }
6925
6926    /* Do the conversion */
6927    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6928    if (outsize <= 0)
6929        goto error;
6930    return insize;
6931
6932error:
6933    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6934        return -2;
6935    PyErr_SetFromWindowsErr(0);
6936    return -1;
6937}
6938
6939/*
6940 * Decode a byte string from a code page into unicode object with an error
6941 * handler.
6942 *
6943 * Returns consumed size if succeed, or raise a WindowsError or
6944 * UnicodeDecodeError exception and returns -1 on error.
6945 */
6946static int
6947decode_code_page_errors(UINT code_page,
6948                        PyObject **v,
6949                        const char *in, const int size,
6950                        const char *errors)
6951{
6952    const char *startin = in;
6953    const char *endin = in + size;
6954    const DWORD flags = decode_code_page_flags(code_page);
6955    /* Ideally, we should get reason from FormatMessage. This is the Windows
6956       2000 English version of the message. */
6957    const char *reason = "No mapping for the Unicode character exists "
6958                         "in the target code page.";
6959    /* each step cannot decode more than 1 character, but a character can be
6960       represented as a surrogate pair */
6961    wchar_t buffer[2], *startout, *out;
6962    int insize, outsize;
6963    PyObject *errorHandler = NULL;
6964    PyObject *exc = NULL;
6965    PyObject *encoding_obj = NULL;
6966    char *encoding;
6967    DWORD err;
6968    int ret = -1;
6969
6970    assert(size > 0);
6971
6972    encoding = code_page_name(code_page, &encoding_obj);
6973    if (encoding == NULL)
6974        return -1;
6975
6976    if (errors == NULL || strcmp(errors, "strict") == 0) {
6977        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6978           UnicodeDecodeError. */
6979        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6980        if (exc != NULL) {
6981            PyCodec_StrictErrors(exc);
6982            Py_CLEAR(exc);
6983        }
6984        goto error;
6985    }
6986
6987    if (*v == NULL) {
6988        /* Create unicode object */
6989        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6990            PyErr_NoMemory();
6991            goto error;
6992        }
6993        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6994        if (*v == NULL)
6995            goto error;
6996        startout = PyUnicode_AS_UNICODE(*v);
6997    }
6998    else {
6999        /* Extend unicode object */
7000        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7001        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7002            PyErr_NoMemory();
7003            goto error;
7004        }
7005        if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7006            goto error;
7007        startout = PyUnicode_AS_UNICODE(*v) + n;
7008    }
7009
7010    /* Decode the byte string character per character */
7011    out = startout;
7012    while (in < endin)
7013    {
7014        /* Decode a character */
7015        insize = 1;
7016        do
7017        {
7018            outsize = MultiByteToWideChar(code_page, flags,
7019                                          in, insize,
7020                                          buffer, Py_ARRAY_LENGTH(buffer));
7021            if (outsize > 0)
7022                break;
7023            err = GetLastError();
7024            if (err != ERROR_NO_UNICODE_TRANSLATION
7025                && err != ERROR_INSUFFICIENT_BUFFER)
7026            {
7027                PyErr_SetFromWindowsErr(0);
7028                goto error;
7029            }
7030            insize++;
7031        }
7032        /* 4=maximum length of a UTF-8 sequence */
7033        while (insize <= 4 && (in + insize) <= endin);
7034
7035        if (outsize <= 0) {
7036            Py_ssize_t startinpos, endinpos, outpos;
7037
7038            startinpos = in - startin;
7039            endinpos = startinpos + 1;
7040            outpos = out - PyUnicode_AS_UNICODE(*v);
7041            if (unicode_decode_call_errorhandler(
7042                    errors, &errorHandler,
7043                    encoding, reason,
7044                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7045                    v, &outpos))
7046            {
7047                goto error;
7048            }
7049            out = PyUnicode_AS_UNICODE(*v) + outpos;
7050        }
7051        else {
7052            in += insize;
7053            memcpy(out, buffer, outsize * sizeof(wchar_t));
7054            out += outsize;
7055        }
7056    }
7057
7058    /* write a NUL character at the end */
7059    *out = 0;
7060
7061    /* Extend unicode object */
7062    outsize = out - startout;
7063    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7064    if (PyUnicode_Resize(v, outsize) < 0)
7065        goto error;
7066    ret = size;
7067
7068error:
7069    Py_XDECREF(encoding_obj);
7070    Py_XDECREF(errorHandler);
7071    Py_XDECREF(exc);
7072    return ret;
7073}
7074
7075static PyObject *
7076decode_code_page_stateful(int code_page,
7077                          const char *s, Py_ssize_t size,
7078                          const char *errors, Py_ssize_t *consumed)
7079{
7080    PyObject *v = NULL;
7081    int chunk_size, final, converted, done;
7082
7083    if (code_page < 0) {
7084        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7085        return NULL;
7086    }
7087
7088    if (consumed)
7089        *consumed = 0;
7090
7091    do
7092    {
7093#ifdef NEED_RETRY
7094        if (size > INT_MAX) {
7095            chunk_size = INT_MAX;
7096            final = 0;
7097            done = 0;
7098        }
7099        else
7100#endif
7101        {
7102            chunk_size = (int)size;
7103            final = (consumed == NULL);
7104            done = 1;
7105        }
7106
7107        /* Skip trailing lead-byte unless 'final' is set */
7108        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7109            --chunk_size;
7110
7111        if (chunk_size == 0 && done) {
7112            if (v != NULL)
7113                break;
7114            Py_INCREF(unicode_empty);
7115            return unicode_empty;
7116        }
7117
7118
7119        converted = decode_code_page_strict(code_page, &v,
7120                                            s, chunk_size);
7121        if (converted == -2)
7122            converted = decode_code_page_errors(code_page, &v,
7123                                                s, chunk_size,
7124                                                errors);
7125        assert(converted != 0);
7126
7127        if (converted < 0) {
7128            Py_XDECREF(v);
7129            return NULL;
7130        }
7131
7132        if (consumed)
7133            *consumed += converted;
7134
7135        s += converted;
7136        size -= converted;
7137    } while (!done);
7138
7139#ifndef DONT_MAKE_RESULT_READY
7140    if (_PyUnicode_READY_REPLACE(&v)) {
7141        Py_DECREF(v);
7142        return NULL;
7143    }
7144#endif
7145    assert(_PyUnicode_CheckConsistency(v, 1));
7146    return v;
7147}
7148
7149PyObject *
7150PyUnicode_DecodeCodePageStateful(int code_page,
7151                                 const char *s,
7152                                 Py_ssize_t size,
7153                                 const char *errors,
7154                                 Py_ssize_t *consumed)
7155{
7156    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7157}
7158
7159PyObject *
7160PyUnicode_DecodeMBCSStateful(const char *s,
7161                             Py_ssize_t size,
7162                             const char *errors,
7163                             Py_ssize_t *consumed)
7164{
7165    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7166}
7167
7168PyObject *
7169PyUnicode_DecodeMBCS(const char *s,
7170                     Py_ssize_t size,
7171                     const char *errors)
7172{
7173    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7174}
7175
7176static DWORD
7177encode_code_page_flags(UINT code_page, const char *errors)
7178{
7179    if (code_page == CP_UTF8) {
7180        if (winver.dwMajorVersion >= 6)
7181            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7182               and later */
7183            return WC_ERR_INVALID_CHARS;
7184        else
7185            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7186            return 0;
7187    }
7188    else if (code_page == CP_UTF7) {
7189        /* CP_UTF7 only supports flags=0 */
7190        return 0;
7191    }
7192    else {
7193        if (errors != NULL && strcmp(errors, "replace") == 0)
7194            return 0;
7195        else
7196            return WC_NO_BEST_FIT_CHARS;
7197    }
7198}
7199
7200/*
7201 * Encode a Unicode string to a Windows code page into a byte string in strict
7202 * mode.
7203 *
7204 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7205 * a WindowsError and returns -1 on other error.
7206 */
7207static int
7208encode_code_page_strict(UINT code_page, PyObject **outbytes,
7209                        PyObject *unicode, Py_ssize_t offset, int len,
7210                        const char* errors)
7211{
7212    BOOL usedDefaultChar = FALSE;
7213    BOOL *pusedDefaultChar = &usedDefaultChar;
7214    int outsize;
7215    PyObject *exc = NULL;
7216    wchar_t *p;
7217    Py_ssize_t size;
7218    const DWORD flags = encode_code_page_flags(code_page, NULL);
7219    char *out;
7220    /* Create a substring so that we can get the UTF-16 representation
7221       of just the slice under consideration. */
7222    PyObject *substring;
7223
7224    assert(len > 0);
7225
7226    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7227        pusedDefaultChar = &usedDefaultChar;
7228    else
7229        pusedDefaultChar = NULL;
7230
7231    substring = PyUnicode_Substring(unicode, offset, offset+len);
7232    if (substring == NULL)
7233        return -1;
7234    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7235    if (p == NULL) {
7236        Py_DECREF(substring);
7237        return -1;
7238    }
7239
7240    /* First get the size of the result */
7241    outsize = WideCharToMultiByte(code_page, flags,
7242                                  p, size,
7243                                  NULL, 0,
7244                                  NULL, pusedDefaultChar);
7245    if (outsize <= 0)
7246        goto error;
7247    /* If we used a default char, then we failed! */
7248    if (pusedDefaultChar && *pusedDefaultChar) {
7249        Py_DECREF(substring);
7250        return -2;
7251    }
7252
7253    if (*outbytes == NULL) {
7254        /* Create string object */
7255        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7256        if (*outbytes == NULL) {
7257            Py_DECREF(substring);
7258            return -1;
7259        }
7260        out = PyBytes_AS_STRING(*outbytes);
7261    }
7262    else {
7263        /* Extend string object */
7264        const Py_ssize_t n = PyBytes_Size(*outbytes);
7265        if (outsize > PY_SSIZE_T_MAX - n) {
7266            PyErr_NoMemory();
7267            Py_DECREF(substring);
7268            return -1;
7269        }
7270        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7271            Py_DECREF(substring);
7272            return -1;
7273        }
7274        out = PyBytes_AS_STRING(*outbytes) + n;
7275    }
7276
7277    /* Do the conversion */
7278    outsize = WideCharToMultiByte(code_page, flags,
7279                                  p, size,
7280                                  out, outsize,
7281                                  NULL, pusedDefaultChar);
7282    Py_CLEAR(substring);
7283    if (outsize <= 0)
7284        goto error;
7285    if (pusedDefaultChar && *pusedDefaultChar)
7286        return -2;
7287    return 0;
7288
7289error:
7290    Py_XDECREF(substring);
7291    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7292        return -2;
7293    PyErr_SetFromWindowsErr(0);
7294    return -1;
7295}
7296
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The characters unicode[unicode_offset:unicode_offset+insize] are encoded
 * one at a time and appended to *outbytes (created if *outbytes is NULL).
 * Characters that cannot be encoded are passed to the error handler named by
 * "errors"; its replacement (bytes, or an ASCII-only str) is copied to the
 * output.
 *
 * Returns 0 on success, or -1 on error (e.g. a WindowsError raised through
 * PyErr_SetFromWindowsErr, a MemoryError, or an error coming from the error
 * handler). With errors == NULL or "strict" the function always fails with
 * -1 after handing a UnicodeEncodeError to PyCodec_StrictErrors.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The "used default char" flag is only requested for code pages other
       than UTF-8 and UTF-7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) output bytes per input character, guarding the
       multiplication against overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        /* Append after the bytes that were already present */
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            /* Character above U+FFFF: split it into a surrogate pair */
            ch -= 0x10000;
            chars[0] = 0xd800 + (ch >> 10);
            chars[1] = 0xdc00 + (ch & 0x3ff);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                /* Encoded without substitution: copy the bytes and move on
                   to the next character */
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character could not be encoded: ask the error handler for a
           replacement and for the position at which to resume */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied verbatim */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the output size by (outsize - 1); "out" must be
                   recomputed because the resize may move the buffer */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* Otherwise the replacement is read as a str, which must
               contain only characters <= 127 */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) < 0) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* Same size adjustment as for a bytes replacement */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* NOTE(review): pos was already set to newpos above, so
                       the range reported here is relative to the resume
                       position - confirm this is intended */
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Shrink the bytes object to the number of bytes actually written */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Reached on success as well: ret tells the two cases apart */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7485
7486static PyObject *
7487encode_code_page(int code_page,
7488                 PyObject *unicode,
7489                 const char *errors)
7490{
7491    Py_ssize_t len;
7492    PyObject *outbytes = NULL;
7493    Py_ssize_t offset;
7494    int chunk_len, ret, done;
7495
7496    if (PyUnicode_READY(unicode) < 0)
7497        return NULL;
7498    len = PyUnicode_GET_LENGTH(unicode);
7499
7500    if (code_page < 0) {
7501        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7502        return NULL;
7503    }
7504
7505    if (len == 0)
7506        return PyBytes_FromStringAndSize(NULL, 0);
7507
7508    offset = 0;
7509    do
7510    {
7511#ifdef NEED_RETRY
7512        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7513           chunks. */
7514        if (len > INT_MAX/2) {
7515            chunk_len = INT_MAX/2;
7516            done = 0;
7517        }
7518        else
7519#endif
7520        {
7521            chunk_len = (int)len;
7522            done = 1;
7523        }
7524
7525        ret = encode_code_page_strict(code_page, &outbytes,
7526                                      unicode, offset, chunk_len,
7527                                      errors);
7528        if (ret == -2)
7529            ret = encode_code_page_errors(code_page, &outbytes,
7530                                          unicode, offset,
7531                                          chunk_len, errors);
7532        if (ret < 0) {
7533            Py_XDECREF(outbytes);
7534            return NULL;
7535        }
7536
7537        offset += chunk_len;
7538        len -= chunk_len;
7539    } while (!done);
7540
7541    return outbytes;
7542}
7543
7544PyObject *
7545PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7546                     Py_ssize_t size,
7547                     const char *errors)
7548{
7549    PyObject *unicode, *res;
7550    unicode = PyUnicode_FromUnicode(p, size);
7551    if (unicode == NULL)
7552        return NULL;
7553    res = encode_code_page(CP_ACP, unicode, errors);
7554    Py_DECREF(unicode);
7555    return res;
7556}
7557
/* Public entry point: encode "unicode" to the Windows code page "code_page"
   using the error handler "errors". All work is delegated to
   encode_code_page(), whose result is returned unchanged. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    return encode_code_page(code_page, unicode, errors);
}
7565
7566PyObject *
7567PyUnicode_AsMBCSString(PyObject *unicode)
7568{
7569    if (!PyUnicode_Check(unicode)) {
7570        PyErr_BadArgument();
7571        return NULL;
7572    }
7573    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7574}
7575
7576#undef NEED_RETRY
7577
7578#endif /* HAVE_MBCS */
7579
7580/* --- Character Mapping Codec -------------------------------------------- */
7581
7582PyObject *
7583PyUnicode_DecodeCharmap(const char *s,
7584                        Py_ssize_t size,
7585                        PyObject *mapping,
7586                        const char *errors)
7587{
7588    const char *starts = s;
7589    Py_ssize_t startinpos;
7590    Py_ssize_t endinpos;
7591    Py_ssize_t outpos;
7592    const char *e;
7593    PyObject *v;
7594    Py_ssize_t extrachars = 0;
7595    PyObject *errorHandler = NULL;
7596    PyObject *exc = NULL;
7597
7598    /* Default to Latin-1 */
7599    if (mapping == NULL)
7600        return PyUnicode_DecodeLatin1(s, size, errors);
7601
7602    v = PyUnicode_New(size, 127);
7603    if (v == NULL)
7604        goto onError;
7605    if (size == 0)
7606        return v;
7607    outpos = 0;
7608    e = s + size;
7609    if (PyUnicode_CheckExact(mapping)) {
7610        Py_ssize_t maplen;
7611        enum PyUnicode_Kind kind;
7612        void *data;
7613        Py_UCS4 x;
7614
7615        if (PyUnicode_READY(mapping) < 0)
7616            return NULL;
7617
7618        maplen = PyUnicode_GET_LENGTH(mapping);
7619        data = PyUnicode_DATA(mapping);
7620        kind = PyUnicode_KIND(mapping);
7621        while (s < e) {
7622            unsigned char ch = *s;
7623
7624            if (ch < maplen)
7625                x = PyUnicode_READ(kind, data, ch);
7626            else
7627                x = 0xfffe; /* invalid value */
7628
7629            if (x == 0xfffe)
7630            {
7631                /* undefined mapping */
7632                startinpos = s-starts;
7633                endinpos = startinpos+1;
7634                if (unicode_decode_call_errorhandler(
7635                        errors, &errorHandler,
7636                        "charmap", "character maps to <undefined>",
7637                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7638                        &v, &outpos)) {
7639                    goto onError;
7640                }
7641                continue;
7642            }
7643
7644            if (unicode_putchar(&v, &outpos, x) < 0)
7645                goto onError;
7646            ++s;
7647        }
7648    }
7649    else {
7650        while (s < e) {
7651            unsigned char ch = *s;
7652            PyObject *w, *x;
7653
7654            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7655            w = PyLong_FromLong((long)ch);
7656            if (w == NULL)
7657                goto onError;
7658            x = PyObject_GetItem(mapping, w);
7659            Py_DECREF(w);
7660            if (x == NULL) {
7661                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7662                    /* No mapping found means: mapping is undefined. */
7663                    PyErr_Clear();
7664                    x = Py_None;
7665                    Py_INCREF(x);
7666                } else
7667                    goto onError;
7668            }
7669
7670            /* Apply mapping */
7671            if (PyLong_Check(x)) {
7672                long value = PyLong_AS_LONG(x);
7673                if (value < 0 || value > 65535) {
7674                    PyErr_SetString(PyExc_TypeError,
7675                                    "character mapping must be in range(65536)");
7676                    Py_DECREF(x);
7677                    goto onError;
7678                }
7679                if (unicode_putchar(&v, &outpos, value) < 0)
7680                    goto onError;
7681            }
7682            else if (x == Py_None) {
7683                /* undefined mapping */
7684                startinpos = s-starts;
7685                endinpos = startinpos+1;
7686                if (unicode_decode_call_errorhandler(
7687                        errors, &errorHandler,
7688                        "charmap", "character maps to <undefined>",
7689                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7690                        &v, &outpos)) {
7691                    Py_DECREF(x);
7692                    goto onError;
7693                }
7694                Py_DECREF(x);
7695                continue;
7696            }
7697            else if (PyUnicode_Check(x)) {
7698                Py_ssize_t targetsize;
7699
7700                if (PyUnicode_READY(x) < 0)
7701                    goto onError;
7702                targetsize = PyUnicode_GET_LENGTH(x);
7703
7704                if (targetsize == 1) {
7705                    /* 1-1 mapping */
7706                    if (unicode_putchar(&v, &outpos,
7707                                        PyUnicode_READ_CHAR(x, 0)) < 0)
7708                        goto onError;
7709                }
7710                else if (targetsize > 1) {
7711                    /* 1-n mapping */
7712                    if (targetsize > extrachars) {
7713                        /* resize first */
7714                        Py_ssize_t needed = (targetsize - extrachars) + \
7715                            (targetsize << 2);
7716                        extrachars += needed;
7717                        /* XXX overflow detection missing */
7718                        if (PyUnicode_Resize(&v,
7719                                             PyUnicode_GET_LENGTH(v) + needed) < 0) {
7720                            Py_DECREF(x);
7721                            goto onError;
7722                        }
7723                    }
7724                    if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7725                        goto onError;
7726                    PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7727                    outpos += targetsize;
7728                    extrachars -= targetsize;
7729                }
7730                /* 1-0 mapping: skip the character */
7731            }
7732            else {
7733                /* wrong return value */
7734                PyErr_SetString(PyExc_TypeError,
7735                                "character mapping must return integer, None or str");
7736                Py_DECREF(x);
7737                goto onError;
7738            }
7739            Py_DECREF(x);
7740            ++s;
7741        }
7742    }
7743    if (PyUnicode_Resize(&v, outpos) < 0)
7744        goto onError;
7745    Py_XDECREF(errorHandler);
7746    Py_XDECREF(exc);
7747    assert(_PyUnicode_CheckConsistency(v, 1));
7748    return v;
7749
7750  onError:
7751    Py_XDECREF(errorHandler);
7752    Py_XDECREF(exc);
7753    Py_XDECREF(v);
7754    return NULL;
7755}
7756
/* Charmap encoding: the lookup table */

/* Three-level trie mapping a character c <= 0xFFFF to a byte value.
   The character is split into c >> 11 (level 1 index), (c >> 7) & 0xF
   (offset inside a level-2 block) and c & 0x7F (offset inside a level-3
   block). The object is allocated with room for the whole level23 array,
   which is declared with a single element. */
struct encoding_map {
    PyObject_HEAD
    /* level-2 block number for each value of c >> 11; 0xFF = no block */
    unsigned char level1[32];
    /* number of 16-byte level-2 blocks and of 128-byte level-3 blocks */
    int count2, count3;
    /* 16*count2 level-2 bytes (level-3 block number, 0xFF = no block)
       followed by 128*count3 level-3 bytes (the encoded byte, 0 = unmapped) */
    unsigned char level23[1];
};
7765
7766static PyObject*
7767encoding_map_size(PyObject *obj, PyObject* args)
7768{
7769    struct encoding_map *map = (struct encoding_map*)obj;
7770    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7771                           128*map->count3);
7772}
7773
/* Method table of the EncodingMap type: a single no-argument method "size",
   terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
7779
/* tp_dealloc of the EncodingMap type: the object holds no references to
   other objects in its struct, so releasing its memory block is all that is
   done here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7785
/* Type object of the trie objects created by PyUnicode_BuildEncodingMap().
   Only the deallocator, the default flags and the method table are filled
   in; every other slot (including tp_new) is left at 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7829
/* Build an encoding table from a 256-character decoding string, where
   string[i] is the character that byte i decodes to and U+FFFE marks an
   unmapped byte.

   Returns a new EncodingMap trie object when the table can be represented
   by one, otherwise a new dict mapping character ordinals to byte values.
   Returns NULL with an exception set if "string" is not a str of length 256
   or if an allocation fails. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;
    int kind;
    void *data;
    Py_UCS4 ch;

    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    /* 0xFF = "no block assigned yet" in both scratch tables */
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0)
        need_dict = 1;
    /* First pass: count how many level-2 and level-3 blocks are needed */
    for (i = 1; i < 256; i++) {
        int l1, l2;
        ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* Block numbers are stored in one byte and 0xFF is the "unused" marker */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Fallback: dict {character ordinal: byte value} for all 256 bytes */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < 256; i++) {
            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
            value = PyLong_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* The struct already contains one level23 byte, hence the "- 1" */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);
    /* Second pass: assign the level-3 blocks and store the byte values */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        int o1, o2, o3, i2, i3;
        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = PyUnicode_READ(kind, data, i)>>11;
        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
7935
7936static int
7937encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7938{
7939    struct encoding_map *map = (struct encoding_map*)mapping;
7940    int l1 = c>>11;
7941    int l2 = (c>>7) & 0xF;
7942    int l3 = c & 0x7F;
7943    int i;
7944
7945#ifdef Py_UNICODE_WIDE
7946    if (c > 0xFFFF) {
7947        return -1;
7948    }
7949#endif
7950    if (c == 0)
7951        return 0;
7952    /* level 1*/
7953    i = map->level1[l1];
7954    if (i == 0xFF) {
7955        return -1;
7956    }
7957    /* level 2*/
7958    i = map->level23[16*i+l2];
7959    if (i == 0xFF) {
7960        return -1;
7961    }
7962    /* level 3 */
7963    i = map->level23[16*map->count2 + 128*i + l3];
7964    if (i == 0) {
7965        return -1;
7966    }
7967    return i;
7968}
7969
7970/* Lookup the character ch in the mapping. If the character
7971   can't be found, Py_None is returned (or NULL, if another
7972   error occurred). */
7973static PyObject *
7974charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
7975{
7976    PyObject *w = PyLong_FromLong((long)c);
7977    PyObject *x;
7978
7979    if (w == NULL)
7980        return NULL;
7981    x = PyObject_GetItem(mapping, w);
7982    Py_DECREF(w);
7983    if (x == NULL) {
7984        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7985            /* No mapping found means: mapping is undefined. */
7986            PyErr_Clear();
7987            x = Py_None;
7988            Py_INCREF(x);
7989            return x;
7990        } else
7991            return NULL;
7992    }
7993    else if (x == Py_None)
7994        return x;
7995    else if (PyLong_Check(x)) {
7996        long value = PyLong_AS_LONG(x);
7997        if (value < 0 || value > 255) {
7998            PyErr_SetString(PyExc_TypeError,
7999                            "character mapping must be in range(256)");
8000            Py_DECREF(x);
8001            return NULL;
8002        }
8003        return x;
8004    }
8005    else if (PyBytes_Check(x))
8006        return x;
8007    else {
8008        /* wrong return value */
8009        PyErr_Format(PyExc_TypeError,
8010                     "character mapping must return integer, bytes or None, not %.400s",
8011                     x->ob_type->tp_name);
8012        Py_DECREF(x);
8013        return NULL;
8014    }
8015}
8016
8017static int
8018charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8019{
8020    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8021    /* exponentially overallocate to minimize reallocations */
8022    if (requiredsize < 2*outsize)
8023        requiredsize = 2*outsize;
8024    if (_PyBytes_Resize(outobj, requiredsize))
8025        return -1;
8026    return 0;
8027}
8028
/* Result of charmapencode_output():
   enc_SUCCESS   - the character was encoded and written to the output,
   enc_FAILED    - the character has no mapping (nothing was written),
   enc_EXCEPTION - an error occurred (lookup or output resize failed). */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
8032/* lookup the character, put the result in the output string and adjust
8033   various state variables. Resize the output bytes object if not enough
8034   space is available. Return a new reference to the object that
8035   was put in the output buffer, or Py_None, if the mapping was undefined
8036   (in which case no character was written) or NULL, if a
8037   reallocation error occurred. The caller must decref the result */
8038static charmapencode_result
8039charmapencode_output(Py_UNICODE c, PyObject *mapping,
8040                     PyObject **outobj, Py_ssize_t *outpos)
8041{
8042    PyObject *rep;
8043    char *outstart;
8044    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8045
8046    if (Py_TYPE(mapping) == &EncodingMapType) {
8047        int res = encoding_map_lookup(c, mapping);
8048        Py_ssize_t requiredsize = *outpos+1;
8049        if (res == -1)
8050            return enc_FAILED;
8051        if (outsize<requiredsize)
8052            if (charmapencode_resize(outobj, outpos, requiredsize))
8053                return enc_EXCEPTION;
8054        outstart = PyBytes_AS_STRING(*outobj);
8055        outstart[(*outpos)++] = (char)res;
8056        return enc_SUCCESS;
8057    }
8058
8059    rep = charmapencode_lookup(c, mapping);
8060    if (rep==NULL)
8061        return enc_EXCEPTION;
8062    else if (rep==Py_None) {
8063        Py_DECREF(rep);
8064        return enc_FAILED;
8065    } else {
8066        if (PyLong_Check(rep)) {
8067            Py_ssize_t requiredsize = *outpos+1;
8068            if (outsize<requiredsize)
8069                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8070                    Py_DECREF(rep);
8071                    return enc_EXCEPTION;
8072                }
8073            outstart = PyBytes_AS_STRING(*outobj);
8074            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8075        }
8076        else {
8077            const char *repchars = PyBytes_AS_STRING(rep);
8078            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8079            Py_ssize_t requiredsize = *outpos+repsize;
8080            if (outsize<requiredsize)
8081                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8082                    Py_DECREF(rep);
8083                    return enc_EXCEPTION;
8084                }
8085            outstart = PyBytes_AS_STRING(*outobj);
8086            memcpy(outstart + *outpos, repchars, repsize);
8087            *outpos += repsize;
8088        }
8089    }
8090    Py_DECREF(rep);
8091    return enc_SUCCESS;
8092}
8093
8094/* handle an error in PyUnicode_EncodeCharmap
8095   Return 0 on success, -1 on error */
8096static int
8097charmap_encoding_error(
8098    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8099    PyObject **exceptionObject,
8100    int *known_errorHandler, PyObject **errorHandler, const char *errors,
8101    PyObject **res, Py_ssize_t *respos)
8102{
8103    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8104    Py_ssize_t size, repsize;
8105    Py_ssize_t newpos;
8106    Py_UNICODE *uni2;
8107    /* startpos for collecting unencodable chars */
8108    Py_ssize_t collstartpos = *inpos;
8109    Py_ssize_t collendpos = *inpos+1;
8110    Py_ssize_t collpos;
8111    char *encoding = "charmap";
8112    char *reason = "character maps to <undefined>";
8113    charmapencode_result x;
8114    Py_UCS4 ch;
8115    int val;
8116
8117    if (PyUnicode_READY(unicode) < 0)
8118        return -1;
8119    size = PyUnicode_GET_LENGTH(unicode);
8120    /* find all unencodable characters */
8121    while (collendpos < size) {
8122        PyObject *rep;
8123        if (Py_TYPE(mapping) == &EncodingMapType) {
8124            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8125            val = encoding_map_lookup(ch, mapping);
8126            if (val != -1)
8127                break;
8128            ++collendpos;
8129            continue;
8130        }
8131
8132        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8133        rep = charmapencode_lookup(ch, mapping);
8134        if (rep==NULL)
8135            return -1;
8136        else if (rep!=Py_None) {
8137            Py_DECREF(rep);
8138            break;
8139        }
8140        Py_DECREF(rep);
8141        ++collendpos;
8142    }
8143    /* cache callback name lookup
8144     * (if not done yet, i.e. it's the first error) */
8145    if (*known_errorHandler==-1) {
8146        if ((errors==NULL) || (!strcmp(errors, "strict")))
8147            *known_errorHandler = 1;
8148        else if (!strcmp(errors, "replace"))
8149            *known_errorHandler = 2;
8150        else if (!strcmp(errors, "ignore"))
8151            *known_errorHandler = 3;
8152        else if (!strcmp(errors, "xmlcharrefreplace"))
8153            *known_errorHandler = 4;
8154        else
8155            *known_errorHandler = 0;
8156    }
8157    switch (*known_errorHandler) {
8158    case 1: /* strict */
8159        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8160        return -1;
8161    case 2: /* replace */
8162        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8163            x = charmapencode_output('?', mapping, res, respos);
8164            if (x==enc_EXCEPTION) {
8165                return -1;
8166            }
8167            else if (x==enc_FAILED) {
8168                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8169                return -1;
8170            }
8171        }
8172        /* fall through */
8173    case 3: /* ignore */
8174        *inpos = collendpos;
8175        break;
8176    case 4: /* xmlcharrefreplace */
8177        /* generate replacement (temporarily (mis)uses p) */
8178        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8179            char buffer[2+29+1+1];
8180            char *cp;
8181            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8182            for (cp = buffer; *cp; ++cp) {
8183                x = charmapencode_output(*cp, mapping, res, respos);
8184                if (x==enc_EXCEPTION)
8185                    return -1;
8186                else if (x==enc_FAILED) {
8187                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8188                    return -1;
8189                }
8190            }
8191        }
8192        *inpos = collendpos;
8193        break;
8194    default:
8195        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
8196                                                      encoding, reason, unicode, exceptionObject,
8197                                                      collstartpos, collendpos, &newpos);
8198        if (repunicode == NULL)
8199            return -1;
8200        if (PyBytes_Check(repunicode)) {
8201            /* Directly copy bytes result to output. */
8202            Py_ssize_t outsize = PyBytes_Size(*res);
8203            Py_ssize_t requiredsize;
8204            repsize = PyBytes_Size(repunicode);
8205            requiredsize = *respos + repsize;
8206            if (requiredsize > outsize)
8207                /* Make room for all additional bytes. */
8208                if (charmapencode_resize(res, respos, requiredsize)) {
8209                    Py_DECREF(repunicode);
8210                    return -1;
8211                }
8212            memcpy(PyBytes_AsString(*res) + *respos,
8213                   PyBytes_AsString(repunicode),  repsize);
8214            *respos += repsize;
8215            *inpos = newpos;
8216            Py_DECREF(repunicode);
8217            break;
8218        }
8219        /* generate replacement  */
8220        repsize = PyUnicode_GET_SIZE(repunicode);
8221        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8222            x = charmapencode_output(*uni2, mapping, res, respos);
8223            if (x==enc_EXCEPTION) {
8224                return -1;
8225            }
8226            else if (x==enc_FAILED) {
8227                Py_DECREF(repunicode);
8228                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8229                return -1;
8230            }
8231        }
8232        *inpos = newpos;
8233        Py_DECREF(repunicode);
8234    }
8235    return 0;
8236}
8237
8238PyObject *
8239_PyUnicode_EncodeCharmap(PyObject *unicode,
8240                         PyObject *mapping,
8241                         const char *errors)
8242{
8243    /* output object */
8244    PyObject *res = NULL;
8245    /* current input position */
8246    Py_ssize_t inpos = 0;
8247    Py_ssize_t size;
8248    /* current output position */
8249    Py_ssize_t respos = 0;
8250    PyObject *errorHandler = NULL;
8251    PyObject *exc = NULL;
8252    /* the following variable is used for caching string comparisons
8253     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8254     * 3=ignore, 4=xmlcharrefreplace */
8255    int known_errorHandler = -1;
8256
8257    if (PyUnicode_READY(unicode) < 0)
8258        return NULL;
8259    size = PyUnicode_GET_LENGTH(unicode);
8260
8261    /* Default to Latin-1 */
8262    if (mapping == NULL)
8263        return unicode_encode_ucs1(unicode, errors, 256);
8264
8265    /* allocate enough for a simple encoding without
8266       replacements, if we need more, we'll resize */
8267    res = PyBytes_FromStringAndSize(NULL, size);
8268    if (res == NULL)
8269        goto onError;
8270    if (size == 0)
8271        return res;
8272
8273    while (inpos<size) {
8274        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
8275        /* try to encode it */
8276        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8277        if (x==enc_EXCEPTION) /* error */
8278            goto onError;
8279        if (x==enc_FAILED) { /* unencodable character */
8280            if (charmap_encoding_error(unicode, &inpos, mapping,
8281                                       &exc,
8282                                       &known_errorHandler, &errorHandler, errors,
8283                                       &res, &respos)) {
8284                goto onError;
8285            }
8286        }
8287        else
8288            /* done with this character => adjust input position */
8289            ++inpos;
8290    }
8291
8292    /* Resize if we allocated to much */
8293    if (respos<PyBytes_GET_SIZE(res))
8294        if (_PyBytes_Resize(&res, respos) < 0)
8295            goto onError;
8296
8297    Py_XDECREF(exc);
8298    Py_XDECREF(errorHandler);
8299    return res;
8300
8301  onError:
8302    Py_XDECREF(res);
8303    Py_XDECREF(exc);
8304    Py_XDECREF(errorHandler);
8305    return NULL;
8306}
8307
8308/* Deprecated */
8309PyObject *
8310PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8311                        Py_ssize_t size,
8312                        PyObject *mapping,
8313                        const char *errors)
8314{
8315    PyObject *result;
8316    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8317    if (unicode == NULL)
8318        return NULL;
8319    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8320    Py_DECREF(unicode);
8321    return result;
8322}
8323
8324PyObject *
8325PyUnicode_AsCharmapString(PyObject *unicode,
8326                          PyObject *mapping)
8327{
8328    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8329        PyErr_BadArgument();
8330        return NULL;
8331    }
8332    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8333}
8334
8335/* create or adjust a UnicodeTranslateError */
8336static void
8337make_translate_exception(PyObject **exceptionObject,
8338                         PyObject *unicode,
8339                         Py_ssize_t startpos, Py_ssize_t endpos,
8340                         const char *reason)
8341{
8342    if (*exceptionObject == NULL) {
8343        *exceptionObject = _PyUnicodeTranslateError_Create(
8344            unicode, startpos, endpos, reason);
8345    }
8346    else {
8347        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8348            goto onError;
8349        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8350            goto onError;
8351        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8352            goto onError;
8353        return;
8354      onError:
8355        Py_DECREF(*exceptionObject);
8356        *exceptionObject = NULL;
8357    }
8358}
8359
8360/* raises a UnicodeTranslateError */
8361static void
8362raise_translate_exception(PyObject **exceptionObject,
8363                          PyObject *unicode,
8364                          Py_ssize_t startpos, Py_ssize_t endpos,
8365                          const char *reason)
8366{
8367    make_translate_exception(exceptionObject,
8368                             unicode, startpos, endpos, reason);
8369    if (*exceptionObject != NULL)
8370        PyCodec_StrictErrors(*exceptionObject);
8371}
8372
8373/* error handling callback helper:
8374   build arguments, call the callback and check the arguments,
8375   put the result into newpos and return the replacement string, which
8376   has to be freed by the caller */
8377static PyObject *
8378unicode_translate_call_errorhandler(const char *errors,
8379                                    PyObject **errorHandler,
8380                                    const char *reason,
8381                                    PyObject *unicode, PyObject **exceptionObject,
8382                                    Py_ssize_t startpos, Py_ssize_t endpos,
8383                                    Py_ssize_t *newpos)
8384{
8385    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8386
8387    Py_ssize_t i_newpos;
8388    PyObject *restuple;
8389    PyObject *resunicode;
8390
8391    if (*errorHandler == NULL) {
8392        *errorHandler = PyCodec_LookupError(errors);
8393        if (*errorHandler == NULL)
8394            return NULL;
8395    }
8396
8397    make_translate_exception(exceptionObject,
8398                             unicode, startpos, endpos, reason);
8399    if (*exceptionObject == NULL)
8400        return NULL;
8401
8402    restuple = PyObject_CallFunctionObjArgs(
8403        *errorHandler, *exceptionObject, NULL);
8404    if (restuple == NULL)
8405        return NULL;
8406    if (!PyTuple_Check(restuple)) {
8407        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8408        Py_DECREF(restuple);
8409        return NULL;
8410    }
8411    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8412                          &resunicode, &i_newpos)) {
8413        Py_DECREF(restuple);
8414        return NULL;
8415    }
8416    if (i_newpos<0)
8417        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8418    else
8419        *newpos = i_newpos;
8420    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8421        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8422        Py_DECREF(restuple);
8423        return NULL;
8424    }
8425    Py_INCREF(resunicode);
8426    Py_DECREF(restuple);
8427    return resunicode;
8428}
8429
8430/* Lookup the character ch in the mapping and put the result in result,
8431   which must be decrefed by the caller.
8432   Return 0 on success, -1 on error */
8433static int
8434charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8435{
8436    PyObject *w = PyLong_FromLong((long)c);
8437    PyObject *x;
8438
8439    if (w == NULL)
8440        return -1;
8441    x = PyObject_GetItem(mapping, w);
8442    Py_DECREF(w);
8443    if (x == NULL) {
8444        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8445            /* No mapping found means: use 1:1 mapping. */
8446            PyErr_Clear();
8447            *result = NULL;
8448            return 0;
8449        } else
8450            return -1;
8451    }
8452    else if (x == Py_None) {
8453        *result = x;
8454        return 0;
8455    }
8456    else if (PyLong_Check(x)) {
8457        long value = PyLong_AS_LONG(x);
8458        long max = PyUnicode_GetMax();
8459        if (value < 0 || value > max) {
8460            PyErr_Format(PyExc_TypeError,
8461                         "character mapping must be in range(0x%x)", max+1);
8462            Py_DECREF(x);
8463            return -1;
8464        }
8465        *result = x;
8466        return 0;
8467    }
8468    else if (PyUnicode_Check(x)) {
8469        *result = x;
8470        return 0;
8471    }
8472    else {
8473        /* wrong return value */
8474        PyErr_SetString(PyExc_TypeError,
8475                        "character mapping must return integer, None or str");
8476        Py_DECREF(x);
8477        return -1;
8478    }
8479}
8480/* ensure that *outobj is at least requiredsize characters long,
8481   if not reallocate and adjust various state variables.
8482   Return 0 on success, -1 on error */
8483static int
8484charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8485                               Py_ssize_t requiredsize)
8486{
8487    Py_ssize_t oldsize = *psize;
8488    if (requiredsize > oldsize) {
8489        /* exponentially overallocate to minimize reallocations */
8490        if (requiredsize < 2 * oldsize)
8491            requiredsize = 2 * oldsize;
8492        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8493        if (*outobj == 0)
8494            return -1;
8495        *psize = requiredsize;
8496    }
8497    return 0;
8498}
8499/* lookup the character, put the result in the output string and adjust
8500   various state variables. Return a new reference to the object that
8501   was put in the output buffer in *result, or Py_None, if the mapping was
8502   undefined (in which case no character was written).
8503   The called must decref result.
8504   Return 0 on success, -1 on error. */
8505static int
8506charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8507                        PyObject *mapping, Py_UCS4 **output,
8508                        Py_ssize_t *osize, Py_ssize_t *opos,
8509                        PyObject **res)
8510{
8511    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8512    if (charmaptranslate_lookup(curinp, mapping, res))
8513        return -1;
8514    if (*res==NULL) {
8515        /* not found => default to 1:1 mapping */
8516        (*output)[(*opos)++] = curinp;
8517    }
8518    else if (*res==Py_None)
8519        ;
8520    else if (PyLong_Check(*res)) {
8521        /* no overflow check, because we know that the space is enough */
8522        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8523    }
8524    else if (PyUnicode_Check(*res)) {
8525        Py_ssize_t repsize;
8526        if (PyUnicode_READY(*res) == -1)
8527            return -1;
8528        repsize = PyUnicode_GET_LENGTH(*res);
8529        if (repsize==1) {
8530            /* no overflow check, because we know that the space is enough */
8531            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8532        }
8533        else if (repsize!=0) {
8534            /* more than one character */
8535            Py_ssize_t requiredsize = *opos +
8536                (PyUnicode_GET_LENGTH(input) - ipos) +
8537                repsize - 1;
8538            Py_ssize_t i;
8539            if (charmaptranslate_makespace(output, osize, requiredsize))
8540                return -1;
8541            for(i = 0; i < repsize; i++)
8542                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8543        }
8544    }
8545    else
8546        return -1;
8547    return 0;
8548}
8549
8550PyObject *
8551_PyUnicode_TranslateCharmap(PyObject *input,
8552                            PyObject *mapping,
8553                            const char *errors)
8554{
8555    /* input object */
8556    char *idata;
8557    Py_ssize_t size, i;
8558    int kind;
8559    /* output buffer */
8560    Py_UCS4 *output = NULL;
8561    Py_ssize_t osize;
8562    PyObject *res;
8563    /* current output position */
8564    Py_ssize_t opos;
8565    char *reason = "character maps to <undefined>";
8566    PyObject *errorHandler = NULL;
8567    PyObject *exc = NULL;
8568    /* the following variable is used for caching string comparisons
8569     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8570     * 3=ignore, 4=xmlcharrefreplace */
8571    int known_errorHandler = -1;
8572
8573    if (mapping == NULL) {
8574        PyErr_BadArgument();
8575        return NULL;
8576    }
8577
8578    if (PyUnicode_READY(input) == -1)
8579        return NULL;
8580    idata = (char*)PyUnicode_DATA(input);
8581    kind = PyUnicode_KIND(input);
8582    size = PyUnicode_GET_LENGTH(input);
8583    i = 0;
8584
8585    if (size == 0) {
8586        Py_INCREF(input);
8587        return input;
8588    }
8589
8590    /* allocate enough for a simple 1:1 translation without
8591       replacements, if we need more, we'll resize */
8592    osize = size;
8593    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8594    opos = 0;
8595    if (output == NULL) {
8596        PyErr_NoMemory();
8597        goto onError;
8598    }
8599
8600    while (i<size) {
8601        /* try to encode it */
8602        PyObject *x = NULL;
8603        if (charmaptranslate_output(input, i, mapping,
8604                                    &output, &osize, &opos, &x)) {
8605            Py_XDECREF(x);
8606            goto onError;
8607        }
8608        Py_XDECREF(x);
8609        if (x!=Py_None) /* it worked => adjust input pointer */
8610            ++i;
8611        else { /* untranslatable character */
8612            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8613            Py_ssize_t repsize;
8614            Py_ssize_t newpos;
8615            Py_ssize_t uni2;
8616            /* startpos for collecting untranslatable chars */
8617            Py_ssize_t collstart = i;
8618            Py_ssize_t collend = i+1;
8619            Py_ssize_t coll;
8620
8621            /* find all untranslatable characters */
8622            while (collend < size) {
8623                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8624                    goto onError;
8625                Py_XDECREF(x);
8626                if (x!=Py_None)
8627                    break;
8628                ++collend;
8629            }
8630            /* cache callback name lookup
8631             * (if not done yet, i.e. it's the first error) */
8632            if (known_errorHandler==-1) {
8633                if ((errors==NULL) || (!strcmp(errors, "strict")))
8634                    known_errorHandler = 1;
8635                else if (!strcmp(errors, "replace"))
8636                    known_errorHandler = 2;
8637                else if (!strcmp(errors, "ignore"))
8638                    known_errorHandler = 3;
8639                else if (!strcmp(errors, "xmlcharrefreplace"))
8640                    known_errorHandler = 4;
8641                else
8642                    known_errorHandler = 0;
8643            }
8644            switch (known_errorHandler) {
8645            case 1: /* strict */
8646                raise_translate_exception(&exc, input, collstart,
8647                                          collend, reason);
8648                goto onError;
8649            case 2: /* replace */
8650                /* No need to check for space, this is a 1:1 replacement */
8651                for (coll = collstart; coll<collend; coll++)
8652                    output[opos++] = '?';
8653                /* fall through */
8654            case 3: /* ignore */
8655                i = collend;
8656                break;
8657            case 4: /* xmlcharrefreplace */
8658                /* generate replacement (temporarily (mis)uses i) */
8659                for (i = collstart; i < collend; ++i) {
8660                    char buffer[2+29+1+1];
8661                    char *cp;
8662                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8663                    if (charmaptranslate_makespace(&output, &osize,
8664                                                   opos+strlen(buffer)+(size-collend)))
8665                        goto onError;
8666                    for (cp = buffer; *cp; ++cp)
8667                        output[opos++] = *cp;
8668                }
8669                i = collend;
8670                break;
8671            default:
8672                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8673                                                                 reason, input, &exc,
8674                                                                 collstart, collend, &newpos);
8675                if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
8676                    goto onError;
8677                /* generate replacement  */
8678                repsize = PyUnicode_GET_LENGTH(repunicode);
8679                if (charmaptranslate_makespace(&output, &osize,
8680                                               opos+repsize+(size-collend))) {
8681                    Py_DECREF(repunicode);
8682                    goto onError;
8683                }
8684                for (uni2 = 0; repsize-->0; ++uni2)
8685                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8686                i = newpos;
8687                Py_DECREF(repunicode);
8688            }
8689        }
8690    }
8691    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8692    if (!res)
8693        goto onError;
8694    PyMem_Free(output);
8695    Py_XDECREF(exc);
8696    Py_XDECREF(errorHandler);
8697    return res;
8698
8699  onError:
8700    PyMem_Free(output);
8701    Py_XDECREF(exc);
8702    Py_XDECREF(errorHandler);
8703    return NULL;
8704}
8705
8706/* Deprecated. Use PyUnicode_Translate instead. */
8707PyObject *
8708PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8709                           Py_ssize_t size,
8710                           PyObject *mapping,
8711                           const char *errors)
8712{
8713    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8714    if (!unicode)
8715        return NULL;
8716    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8717}
8718
8719PyObject *
8720PyUnicode_Translate(PyObject *str,
8721                    PyObject *mapping,
8722                    const char *errors)
8723{
8724    PyObject *result;
8725
8726    str = PyUnicode_FromObject(str);
8727    if (str == NULL)
8728        goto onError;
8729    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8730    Py_DECREF(str);
8731    return result;
8732
8733  onError:
8734    Py_XDECREF(str);
8735    return NULL;
8736}
8737
8738static Py_UCS4
8739fix_decimal_and_space_to_ascii(PyObject *self)
8740{
8741    /* No need to call PyUnicode_READY(self) because this function is only
8742       called as a callback from fixup() which does it already. */
8743    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8744    const int kind = PyUnicode_KIND(self);
8745    void *data = PyUnicode_DATA(self);
8746    Py_UCS4 maxchar = 0, ch, fixed;
8747    Py_ssize_t i;
8748
8749    for (i = 0; i < len; ++i) {
8750        ch = PyUnicode_READ(kind, data, i);
8751        fixed = 0;
8752        if (ch > 127) {
8753            if (Py_UNICODE_ISSPACE(ch))
8754                fixed = ' ';
8755            else {
8756                const int decimal = Py_UNICODE_TODECIMAL(ch);
8757                if (decimal >= 0)
8758                    fixed = '0' + decimal;
8759            }
8760            if (fixed != 0) {
8761                if (fixed > maxchar)
8762                    maxchar = fixed;
8763                PyUnicode_WRITE(kind, data, i, fixed);
8764            }
8765            else if (ch > maxchar)
8766                maxchar = ch;
8767        }
8768        else if (ch > maxchar)
8769            maxchar = ch;
8770    }
8771
8772    return maxchar;
8773}
8774
8775PyObject *
8776_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8777{
8778    if (!PyUnicode_Check(unicode)) {
8779        PyErr_BadInternalCall();
8780        return NULL;
8781    }
8782    if (PyUnicode_READY(unicode) == -1)
8783        return NULL;
8784    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8785        /* If the string is already ASCII, just return the same string */
8786        Py_INCREF(unicode);
8787        return unicode;
8788    }
8789    return fixup(unicode, fix_decimal_and_space_to_ascii);
8790}
8791
8792PyObject *
8793PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8794                                  Py_ssize_t length)
8795{
8796    PyObject *result;
8797    Py_UNICODE *p; /* write pointer into result */
8798    Py_ssize_t i;
8799    /* Copy to a new string */
8800    result = (PyObject *)_PyUnicode_New(length);
8801    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8802    if (result == NULL)
8803        return result;
8804    p = PyUnicode_AS_UNICODE(result);
8805    /* Iterate over code points */
8806    for (i = 0; i < length; i++) {
8807        Py_UNICODE ch =s[i];
8808        if (ch > 127) {
8809            int decimal = Py_UNICODE_TODECIMAL(ch);
8810            if (decimal >= 0)
8811                p[i] = '0' + decimal;
8812        }
8813    }
8814#ifndef DONT_MAKE_RESULT_READY
8815    if (_PyUnicode_READY_REPLACE(&result)) {
8816        Py_DECREF(result);
8817        return NULL;
8818    }
8819#endif
8820    assert(_PyUnicode_CheckConsistency(result, 1));
8821    return result;
8822}
8823/* --- Decimal Encoder ---------------------------------------------------- */
8824
8825int
8826PyUnicode_EncodeDecimal(Py_UNICODE *s,
8827                        Py_ssize_t length,
8828                        char *output,
8829                        const char *errors)
8830{
8831    Py_UNICODE *p, *end;
8832    PyObject *errorHandler = NULL;
8833    PyObject *exc = NULL;
8834    PyObject *unicode;
8835    const char *encoding = "decimal";
8836    const char *reason = "invalid decimal Unicode string";
8837    /* the following variable is used for caching string comparisons
8838     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8839    int known_errorHandler = -1;
8840
8841    if (output == NULL) {
8842        PyErr_BadArgument();
8843        return -1;
8844    }
8845
8846    p = s;
8847    end = s + length;
8848    while (p < end) {
8849        register Py_UNICODE ch = *p;
8850        int decimal;
8851        PyObject *repunicode;
8852        Py_ssize_t repsize;
8853        Py_ssize_t newpos;
8854        Py_UNICODE *uni2;
8855        Py_UNICODE *collstart;
8856        Py_UNICODE *collend;
8857
8858        if (Py_UNICODE_ISSPACE(ch)) {
8859            *output++ = ' ';
8860            ++p;
8861            continue;
8862        }
8863        decimal = Py_UNICODE_TODECIMAL(ch);
8864        if (decimal >= 0) {
8865            *output++ = '0' + decimal;
8866            ++p;
8867            continue;
8868        }
8869        if (0 < ch && ch < 256) {
8870            *output++ = (char)ch;
8871            ++p;
8872            continue;
8873        }
8874        /* All other characters are considered unencodable */
8875        collstart = p;
8876        collend = p+1;
8877        while (collend < end) {
8878            if ((0 < *collend && *collend < 256) ||
8879                !Py_UNICODE_ISSPACE(*collend) ||
8880                Py_UNICODE_TODECIMAL(*collend))
8881                break;
8882        }
8883        /* cache callback name lookup
8884         * (if not done yet, i.e. it's the first error) */
8885        if (known_errorHandler==-1) {
8886            if ((errors==NULL) || (!strcmp(errors, "strict")))
8887                known_errorHandler = 1;
8888            else if (!strcmp(errors, "replace"))
8889                known_errorHandler = 2;
8890            else if (!strcmp(errors, "ignore"))
8891                known_errorHandler = 3;
8892            else if (!strcmp(errors, "xmlcharrefreplace"))
8893                known_errorHandler = 4;
8894            else
8895                known_errorHandler = 0;
8896        }
8897        switch (known_errorHandler) {
8898        case 1: /* strict */
8899            unicode = PyUnicode_FromUnicode(s, length);
8900            if (unicode == NULL)
8901                goto onError;
8902            raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason);
8903            Py_DECREF(unicode);
8904            goto onError;
8905        case 2: /* replace */
8906            for (p = collstart; p < collend; ++p)
8907                *output++ = '?';
8908            /* fall through */
8909        case 3: /* ignore */
8910            p = collend;
8911            break;
8912        case 4: /* xmlcharrefreplace */
8913            /* generate replacement (temporarily (mis)uses p) */
8914            for (p = collstart; p < collend; ++p)
8915                output += sprintf(output, "&#%d;", (int)*p);
8916            p = collend;
8917            break;
8918        default:
8919            unicode = PyUnicode_FromUnicode(s, length);
8920            if (unicode == NULL)
8921                goto onError;
8922            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8923                                                          encoding, reason, unicode, &exc,
8924                                                          collstart-s, collend-s, &newpos);
8925            Py_DECREF(unicode);
8926            if (repunicode == NULL)
8927                goto onError;
8928            if (!PyUnicode_Check(repunicode)) {
8929                /* Byte results not supported, since they have no decimal property. */
8930                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8931                Py_DECREF(repunicode);
8932                goto onError;
8933            }
8934            /* generate replacement  */
8935            repsize = PyUnicode_GET_SIZE(repunicode);
8936            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8937                Py_UNICODE ch = *uni2;
8938                if (Py_UNICODE_ISSPACE(ch))
8939                    *output++ = ' ';
8940                else {
8941                    decimal = Py_UNICODE_TODECIMAL(ch);
8942                    if (decimal >= 0)
8943                        *output++ = '0' + decimal;
8944                    else if (0 < ch && ch < 256)
8945                        *output++ = (char)ch;
8946                    else {
8947                        Py_DECREF(repunicode);
8948                        unicode = PyUnicode_FromUnicode(s, length);
8949                        if (unicode == NULL)
8950                            goto onError;
8951                        raise_encode_exception(&exc, encoding,
8952                                               unicode, collstart-s, collend-s, reason);
8953                        Py_DECREF(unicode);
8954                        goto onError;
8955                    }
8956                }
8957            }
8958            p = s + newpos;
8959            Py_DECREF(repunicode);
8960        }
8961    }
8962    /* 0-terminate the output string */
8963    *output++ = '\0';
8964    Py_XDECREF(exc);
8965    Py_XDECREF(errorHandler);
8966    return 0;
8967
8968  onError:
8969    Py_XDECREF(exc);
8970    Py_XDECREF(errorHandler);
8971    return -1;
8972}
8973
8974/* --- Helpers ------------------------------------------------------------ */
8975
8976static Py_ssize_t
8977any_find_slice(int direction, PyObject* s1, PyObject* s2,
8978               Py_ssize_t start,
8979               Py_ssize_t end)
8980{
8981    int kind1, kind2, kind;
8982    void *buf1, *buf2;
8983    Py_ssize_t len1, len2, result;
8984
8985    kind1 = PyUnicode_KIND(s1);
8986    kind2 = PyUnicode_KIND(s2);
8987    kind = kind1 > kind2 ? kind1 : kind2;
8988    buf1 = PyUnicode_DATA(s1);
8989    buf2 = PyUnicode_DATA(s2);
8990    if (kind1 != kind)
8991        buf1 = _PyUnicode_AsKind(s1, kind);
8992    if (!buf1)
8993        return -2;
8994    if (kind2 != kind)
8995        buf2 = _PyUnicode_AsKind(s2, kind);
8996    if (!buf2) {
8997        if (kind1 != kind) PyMem_Free(buf1);
8998        return -2;
8999    }
9000    len1 = PyUnicode_GET_LENGTH(s1);
9001    len2 = PyUnicode_GET_LENGTH(s2);
9002
9003    if (direction > 0) {
9004        switch(kind) {
9005        case PyUnicode_1BYTE_KIND:
9006            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9007                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9008            else
9009                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9010            break;
9011        case PyUnicode_2BYTE_KIND:
9012            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9013            break;
9014        case PyUnicode_4BYTE_KIND:
9015            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9016            break;
9017        default:
9018            assert(0); result = -2;
9019        }
9020    }
9021    else {
9022        switch(kind) {
9023        case PyUnicode_1BYTE_KIND:
9024            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9025                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9026            else
9027                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9028            break;
9029        case PyUnicode_2BYTE_KIND:
9030            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9031            break;
9032        case PyUnicode_4BYTE_KIND:
9033            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9034            break;
9035        default:
9036            assert(0); result = -2;
9037        }
9038    }
9039
9040    if (kind1 != kind)
9041        PyMem_Free(buf1);
9042    if (kind2 != kind)
9043        PyMem_Free(buf2);
9044
9045    return result;
9046}
9047
9048Py_ssize_t
9049_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
9050                                   Py_ssize_t n_buffer,
9051                                   void *digits, Py_ssize_t n_digits,
9052                                   Py_ssize_t min_width,
9053                                   const char *grouping,
9054                                   const char *thousands_sep)
9055{
9056    switch(kind) {
9057    case PyUnicode_1BYTE_KIND:
9058        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9059            return _PyUnicode_ascii_InsertThousandsGrouping(
9060                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9061                min_width, grouping, thousands_sep);
9062        else
9063            return _PyUnicode_ucs1_InsertThousandsGrouping(
9064                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9065                min_width, grouping, thousands_sep);
9066    case PyUnicode_2BYTE_KIND:
9067        return _PyUnicode_ucs2_InsertThousandsGrouping(
9068            (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9069            min_width, grouping, thousands_sep);
9070    case PyUnicode_4BYTE_KIND:
9071        return _PyUnicode_ucs4_InsertThousandsGrouping(
9072            (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9073            min_width, grouping, thousands_sep);
9074    }
9075    assert(0);
9076    return -1;
9077}
9078
9079
9080/* Helper macro that fixes up start/end slice values in place against a
   sequence of length `len`:

     - an `end` larger than `len` is clamped to `len`;
     - a negative `end` has `len` added to it and is then clamped to 0;
     - a negative `start` has `len` added to it and is then clamped to 0.

   A `start` larger than `len` is left untouched, so callers may still see
   start > end afterwards.

   `start` and `end` must be modifiable lvalues.  The expansion is a pair
   of bare `if` statements (not wrapped in do { } while (0)) and the
   arguments are neither parenthesised nor evaluated only once, so use the
   macro as a full statement inside a braced block and pass side-effect
   free arguments. */
9081#define ADJUST_INDICES(start, end, len)         \
9082    if (end > len)                              \
9083        end = len;                              \
9084    else if (end < 0) {                         \
9085        end += len;                             \
9086        if (end < 0)                            \
9087            end = 0;                            \
9088    }                                           \
9089    if (start < 0) {                            \
9090        start += len;                           \
9091        if (start < 0)                          \
9092            start = 0;                          \
9093    }
9094
9095Py_ssize_t
9096PyUnicode_Count(PyObject *str,
9097                PyObject *substr,
9098                Py_ssize_t start,
9099                Py_ssize_t end)
9100{
9101    Py_ssize_t result;
9102    PyObject* str_obj;
9103    PyObject* sub_obj;
9104    int kind1, kind2, kind;
9105    void *buf1 = NULL, *buf2 = NULL;
9106    Py_ssize_t len1, len2;
9107
9108    str_obj = PyUnicode_FromObject(str);
9109    if (!str_obj || PyUnicode_READY(str_obj) == -1)
9110        return -1;
9111    sub_obj = PyUnicode_FromObject(substr);
9112    if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
9113        Py_DECREF(str_obj);
9114        return -1;
9115    }
9116
9117    kind1 = PyUnicode_KIND(str_obj);
9118    kind2 = PyUnicode_KIND(sub_obj);
9119    kind = kind1 > kind2 ? kind1 : kind2;
9120    buf1 = PyUnicode_DATA(str_obj);
9121    if (kind1 != kind)
9122        buf1 = _PyUnicode_AsKind(str_obj, kind);
9123    if (!buf1)
9124        goto onError;
9125    buf2 = PyUnicode_DATA(sub_obj);
9126    if (kind2 != kind)
9127        buf2 = _PyUnicode_AsKind(sub_obj, kind);
9128    if (!buf2)
9129        goto onError;
9130    len1 = PyUnicode_GET_LENGTH(str_obj);
9131    len2 = PyUnicode_GET_LENGTH(sub_obj);
9132
9133    ADJUST_INDICES(start, end, len1);
9134    switch(kind) {
9135    case PyUnicode_1BYTE_KIND:
9136        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9137            result = asciilib_count(
9138                ((Py_UCS1*)buf1) + start, end - start,
9139                buf2, len2, PY_SSIZE_T_MAX
9140                );
9141        else
9142            result = ucs1lib_count(
9143                ((Py_UCS1*)buf1) + start, end - start,
9144                buf2, len2, PY_SSIZE_T_MAX
9145                );
9146        break;
9147    case PyUnicode_2BYTE_KIND:
9148        result = ucs2lib_count(
9149            ((Py_UCS2*)buf1) + start, end - start,
9150            buf2, len2, PY_SSIZE_T_MAX
9151            );
9152        break;
9153    case PyUnicode_4BYTE_KIND:
9154        result = ucs4lib_count(
9155            ((Py_UCS4*)buf1) + start, end - start,
9156            buf2, len2, PY_SSIZE_T_MAX
9157            );
9158        break;
9159    default:
9160        assert(0); result = 0;
9161    }
9162
9163    Py_DECREF(sub_obj);
9164    Py_DECREF(str_obj);
9165
9166    if (kind1 != kind)
9167        PyMem_Free(buf1);
9168    if (kind2 != kind)
9169        PyMem_Free(buf2);
9170
9171    return result;
9172  onError:
9173    Py_DECREF(sub_obj);
9174    Py_DECREF(str_obj);
9175    if (kind1 != kind && buf1)
9176        PyMem_Free(buf1);
9177    if (kind2 != kind && buf2)
9178        PyMem_Free(buf2);
9179    return -1;
9180}
9181
9182Py_ssize_t
9183PyUnicode_Find(PyObject *str,
9184               PyObject *sub,
9185               Py_ssize_t start,
9186               Py_ssize_t end,
9187               int direction)
9188{
9189    Py_ssize_t result;
9190
9191    str = PyUnicode_FromObject(str);
9192    if (!str || PyUnicode_READY(str) == -1)
9193        return -2;
9194    sub = PyUnicode_FromObject(sub);
9195    if (!sub || PyUnicode_READY(sub) == -1) {
9196        Py_DECREF(str);
9197        return -2;
9198    }
9199
9200    result = any_find_slice(direction,
9201        str, sub, start, end
9202        );
9203
9204    Py_DECREF(str);
9205    Py_DECREF(sub);
9206
9207    return result;
9208}
9209
9210Py_ssize_t
9211PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9212                   Py_ssize_t start, Py_ssize_t end,
9213                   int direction)
9214{
9215    int kind;
9216    Py_ssize_t result;
9217    if (PyUnicode_READY(str) == -1)
9218        return -2;
9219    if (start < 0 || end < 0) {
9220        PyErr_SetString(PyExc_IndexError, "string index out of range");
9221        return -2;
9222    }
9223    if (end > PyUnicode_GET_LENGTH(str))
9224        end = PyUnicode_GET_LENGTH(str);
9225    kind = PyUnicode_KIND(str);
9226    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9227                      kind, end-start, ch, direction);
9228    if (result == -1)
9229        return -1;
9230    else
9231        return start + result;
9232}
9233
9234static int
9235tailmatch(PyObject *self,
9236          PyObject *substring,
9237          Py_ssize_t start,
9238          Py_ssize_t end,
9239          int direction)
9240{
9241    int kind_self;
9242    int kind_sub;
9243    void *data_self;
9244    void *data_sub;
9245    Py_ssize_t offset;
9246    Py_ssize_t i;
9247    Py_ssize_t end_sub;
9248
9249    if (PyUnicode_READY(self) == -1 ||
9250        PyUnicode_READY(substring) == -1)
9251        return 0;
9252
9253    if (PyUnicode_GET_LENGTH(substring) == 0)
9254        return 1;
9255
9256    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9257    end -= PyUnicode_GET_LENGTH(substring);
9258    if (end < start)
9259        return 0;
9260
9261    kind_self = PyUnicode_KIND(self);
9262    data_self = PyUnicode_DATA(self);
9263    kind_sub = PyUnicode_KIND(substring);
9264    data_sub = PyUnicode_DATA(substring);
9265    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9266
9267    if (direction > 0)
9268        offset = end;
9269    else
9270        offset = start;
9271
9272    if (PyUnicode_READ(kind_self, data_self, offset) ==
9273        PyUnicode_READ(kind_sub, data_sub, 0) &&
9274        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9275        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9276        /* If both are of the same kind, memcmp is sufficient */
9277        if (kind_self == kind_sub) {
9278            return ! memcmp((char *)data_self +
9279                                (offset * PyUnicode_KIND(substring)),
9280                            data_sub,
9281                            PyUnicode_GET_LENGTH(substring) *
9282                                PyUnicode_KIND(substring));
9283        }
9284        /* otherwise we have to compare each character by first accesing it */
9285        else {
9286            /* We do not need to compare 0 and len(substring)-1 because
9287               the if statement above ensured already that they are equal
9288               when we end up here. */
9289            // TODO: honor direction and do a forward or backwards search
9290            for (i = 1; i < end_sub; ++i) {
9291                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9292                    PyUnicode_READ(kind_sub, data_sub, i))
9293                    return 0;
9294            }
9295            return 1;
9296        }
9297    }
9298
9299    return 0;
9300}
9301
9302Py_ssize_t
9303PyUnicode_Tailmatch(PyObject *str,
9304                    PyObject *substr,
9305                    Py_ssize_t start,
9306                    Py_ssize_t end,
9307                    int direction)
9308{
9309    Py_ssize_t result;
9310
9311    str = PyUnicode_FromObject(str);
9312    if (str == NULL)
9313        return -1;
9314    substr = PyUnicode_FromObject(substr);
9315    if (substr == NULL) {
9316        Py_DECREF(str);
9317        return -1;
9318    }
9319
9320    result = tailmatch(str, substr,
9321                       start, end, direction);
9322    Py_DECREF(str);
9323    Py_DECREF(substr);
9324    return result;
9325}
9326
9327/* Apply fixfct filter to the Unicode object self and return a
9328   reference to the modified object */
9329
9330static PyObject *
9331fixup(PyObject *self,
9332      Py_UCS4 (*fixfct)(PyObject *s))
9333{
9334    PyObject *u;
9335    Py_UCS4 maxchar_old, maxchar_new = 0;
9336
9337    if (PyUnicode_READY(self) == -1)
9338        return NULL;
9339    maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9340    u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9341                      maxchar_old);
9342    if (u == NULL)
9343        return NULL;
9344
9345    Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
9346              PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
9347
9348    /* fix functions return the new maximum character in a string,
9349       if the kind of the resulting unicode object does not change,
9350       everything is fine.  Otherwise we need to change the string kind
9351       and re-run the fix function. */
9352    maxchar_new = fixfct(u);
9353    if (maxchar_new == 0)
9354        /* do nothing, keep maxchar_new at 0 which means no changes. */;
9355    else if (maxchar_new <= 127)
9356        maxchar_new = 127;
9357    else if (maxchar_new <= 255)
9358        maxchar_new = 255;
9359    else if (maxchar_new <= 65535)
9360        maxchar_new = 65535;
9361    else
9362        maxchar_new = 1114111; /* 0x10ffff */
9363
9364    if (!maxchar_new && PyUnicode_CheckExact(self)) {
9365        /* fixfct should return TRUE if it modified the buffer. If
9366           FALSE, return a reference to the original buffer instead
9367           (to save space, not time) */
9368        Py_INCREF(self);
9369        Py_DECREF(u);
9370        return self;
9371    }
9372    else if (maxchar_new == maxchar_old) {
9373        return u;
9374    }
9375    else {
9376        /* In case the maximum character changed, we need to
9377           convert the string to the new category. */
9378        PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9379        if (v == NULL) {
9380            Py_DECREF(u);
9381            return NULL;
9382        }
9383        if (maxchar_new > maxchar_old) {
9384            /* If the maxchar increased so that the kind changed, not all
9385               characters are representable anymore and we need to fix the
9386               string again. This only happens in very few cases. */
9387            copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9388            maxchar_old = fixfct(v);
9389            assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9390        }
9391        else {
9392            copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
9393        }
9394
9395        Py_DECREF(u);
9396        assert(_PyUnicode_CheckConsistency(v, 1));
9397        return v;
9398    }
9399}
9400
9401static Py_UCS4
9402fixupper(PyObject *self)
9403{
9404    /* No need to call PyUnicode_READY(self) because this function is only
9405       called as a callback from fixup() which does it already. */
9406    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9407    const int kind = PyUnicode_KIND(self);
9408    void *data = PyUnicode_DATA(self);
9409    int touched = 0;
9410    Py_UCS4 maxchar = 0;
9411    Py_ssize_t i;
9412
9413    for (i = 0; i < len; ++i) {
9414        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9415        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9416        if (up != ch) {
9417            if (up > maxchar)
9418                maxchar = up;
9419            PyUnicode_WRITE(kind, data, i, up);
9420            touched = 1;
9421        }
9422        else if (ch > maxchar)
9423            maxchar = ch;
9424    }
9425
9426    if (touched)
9427        return maxchar;
9428    else
9429        return 0;
9430}
9431
9432static Py_UCS4
9433fixlower(PyObject *self)
9434{
9435    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9436    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9437    const int kind = PyUnicode_KIND(self);
9438    void *data = PyUnicode_DATA(self);
9439    int touched = 0;
9440    Py_UCS4 maxchar = 0;
9441    Py_ssize_t i;
9442
9443    for(i = 0; i < len; ++i) {
9444        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9445        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9446        if (lo != ch) {
9447            if (lo > maxchar)
9448                maxchar = lo;
9449            PyUnicode_WRITE(kind, data, i, lo);
9450            touched = 1;
9451        }
9452        else if (ch > maxchar)
9453            maxchar = ch;
9454    }
9455
9456    if (touched)
9457        return maxchar;
9458    else
9459        return 0;
9460}
9461
9462static Py_UCS4
9463fixswapcase(PyObject *self)
9464{
9465    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9466    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9467    const int kind = PyUnicode_KIND(self);
9468    void *data = PyUnicode_DATA(self);
9469    int touched = 0;
9470    Py_UCS4 maxchar = 0;
9471    Py_ssize_t i;
9472
9473    for(i = 0; i < len; ++i) {
9474        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9475        Py_UCS4 nu = 0;
9476
9477        if (Py_UNICODE_ISUPPER(ch))
9478            nu = Py_UNICODE_TOLOWER(ch);
9479        else if (Py_UNICODE_ISLOWER(ch))
9480            nu = Py_UNICODE_TOUPPER(ch);
9481
9482        if (nu != 0) {
9483            if (nu > maxchar)
9484                maxchar = nu;
9485            PyUnicode_WRITE(kind, data, i, nu);
9486            touched = 1;
9487        }
9488        else if (ch > maxchar)
9489            maxchar = ch;
9490    }
9491
9492    if (touched)
9493        return maxchar;
9494    else
9495        return 0;
9496}
9497
9498static Py_UCS4
9499fixcapitalize(PyObject *self)
9500{
9501    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9502    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9503    const int kind = PyUnicode_KIND(self);
9504    void *data = PyUnicode_DATA(self);
9505    int touched = 0;
9506    Py_UCS4 maxchar = 0;
9507    Py_ssize_t i = 0;
9508    Py_UCS4 ch;
9509
9510    if (len == 0)
9511        return 0;
9512
9513    ch = PyUnicode_READ(kind, data, i);
9514    if (!Py_UNICODE_ISUPPER(ch)) {
9515        maxchar = Py_UNICODE_TOUPPER(ch);
9516        PyUnicode_WRITE(kind, data, i, maxchar);
9517        touched = 1;
9518    }
9519    ++i;
9520    for(; i < len; ++i) {
9521        ch = PyUnicode_READ(kind, data, i);
9522        if (!Py_UNICODE_ISLOWER(ch)) {
9523            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9524            if (lo > maxchar)
9525                maxchar = lo;
9526            PyUnicode_WRITE(kind, data, i, lo);
9527            touched = 1;
9528        }
9529        else if (ch > maxchar)
9530            maxchar = ch;
9531    }
9532
9533    if (touched)
9534        return maxchar;
9535    else
9536        return 0;
9537}
9538
9539static Py_UCS4
9540fixtitle(PyObject *self)
9541{
9542    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9543    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9544    const int kind = PyUnicode_KIND(self);
9545    void *data = PyUnicode_DATA(self);
9546    Py_UCS4 maxchar = 0;
9547    Py_ssize_t i = 0;
9548    int previous_is_cased;
9549
9550    /* Shortcut for single character strings */
9551    if (len == 1) {
9552        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9553        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9554        if (ti != ch) {
9555            PyUnicode_WRITE(kind, data, i, ti);
9556            return ti;
9557        }
9558        else
9559            return 0;
9560    }
9561    previous_is_cased = 0;
9562    for(; i < len; ++i) {
9563        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9564        Py_UCS4 nu;
9565
9566        if (previous_is_cased)
9567            nu = Py_UNICODE_TOLOWER(ch);
9568        else
9569            nu = Py_UNICODE_TOTITLE(ch);
9570
9571        if (nu > maxchar)
9572            maxchar = nu;
9573        PyUnicode_WRITE(kind, data, i, nu);
9574
9575        if (Py_UNICODE_ISLOWER(ch) ||
9576            Py_UNICODE_ISUPPER(ch) ||
9577            Py_UNICODE_ISTITLE(ch))
9578            previous_is_cased = 1;
9579        else
9580            previous_is_cased = 0;
9581    }
9582    return maxchar;
9583}
9584
9585PyObject *
9586PyUnicode_Join(PyObject *separator, PyObject *seq)
9587{
9588    PyObject *sep = NULL;
9589    Py_ssize_t seplen;
9590    PyObject *res = NULL; /* the result */
9591    PyObject *fseq;          /* PySequence_Fast(seq) */
9592    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9593    PyObject **items;
9594    PyObject *item;
9595    Py_ssize_t sz, i, res_offset;
9596    Py_UCS4 maxchar;
9597    Py_UCS4 item_maxchar;
9598    int use_memcpy;
9599    unsigned char *res_data = NULL, *sep_data = NULL;
9600    PyObject *last_obj;
9601    unsigned int kind = 0;
9602
9603    fseq = PySequence_Fast(seq, "");
9604    if (fseq == NULL) {
9605        return NULL;
9606    }
9607
9608    /* NOTE: the following code can't call back into Python code,
9609     * so we are sure that fseq won't be mutated.
9610     */
9611
9612    seqlen = PySequence_Fast_GET_SIZE(fseq);
9613    /* If empty sequence, return u"". */
9614    if (seqlen == 0) {
9615        Py_DECREF(fseq);
9616        Py_INCREF(unicode_empty);
9617        res = unicode_empty;
9618        return res;
9619    }
9620
9621    /* If singleton sequence with an exact Unicode, return that. */
9622    last_obj = NULL;
9623    items = PySequence_Fast_ITEMS(fseq);
9624    if (seqlen == 1) {
9625        if (PyUnicode_CheckExact(items[0])) {
9626            res = items[0];
9627            Py_INCREF(res);
9628            Py_DECREF(fseq);
9629            return res;
9630        }
9631        seplen = 0;
9632        maxchar = 0;
9633    }
9634    else {
9635        /* Set up sep and seplen */
9636        if (separator == NULL) {
9637            /* fall back to a blank space separator */
9638            sep = PyUnicode_FromOrdinal(' ');
9639            if (!sep)
9640                goto onError;
9641            seplen = 1;
9642            maxchar = 32;
9643        }
9644        else {
9645            if (!PyUnicode_Check(separator)) {
9646                PyErr_Format(PyExc_TypeError,
9647                             "separator: expected str instance,"
9648                             " %.80s found",
9649                             Py_TYPE(separator)->tp_name);
9650                goto onError;
9651            }
9652            if (PyUnicode_READY(separator))
9653                goto onError;
9654            sep = separator;
9655            seplen = PyUnicode_GET_LENGTH(separator);
9656            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9657            /* inc refcount to keep this code path symmetric with the
9658               above case of a blank separator */
9659            Py_INCREF(sep);
9660        }
9661        last_obj = sep;
9662    }
9663
9664    /* There are at least two things to join, or else we have a subclass
9665     * of str in the sequence.
9666     * Do a pre-pass to figure out the total amount of space we'll
9667     * need (sz), and see whether all argument are strings.
9668     */
9669    sz = 0;
9670#ifdef Py_DEBUG
9671    use_memcpy = 0;
9672#else
9673    use_memcpy = 1;
9674#endif
9675    for (i = 0; i < seqlen; i++) {
9676        const Py_ssize_t old_sz = sz;
9677        item = items[i];
9678        if (!PyUnicode_Check(item)) {
9679            PyErr_Format(PyExc_TypeError,
9680                         "sequence item %zd: expected str instance,"
9681                         " %.80s found",
9682                         i, Py_TYPE(item)->tp_name);
9683            goto onError;
9684        }
9685        if (PyUnicode_READY(item) == -1)
9686            goto onError;
9687        sz += PyUnicode_GET_LENGTH(item);
9688        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9689        maxchar = Py_MAX(maxchar, item_maxchar);
9690        if (i != 0)
9691            sz += seplen;
9692        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9693            PyErr_SetString(PyExc_OverflowError,
9694                            "join() result is too long for a Python string");
9695            goto onError;
9696        }
9697        if (use_memcpy && last_obj != NULL) {
9698            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9699                use_memcpy = 0;
9700        }
9701        last_obj = item;
9702    }
9703
9704    res = PyUnicode_New(sz, maxchar);
9705    if (res == NULL)
9706        goto onError;
9707
9708    /* Catenate everything. */
9709#ifdef Py_DEBUG
9710    use_memcpy = 0;
9711#else
9712    if (use_memcpy) {
9713        res_data = PyUnicode_1BYTE_DATA(res);
9714        kind = PyUnicode_KIND(res);
9715        if (seplen != 0)
9716            sep_data = PyUnicode_1BYTE_DATA(sep);
9717    }
9718#endif
9719    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9720        Py_ssize_t itemlen;
9721        item = items[i];
9722        /* Copy item, and maybe the separator. */
9723        if (i && seplen != 0) {
9724            if (use_memcpy) {
9725                Py_MEMCPY(res_data,
9726                          sep_data,
9727                          kind * seplen);
9728                res_data += kind * seplen;
9729            }
9730            else {
9731                copy_characters(res, res_offset, sep, 0, seplen);
9732                res_offset += seplen;
9733            }
9734        }
9735        itemlen = PyUnicode_GET_LENGTH(item);
9736        if (itemlen != 0) {
9737            if (use_memcpy) {
9738                Py_MEMCPY(res_data,
9739                          PyUnicode_DATA(item),
9740                          kind * itemlen);
9741                res_data += kind * itemlen;
9742            }
9743            else {
9744                copy_characters(res, res_offset, item, 0, itemlen);
9745                res_offset += itemlen;
9746            }
9747        }
9748    }
9749    if (use_memcpy)
9750        assert(res_data == PyUnicode_1BYTE_DATA(res)
9751                           + kind * PyUnicode_GET_LENGTH(res));
9752    else
9753        assert(res_offset == PyUnicode_GET_LENGTH(res));
9754
9755    Py_DECREF(fseq);
9756    Py_XDECREF(sep);
9757    assert(_PyUnicode_CheckConsistency(res, 1));
9758    return res;
9759
9760  onError:
9761    Py_DECREF(fseq);
9762    Py_XDECREF(sep);
9763    Py_XDECREF(res);
9764    return NULL;
9765}
9766
9767#define FILL(kind, data, value, start, length) \
9768    do { \
9769        Py_ssize_t i_ = 0; \
9770        assert(kind != PyUnicode_WCHAR_KIND); \
9771        switch ((kind)) { \
9772        case PyUnicode_1BYTE_KIND: { \
9773            unsigned char * to_ = (unsigned char *)((data)) + (start); \
9774            memset(to_, (unsigned char)value, length); \
9775            break; \
9776        } \
9777        case PyUnicode_2BYTE_KIND: { \
9778            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9779            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9780            break; \
9781        } \
9782        default: { \
9783            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9784            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9785            break; \
9786        } \
9787        } \
9788    } while (0)
9789
9790static PyObject *
9791pad(PyObject *self,
9792    Py_ssize_t left,
9793    Py_ssize_t right,
9794    Py_UCS4 fill)
9795{
9796    PyObject *u;
9797    Py_UCS4 maxchar;
9798    int kind;
9799    void *data;
9800
9801    if (left < 0)
9802        left = 0;
9803    if (right < 0)
9804        right = 0;
9805
9806    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
9807        Py_INCREF(self);
9808        return self;
9809    }
9810
9811    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9812        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9813        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9814        return NULL;
9815    }
9816    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9817    if (fill > maxchar)
9818        maxchar = fill;
9819    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9820    if (!u)
9821        return NULL;
9822
9823    kind = PyUnicode_KIND(u);
9824    data = PyUnicode_DATA(u);
9825    if (left)
9826        FILL(kind, data, fill, 0, left);
9827    if (right)
9828        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9829    copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
9830    assert(_PyUnicode_CheckConsistency(u, 1));
9831    return u;
9832}
9833#undef FILL
9834
9835PyObject *
9836PyUnicode_Splitlines(PyObject *string, int keepends)
9837{
9838    PyObject *list;
9839
9840    string = PyUnicode_FromObject(string);
9841    if (string == NULL || PyUnicode_READY(string) == -1)
9842        return NULL;
9843
9844    switch(PyUnicode_KIND(string)) {
9845    case PyUnicode_1BYTE_KIND:
9846        if (PyUnicode_IS_ASCII(string))
9847            list = asciilib_splitlines(
9848                string, PyUnicode_1BYTE_DATA(string),
9849                PyUnicode_GET_LENGTH(string), keepends);
9850        else
9851            list = ucs1lib_splitlines(
9852                string, PyUnicode_1BYTE_DATA(string),
9853                PyUnicode_GET_LENGTH(string), keepends);
9854        break;
9855    case PyUnicode_2BYTE_KIND:
9856        list = ucs2lib_splitlines(
9857            string, PyUnicode_2BYTE_DATA(string),
9858            PyUnicode_GET_LENGTH(string), keepends);
9859        break;
9860    case PyUnicode_4BYTE_KIND:
9861        list = ucs4lib_splitlines(
9862            string, PyUnicode_4BYTE_DATA(string),
9863            PyUnicode_GET_LENGTH(string), keepends);
9864        break;
9865    default:
9866        assert(0);
9867        list = 0;
9868    }
9869    Py_DECREF(string);
9870    return list;
9871}
9872
9873static PyObject *
9874split(PyObject *self,
9875      PyObject *substring,
9876      Py_ssize_t maxcount)
9877{
9878    int kind1, kind2, kind;
9879    void *buf1, *buf2;
9880    Py_ssize_t len1, len2;
9881    PyObject* out;
9882
9883    if (maxcount < 0)
9884        maxcount = PY_SSIZE_T_MAX;
9885
9886    if (PyUnicode_READY(self) == -1)
9887        return NULL;
9888
9889    if (substring == NULL)
9890        switch(PyUnicode_KIND(self)) {
9891        case PyUnicode_1BYTE_KIND:
9892            if (PyUnicode_IS_ASCII(self))
9893                return asciilib_split_whitespace(
9894                    self,  PyUnicode_1BYTE_DATA(self),
9895                    PyUnicode_GET_LENGTH(self), maxcount
9896                    );
9897            else
9898                return ucs1lib_split_whitespace(
9899                    self,  PyUnicode_1BYTE_DATA(self),
9900                    PyUnicode_GET_LENGTH(self), maxcount
9901                    );
9902        case PyUnicode_2BYTE_KIND:
9903            return ucs2lib_split_whitespace(
9904                self,  PyUnicode_2BYTE_DATA(self),
9905                PyUnicode_GET_LENGTH(self), maxcount
9906                );
9907        case PyUnicode_4BYTE_KIND:
9908            return ucs4lib_split_whitespace(
9909                self,  PyUnicode_4BYTE_DATA(self),
9910                PyUnicode_GET_LENGTH(self), maxcount
9911                );
9912        default:
9913            assert(0);
9914            return NULL;
9915        }
9916
9917    if (PyUnicode_READY(substring) == -1)
9918        return NULL;
9919
9920    kind1 = PyUnicode_KIND(self);
9921    kind2 = PyUnicode_KIND(substring);
9922    kind = kind1 > kind2 ? kind1 : kind2;
9923    buf1 = PyUnicode_DATA(self);
9924    buf2 = PyUnicode_DATA(substring);
9925    if (kind1 != kind)
9926        buf1 = _PyUnicode_AsKind(self, kind);
9927    if (!buf1)
9928        return NULL;
9929    if (kind2 != kind)
9930        buf2 = _PyUnicode_AsKind(substring, kind);
9931    if (!buf2) {
9932        if (kind1 != kind) PyMem_Free(buf1);
9933        return NULL;
9934    }
9935    len1 = PyUnicode_GET_LENGTH(self);
9936    len2 = PyUnicode_GET_LENGTH(substring);
9937
9938    switch(kind) {
9939    case PyUnicode_1BYTE_KIND:
9940        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9941            out = asciilib_split(
9942                self,  buf1, len1, buf2, len2, maxcount);
9943        else
9944            out = ucs1lib_split(
9945                self,  buf1, len1, buf2, len2, maxcount);
9946        break;
9947    case PyUnicode_2BYTE_KIND:
9948        out = ucs2lib_split(
9949            self,  buf1, len1, buf2, len2, maxcount);
9950        break;
9951    case PyUnicode_4BYTE_KIND:
9952        out = ucs4lib_split(
9953            self,  buf1, len1, buf2, len2, maxcount);
9954        break;
9955    default:
9956        out = NULL;
9957    }
9958    if (kind1 != kind)
9959        PyMem_Free(buf1);
9960    if (kind2 != kind)
9961        PyMem_Free(buf2);
9962    return out;
9963}
9964
9965static PyObject *
9966rsplit(PyObject *self,
9967       PyObject *substring,
9968       Py_ssize_t maxcount)
9969{
9970    int kind1, kind2, kind;
9971    void *buf1, *buf2;
9972    Py_ssize_t len1, len2;
9973    PyObject* out;
9974
9975    if (maxcount < 0)
9976        maxcount = PY_SSIZE_T_MAX;
9977
9978    if (PyUnicode_READY(self) == -1)
9979        return NULL;
9980
9981    if (substring == NULL)
9982        switch(PyUnicode_KIND(self)) {
9983        case PyUnicode_1BYTE_KIND:
9984            if (PyUnicode_IS_ASCII(self))
9985                return asciilib_rsplit_whitespace(
9986                    self,  PyUnicode_1BYTE_DATA(self),
9987                    PyUnicode_GET_LENGTH(self), maxcount
9988                    );
9989            else
9990                return ucs1lib_rsplit_whitespace(
9991                    self,  PyUnicode_1BYTE_DATA(self),
9992                    PyUnicode_GET_LENGTH(self), maxcount
9993                    );
9994        case PyUnicode_2BYTE_KIND:
9995            return ucs2lib_rsplit_whitespace(
9996                self,  PyUnicode_2BYTE_DATA(self),
9997                PyUnicode_GET_LENGTH(self), maxcount
9998                );
9999        case PyUnicode_4BYTE_KIND:
10000            return ucs4lib_rsplit_whitespace(
10001                self,  PyUnicode_4BYTE_DATA(self),
10002                PyUnicode_GET_LENGTH(self), maxcount
10003                );
10004        default:
10005            assert(0);
10006            return NULL;
10007        }
10008
10009    if (PyUnicode_READY(substring) == -1)
10010        return NULL;
10011
10012    kind1 = PyUnicode_KIND(self);
10013    kind2 = PyUnicode_KIND(substring);
10014    kind = kind1 > kind2 ? kind1 : kind2;
10015    buf1 = PyUnicode_DATA(self);
10016    buf2 = PyUnicode_DATA(substring);
10017    if (kind1 != kind)
10018        buf1 = _PyUnicode_AsKind(self, kind);
10019    if (!buf1)
10020        return NULL;
10021    if (kind2 != kind)
10022        buf2 = _PyUnicode_AsKind(substring, kind);
10023    if (!buf2) {
10024        if (kind1 != kind) PyMem_Free(buf1);
10025        return NULL;
10026    }
10027    len1 = PyUnicode_GET_LENGTH(self);
10028    len2 = PyUnicode_GET_LENGTH(substring);
10029
10030    switch(kind) {
10031    case PyUnicode_1BYTE_KIND:
10032        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10033            out = asciilib_rsplit(
10034                self,  buf1, len1, buf2, len2, maxcount);
10035        else
10036            out = ucs1lib_rsplit(
10037                self,  buf1, len1, buf2, len2, maxcount);
10038        break;
10039    case PyUnicode_2BYTE_KIND:
10040        out = ucs2lib_rsplit(
10041            self,  buf1, len1, buf2, len2, maxcount);
10042        break;
10043    case PyUnicode_4BYTE_KIND:
10044        out = ucs4lib_rsplit(
10045            self,  buf1, len1, buf2, len2, maxcount);
10046        break;
10047    default:
10048        out = NULL;
10049    }
10050    if (kind1 != kind)
10051        PyMem_Free(buf1);
10052    if (kind2 != kind)
10053        PyMem_Free(buf2);
10054    return out;
10055}
10056
10057static Py_ssize_t
10058anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10059            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10060{
10061    switch(kind) {
10062    case PyUnicode_1BYTE_KIND:
10063        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10064            return asciilib_find(buf1, len1, buf2, len2, offset);
10065        else
10066            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10067    case PyUnicode_2BYTE_KIND:
10068        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10069    case PyUnicode_4BYTE_KIND:
10070        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10071    }
10072    assert(0);
10073    return -1;
10074}
10075
10076static Py_ssize_t
10077anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10078             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10079{
10080        switch(kind) {
10081        case PyUnicode_1BYTE_KIND:
10082            if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10083                return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10084            else
10085                return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10086        case PyUnicode_2BYTE_KIND:
10087            return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10088        case PyUnicode_4BYTE_KIND:
10089            return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10090        }
10091        assert(0);
10092        return 0;
10093}
10094
10095static PyObject *
10096replace(PyObject *self, PyObject *str1,
10097        PyObject *str2, Py_ssize_t maxcount)
10098{
10099    PyObject *u;
10100    char *sbuf = PyUnicode_DATA(self);
10101    char *buf1 = PyUnicode_DATA(str1);
10102    char *buf2 = PyUnicode_DATA(str2);
10103    int srelease = 0, release1 = 0, release2 = 0;
10104    int skind = PyUnicode_KIND(self);
10105    int kind1 = PyUnicode_KIND(str1);
10106    int kind2 = PyUnicode_KIND(str2);
10107    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10108    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10109    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10110    int mayshrink;
10111    Py_UCS4 maxchar, maxchar_str2;
10112
10113    if (maxcount < 0)
10114        maxcount = PY_SSIZE_T_MAX;
10115    else if (maxcount == 0 || slen == 0)
10116        goto nothing;
10117
10118    if (str1 == str2)
10119        goto nothing;
10120    if (skind < kind1)
10121        /* substring too wide to be present */
10122        goto nothing;
10123
10124    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10125    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10126    /* Replacing str1 with str2 may cause a maxchar reduction in the
10127       result string. */
10128    mayshrink = (maxchar_str2 < maxchar);
10129    maxchar = Py_MAX(maxchar, maxchar_str2);
10130
10131    if (len1 == len2) {
10132        Py_ssize_t i;
10133        /* same length */
10134        if (len1 == 0)
10135            goto nothing;
10136        if (len1 == 1) {
10137            /* replace characters */
10138            Py_UCS4 u1, u2;
10139            int rkind;
10140            u1 = PyUnicode_READ_CHAR(str1, 0);
10141            if (findchar(sbuf, PyUnicode_KIND(self),
10142                         slen, u1, 1) < 0)
10143                goto nothing;
10144            u2 = PyUnicode_READ_CHAR(str2, 0);
10145            u = PyUnicode_New(slen, maxchar);
10146            if (!u)
10147                goto error;
10148            copy_characters(u, 0, self, 0, slen);
10149            rkind = PyUnicode_KIND(u);
10150            for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10151                if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
10152                    if (--maxcount < 0)
10153                        break;
10154                    PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
10155                }
10156        }
10157        else {
10158            int rkind = skind;
10159            char *res;
10160
10161            if (kind1 < rkind) {
10162                /* widen substring */
10163                buf1 = _PyUnicode_AsKind(str1, rkind);
10164                if (!buf1) goto error;
10165                release1 = 1;
10166            }
10167            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10168            if (i < 0)
10169                goto nothing;
10170            if (rkind > kind2) {
10171                /* widen replacement */
10172                buf2 = _PyUnicode_AsKind(str2, rkind);
10173                if (!buf2) goto error;
10174                release2 = 1;
10175            }
10176            else if (rkind < kind2) {
10177                /* widen self and buf1 */
10178                rkind = kind2;
10179                if (release1) PyMem_Free(buf1);
10180                sbuf = _PyUnicode_AsKind(self, rkind);
10181                if (!sbuf) goto error;
10182                srelease = 1;
10183                buf1 = _PyUnicode_AsKind(str1, rkind);
10184                if (!buf1) goto error;
10185                release1 = 1;
10186            }
10187            u = PyUnicode_New(slen, maxchar);
10188            if (!u)
10189                goto error;
10190            assert(PyUnicode_KIND(u) == rkind);
10191            res = PyUnicode_DATA(u);
10192
10193            memcpy(res, sbuf, rkind * slen);
10194            /* change everything in-place, starting with this one */
10195            memcpy(res + rkind * i,
10196                   buf2,
10197                   rkind * len2);
10198            i += len1;
10199
10200            while ( --maxcount > 0) {
10201                i = anylib_find(rkind, self,
10202                                sbuf+rkind*i, slen-i,
10203                                str1, buf1, len1, i);
10204                if (i == -1)
10205                    break;
10206                memcpy(res + rkind * i,
10207                       buf2,
10208                       rkind * len2);
10209                i += len1;
10210            }
10211        }
10212    }
10213    else {
10214        Py_ssize_t n, i, j, ires;
10215        Py_ssize_t product, new_size;
10216        int rkind = skind;
10217        char *res;
10218
10219        if (kind1 < rkind) {
10220            /* widen substring */
10221            buf1 = _PyUnicode_AsKind(str1, rkind);
10222            if (!buf1) goto error;
10223            release1 = 1;
10224        }
10225        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10226        if (n == 0)
10227            goto nothing;
10228        if (kind2 < rkind) {
10229            /* widen replacement */
10230            buf2 = _PyUnicode_AsKind(str2, rkind);
10231            if (!buf2) goto error;
10232            release2 = 1;
10233        }
10234        else if (kind2 > rkind) {
10235            /* widen self and buf1 */
10236            rkind = kind2;
10237            sbuf = _PyUnicode_AsKind(self, rkind);
10238            if (!sbuf) goto error;
10239            srelease = 1;
10240            if (release1) PyMem_Free(buf1);
10241            buf1 = _PyUnicode_AsKind(str1, rkind);
10242            if (!buf1) goto error;
10243            release1 = 1;
10244        }
10245        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10246           PyUnicode_GET_LENGTH(str1))); */
10247        product = n * (len2-len1);
10248        if ((product / (len2-len1)) != n) {
10249                PyErr_SetString(PyExc_OverflowError,
10250                                "replace string is too long");
10251                goto error;
10252        }
10253        new_size = slen + product;
10254        if (new_size == 0) {
10255            Py_INCREF(unicode_empty);
10256            u = unicode_empty;
10257            goto done;
10258        }
10259        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10260            PyErr_SetString(PyExc_OverflowError,
10261                            "replace string is too long");
10262            goto error;
10263        }
10264        u = PyUnicode_New(new_size, maxchar);
10265        if (!u)
10266            goto error;
10267        assert(PyUnicode_KIND(u) == rkind);
10268        res = PyUnicode_DATA(u);
10269        ires = i = 0;
10270        if (len1 > 0) {
10271            while (n-- > 0) {
10272                /* look for next match */
10273                j = anylib_find(rkind, self,
10274                                sbuf + rkind * i, slen-i,
10275                                str1, buf1, len1, i);
10276                if (j == -1)
10277                    break;
10278                else if (j > i) {
10279                    /* copy unchanged part [i:j] */
10280                    memcpy(res + rkind * ires,
10281                           sbuf + rkind * i,
10282                           rkind * (j-i));
10283                    ires += j - i;
10284                }
10285                /* copy substitution string */
10286                if (len2 > 0) {
10287                    memcpy(res + rkind * ires,
10288                           buf2,
10289                           rkind * len2);
10290                    ires += len2;
10291                }
10292                i = j + len1;
10293            }
10294            if (i < slen)
10295                /* copy tail [i:] */
10296                memcpy(res + rkind * ires,
10297                       sbuf + rkind * i,
10298                       rkind * (slen-i));
10299        }
10300        else {
10301            /* interleave */
10302            while (n > 0) {
10303                memcpy(res + rkind * ires,
10304                       buf2,
10305                       rkind * len2);
10306                ires += len2;
10307                if (--n <= 0)
10308                    break;
10309                memcpy(res + rkind * ires,
10310                       sbuf + rkind * i,
10311                       rkind);
10312                ires++;
10313                i++;
10314            }
10315            memcpy(res + rkind * ires,
10316                   sbuf + rkind * i,
10317                   rkind * (slen-i));
10318        }
10319    }
10320
10321    if (mayshrink) {
10322        unicode_adjust_maxchar(&u);
10323        if (u == NULL)
10324            goto error;
10325    }
10326
10327  done:
10328    if (srelease)
10329        PyMem_FREE(sbuf);
10330    if (release1)
10331        PyMem_FREE(buf1);
10332    if (release2)
10333        PyMem_FREE(buf2);
10334    assert(_PyUnicode_CheckConsistency(u, 1));
10335    return u;
10336
10337  nothing:
10338    /* nothing to replace; return original string (when possible) */
10339    if (srelease)
10340        PyMem_FREE(sbuf);
10341    if (release1)
10342        PyMem_FREE(buf1);
10343    if (release2)
10344        PyMem_FREE(buf2);
10345    if (PyUnicode_CheckExact(self)) {
10346        Py_INCREF(self);
10347        return self;
10348    }
10349    return PyUnicode_Copy(self);
10350  error:
10351    if (srelease && sbuf)
10352        PyMem_FREE(sbuf);
10353    if (release1 && buf1)
10354        PyMem_FREE(buf1);
10355    if (release2 && buf2)
10356        PyMem_FREE(buf2);
10357    return NULL;
10358}
10359
10360/* --- Unicode Object Methods --------------------------------------------- */
10361
10362PyDoc_STRVAR(title__doc__,
10363             "S.title() -> str\n\
10364\n\
10365Return a titlecased version of S, i.e. words start with title case\n\
10366characters, all remaining cased characters have lower case.");
10367
10368static PyObject*
10369unicode_title(PyObject *self)
10370{
10371    return fixup(self, fixtitle);
10372}
10373
10374PyDoc_STRVAR(capitalize__doc__,
10375             "S.capitalize() -> str\n\
10376\n\
10377Return a capitalized version of S, i.e. make the first character\n\
10378have upper case and the rest lower case.");
10379
10380static PyObject*
10381unicode_capitalize(PyObject *self)
10382{
10383    return fixup(self, fixcapitalize);
10384}
10385
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out by the surrounding #if 0.  Splits self into words,
   replaces every list entry by its fixcapitalize'd form and joins the
   list again with PyUnicode_Join(NULL, ...).  Returns NULL if the split
   or any capitalization fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup(old_word, fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the list's reference to the old word before the slot
           takes over the reference to the new one */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10423
10424/* Argument converter.  Coerces to a single unicode character */
10425
10426static int
10427convert_uc(PyObject *obj, void *addr)
10428{
10429    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10430    PyObject *uniobj;
10431
10432    uniobj = PyUnicode_FromObject(obj);
10433    if (uniobj == NULL) {
10434        PyErr_SetString(PyExc_TypeError,
10435                        "The fill character cannot be converted to Unicode");
10436        return 0;
10437    }
10438    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10439        PyErr_SetString(PyExc_TypeError,
10440                        "The fill character must be exactly one character long");
10441        Py_DECREF(uniobj);
10442        return 0;
10443    }
10444    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10445    Py_DECREF(uniobj);
10446    return 1;
10447}
10448
10449PyDoc_STRVAR(center__doc__,
10450             "S.center(width[, fillchar]) -> str\n\
10451\n\
10452Return S centered in a string of length width. Padding is\n\
10453done using the specified fill character (default is a space)");
10454
10455static PyObject *
10456unicode_center(PyObject *self, PyObject *args)
10457{
10458    Py_ssize_t marg, left;
10459    Py_ssize_t width;
10460    Py_UCS4 fillchar = ' ';
10461
10462    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10463        return NULL;
10464
10465    if (PyUnicode_READY(self) == -1)
10466        return NULL;
10467
10468    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
10469        Py_INCREF(self);
10470        return self;
10471    }
10472
10473    marg = width - _PyUnicode_LENGTH(self);
10474    left = marg / 2 + (marg & width & 1);
10475
10476    return pad(self, left, marg - left, fillchar);
10477}
10478
10479/* This function assumes that str1 and str2 are readied by the caller. */
10480
10481static int
10482unicode_compare(PyObject *str1, PyObject *str2)
10483{
10484    int kind1, kind2;
10485    void *data1, *data2;
10486    Py_ssize_t len1, len2, i;
10487
10488    kind1 = PyUnicode_KIND(str1);
10489    kind2 = PyUnicode_KIND(str2);
10490    data1 = PyUnicode_DATA(str1);
10491    data2 = PyUnicode_DATA(str2);
10492    len1 = PyUnicode_GET_LENGTH(str1);
10493    len2 = PyUnicode_GET_LENGTH(str2);
10494
10495    for (i = 0; i < len1 && i < len2; ++i) {
10496        Py_UCS4 c1, c2;
10497        c1 = PyUnicode_READ(kind1, data1, i);
10498        c2 = PyUnicode_READ(kind2, data2, i);
10499
10500        if (c1 != c2)
10501            return (c1 < c2) ? -1 : 1;
10502    }
10503
10504    return (len1 < len2) ? -1 : (len1 != len2);
10505}
10506
10507int
10508PyUnicode_Compare(PyObject *left, PyObject *right)
10509{
10510    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10511        if (PyUnicode_READY(left) == -1 ||
10512            PyUnicode_READY(right) == -1)
10513            return -1;
10514        return unicode_compare(left, right);
10515    }
10516    PyErr_Format(PyExc_TypeError,
10517                 "Can't compare %.100s and %.100s",
10518                 left->ob_type->tp_name,
10519                 right->ob_type->tp_name);
10520    return -1;
10521}
10522
10523int
10524PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10525{
10526    Py_ssize_t i;
10527    int kind;
10528    void *data;
10529    Py_UCS4 chr;
10530
10531    assert(_PyUnicode_CHECK(uni));
10532    if (PyUnicode_READY(uni) == -1)
10533        return -1;
10534    kind = PyUnicode_KIND(uni);
10535    data = PyUnicode_DATA(uni);
10536    /* Compare Unicode string and source character set string */
10537    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10538        if (chr != str[i])
10539            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10540    /* This check keeps Python strings that end in '\0' from comparing equal
10541     to C strings identical up to that point. */
10542    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10543        return 1; /* uni is longer */
10544    if (str[i])
10545        return -1; /* str is longer */
10546    return 0;
10547}
10548
10549
10550#define TEST_COND(cond)                         \
10551    ((cond) ? Py_True : Py_False)
10552
10553PyObject *
10554PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10555{
10556    int result;
10557
10558    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10559        PyObject *v;
10560        if (PyUnicode_READY(left) == -1 ||
10561            PyUnicode_READY(right) == -1)
10562            return NULL;
10563        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10564            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
10565            if (op == Py_EQ) {
10566                Py_INCREF(Py_False);
10567                return Py_False;
10568            }
10569            if (op == Py_NE) {
10570                Py_INCREF(Py_True);
10571                return Py_True;
10572            }
10573        }
10574        if (left == right)
10575            result = 0;
10576        else
10577            result = unicode_compare(left, right);
10578
10579        /* Convert the return value to a Boolean */
10580        switch (op) {
10581        case Py_EQ:
10582            v = TEST_COND(result == 0);
10583            break;
10584        case Py_NE:
10585            v = TEST_COND(result != 0);
10586            break;
10587        case Py_LE:
10588            v = TEST_COND(result <= 0);
10589            break;
10590        case Py_GE:
10591            v = TEST_COND(result >= 0);
10592            break;
10593        case Py_LT:
10594            v = TEST_COND(result == -1);
10595            break;
10596        case Py_GT:
10597            v = TEST_COND(result == 1);
10598            break;
10599        default:
10600            PyErr_BadArgument();
10601            return NULL;
10602        }
10603        Py_INCREF(v);
10604        return v;
10605    }
10606
10607    Py_RETURN_NOTIMPLEMENTED;
10608}
10609
10610int
10611PyUnicode_Contains(PyObject *container, PyObject *element)
10612{
10613    PyObject *str, *sub;
10614    int kind1, kind2, kind;
10615    void *buf1, *buf2;
10616    Py_ssize_t len1, len2;
10617    int result;
10618
10619    /* Coerce the two arguments */
10620    sub = PyUnicode_FromObject(element);
10621    if (!sub) {
10622        PyErr_Format(PyExc_TypeError,
10623                     "'in <string>' requires string as left operand, not %s",
10624                     element->ob_type->tp_name);
10625        return -1;
10626    }
10627    if (PyUnicode_READY(sub) == -1)
10628        return -1;
10629
10630    str = PyUnicode_FromObject(container);
10631    if (!str || PyUnicode_READY(str) == -1) {
10632        Py_DECREF(sub);
10633        return -1;
10634    }
10635
10636    kind1 = PyUnicode_KIND(str);
10637    kind2 = PyUnicode_KIND(sub);
10638    kind = kind1 > kind2 ? kind1 : kind2;
10639    buf1 = PyUnicode_DATA(str);
10640    buf2 = PyUnicode_DATA(sub);
10641    if (kind1 != kind)
10642        buf1 = _PyUnicode_AsKind(str, kind);
10643    if (!buf1) {
10644        Py_DECREF(sub);
10645        return -1;
10646    }
10647    if (kind2 != kind)
10648        buf2 = _PyUnicode_AsKind(sub, kind);
10649    if (!buf2) {
10650        Py_DECREF(sub);
10651        if (kind1 != kind) PyMem_Free(buf1);
10652        return -1;
10653    }
10654    len1 = PyUnicode_GET_LENGTH(str);
10655    len2 = PyUnicode_GET_LENGTH(sub);
10656
10657    switch(kind) {
10658    case PyUnicode_1BYTE_KIND:
10659        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10660        break;
10661    case PyUnicode_2BYTE_KIND:
10662        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10663        break;
10664    case PyUnicode_4BYTE_KIND:
10665        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10666        break;
10667    default:
10668        result = -1;
10669        assert(0);
10670    }
10671
10672    Py_DECREF(str);
10673    Py_DECREF(sub);
10674
10675    if (kind1 != kind)
10676        PyMem_Free(buf1);
10677    if (kind2 != kind)
10678        PyMem_Free(buf2);
10679
10680    return result;
10681}
10682
10683/* Concat to string or Unicode object giving a new Unicode object. */
10684
10685PyObject *
10686PyUnicode_Concat(PyObject *left, PyObject *right)
10687{
10688    PyObject *u = NULL, *v = NULL, *w;
10689    Py_UCS4 maxchar, maxchar2;
10690
10691    /* Coerce the two arguments */
10692    u = PyUnicode_FromObject(left);
10693    if (u == NULL)
10694        goto onError;
10695    v = PyUnicode_FromObject(right);
10696    if (v == NULL)
10697        goto onError;
10698
10699    /* Shortcuts */
10700    if (v == unicode_empty) {
10701        Py_DECREF(v);
10702        return u;
10703    }
10704    if (u == unicode_empty) {
10705        Py_DECREF(u);
10706        return v;
10707    }
10708
10709    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10710    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10711    maxchar = Py_MAX(maxchar, maxchar2);
10712
10713    /* Concat the two Unicode strings */
10714    w = PyUnicode_New(
10715        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10716        maxchar);
10717    if (w == NULL)
10718        goto onError;
10719    copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10720    copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
10721    Py_DECREF(u);
10722    Py_DECREF(v);
10723    assert(_PyUnicode_CheckConsistency(w, 1));
10724    return w;
10725
10726  onError:
10727    Py_XDECREF(u);
10728    Py_XDECREF(v);
10729    return NULL;
10730}
10731
10732static void
10733unicode_append_inplace(PyObject **p_left, PyObject *right)
10734{
10735    Py_ssize_t left_len, right_len, new_len;
10736
10737    assert(PyUnicode_IS_READY(*p_left));
10738    assert(PyUnicode_IS_READY(right));
10739
10740    left_len = PyUnicode_GET_LENGTH(*p_left);
10741    right_len = PyUnicode_GET_LENGTH(right);
10742    if (left_len > PY_SSIZE_T_MAX - right_len) {
10743        PyErr_SetString(PyExc_OverflowError,
10744                        "strings are too large to concat");
10745        goto error;
10746    }
10747    new_len = left_len + right_len;
10748
10749    /* Now we own the last reference to 'left', so we can resize it
10750     * in-place.
10751     */
10752    if (unicode_resize(p_left, new_len) != 0) {
10753        /* XXX if _PyUnicode_Resize() fails, 'left' has been
10754         * deallocated so it cannot be put back into
10755         * 'variable'.  The MemoryError is raised when there
10756         * is no value in 'variable', which might (very
10757         * remotely) be a cause of incompatibilities.
10758         */
10759        goto error;
10760    }
10761    /* copy 'right' into the newly allocated area of 'left' */
10762    copy_characters(*p_left, left_len, right, 0, right_len);
10763    _PyUnicode_DIRTY(*p_left);
10764    return;
10765
10766error:
10767    Py_DECREF(*p_left);
10768    *p_left = NULL;
10769}
10770
10771void
10772PyUnicode_Append(PyObject **p_left, PyObject *right)
10773{
10774    PyObject *left, *res;
10775
10776    if (p_left == NULL) {
10777        if (!PyErr_Occurred())
10778            PyErr_BadInternalCall();
10779        return;
10780    }
10781    left = *p_left;
10782    if (right == NULL || !PyUnicode_Check(left)) {
10783        if (!PyErr_Occurred())
10784            PyErr_BadInternalCall();
10785        goto error;
10786    }
10787
10788    if (PyUnicode_READY(left))
10789        goto error;
10790    if (PyUnicode_READY(right))
10791        goto error;
10792
10793    if (PyUnicode_CheckExact(left) && left != unicode_empty
10794        && PyUnicode_CheckExact(right) && right != unicode_empty
10795        && unicode_resizable(left)
10796        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10797            || _PyUnicode_WSTR(left) != NULL))
10798    {
10799        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10800           to change the structure size, but characters are stored just after
10801           the structure, and so it requires to move all characters which is
10802           not so different than duplicating the string. */
10803        if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10804        {
10805            unicode_append_inplace(p_left, right);
10806            if (p_left != NULL)
10807                assert(_PyUnicode_CheckConsistency(*p_left, 1));
10808            return;
10809        }
10810    }
10811
10812    res = PyUnicode_Concat(left, right);
10813    if (res == NULL)
10814        goto error;
10815    Py_DECREF(left);
10816    *p_left = res;
10817    return;
10818
10819error:
10820    Py_DECREF(*p_left);
10821    *p_left = NULL;
10822}
10823
10824void
10825PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10826{
10827    PyUnicode_Append(pleft, right);
10828    Py_XDECREF(right);
10829}
10830
10831PyDoc_STRVAR(count__doc__,
10832             "S.count(sub[, start[, end]]) -> int\n\
10833\n\
10834Return the number of non-overlapping occurrences of substring sub in\n\
10835string S[start:end].  Optional arguments start and end are\n\
10836interpreted as in slice notation.");
10837
10838static PyObject *
10839unicode_count(PyObject *self, PyObject *args)
10840{
10841    PyObject *substring;
10842    Py_ssize_t start = 0;
10843    Py_ssize_t end = PY_SSIZE_T_MAX;
10844    PyObject *result;
10845    int kind1, kind2, kind;
10846    void *buf1, *buf2;
10847    Py_ssize_t len1, len2, iresult;
10848
10849    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10850                                            &start, &end))
10851        return NULL;
10852
10853    kind1 = PyUnicode_KIND(self);
10854    kind2 = PyUnicode_KIND(substring);
10855    kind = kind1 > kind2 ? kind1 : kind2;
10856    buf1 = PyUnicode_DATA(self);
10857    buf2 = PyUnicode_DATA(substring);
10858    if (kind1 != kind)
10859        buf1 = _PyUnicode_AsKind(self, kind);
10860    if (!buf1) {
10861        Py_DECREF(substring);
10862        return NULL;
10863    }
10864    if (kind2 != kind)
10865        buf2 = _PyUnicode_AsKind(substring, kind);
10866    if (!buf2) {
10867        Py_DECREF(substring);
10868        if (kind1 != kind) PyMem_Free(buf1);
10869        return NULL;
10870    }
10871    len1 = PyUnicode_GET_LENGTH(self);
10872    len2 = PyUnicode_GET_LENGTH(substring);
10873
10874    ADJUST_INDICES(start, end, len1);
10875    switch(kind) {
10876    case PyUnicode_1BYTE_KIND:
10877        iresult = ucs1lib_count(
10878            ((Py_UCS1*)buf1) + start, end - start,
10879            buf2, len2, PY_SSIZE_T_MAX
10880            );
10881        break;
10882    case PyUnicode_2BYTE_KIND:
10883        iresult = ucs2lib_count(
10884            ((Py_UCS2*)buf1) + start, end - start,
10885            buf2, len2, PY_SSIZE_T_MAX
10886            );
10887        break;
10888    case PyUnicode_4BYTE_KIND:
10889        iresult = ucs4lib_count(
10890            ((Py_UCS4*)buf1) + start, end - start,
10891            buf2, len2, PY_SSIZE_T_MAX
10892            );
10893        break;
10894    default:
10895        assert(0); iresult = 0;
10896    }
10897
10898    result = PyLong_FromSsize_t(iresult);
10899
10900    if (kind1 != kind)
10901        PyMem_Free(buf1);
10902    if (kind2 != kind)
10903        PyMem_Free(buf2);
10904
10905    Py_DECREF(substring);
10906
10907    return result;
10908}
10909
10910PyDoc_STRVAR(encode__doc__,
10911             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10912\n\
10913Encode S using the codec registered for encoding. Default encoding\n\
10914is 'utf-8'. errors may be given to set a different error\n\
10915handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10916a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10917'xmlcharrefreplace' as well as any other name registered with\n\
10918codecs.register_error that can handle UnicodeEncodeErrors.");
10919
10920static PyObject *
10921unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10922{
10923    static char *kwlist[] = {"encoding", "errors", 0};
10924    char *encoding = NULL;
10925    char *errors = NULL;
10926
10927    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10928                                     kwlist, &encoding, &errors))
10929        return NULL;
10930    return PyUnicode_AsEncodedString(self, encoding, errors);
10931}
10932
10933PyDoc_STRVAR(expandtabs__doc__,
10934             "S.expandtabs([tabsize]) -> str\n\
10935\n\
10936Return a copy of S where all tab characters are expanded using spaces.\n\
10937If tabsize is not given, a tab size of 8 characters is assumed.");
10938
10939static PyObject*
10940unicode_expandtabs(PyObject *self, PyObject *args)
10941{
10942    Py_ssize_t i, j, line_pos, src_len, incr;
10943    Py_UCS4 ch;
10944    PyObject *u;
10945    void *src_data, *dest_data;
10946    int tabsize = 8;
10947    int kind;
10948    int found;
10949
10950    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10951        return NULL;
10952
10953    if (PyUnicode_READY(self) == -1)
10954        return NULL;
10955
10956    /* First pass: determine size of output string */
10957    src_len = PyUnicode_GET_LENGTH(self);
10958    i = j = line_pos = 0;
10959    kind = PyUnicode_KIND(self);
10960    src_data = PyUnicode_DATA(self);
10961    found = 0;
10962    for (; i < src_len; i++) {
10963        ch = PyUnicode_READ(kind, src_data, i);
10964        if (ch == '\t') {
10965            found = 1;
10966            if (tabsize > 0) {
10967                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10968                if (j > PY_SSIZE_T_MAX - incr)
10969                    goto overflow;
10970                line_pos += incr;
10971                j += incr;
10972            }
10973        }
10974        else {
10975            if (j > PY_SSIZE_T_MAX - 1)
10976                goto overflow;
10977            line_pos++;
10978            j++;
10979            if (ch == '\n' || ch == '\r')
10980                line_pos = 0;
10981        }
10982    }
10983    if (!found && PyUnicode_CheckExact(self)) {
10984        Py_INCREF(self);
10985        return self;
10986    }
10987
10988    /* Second pass: create output string and fill it */
10989    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10990    if (!u)
10991        return NULL;
10992    dest_data = PyUnicode_DATA(u);
10993
10994    i = j = line_pos = 0;
10995
10996    for (; i < src_len; i++) {
10997        ch = PyUnicode_READ(kind, src_data, i);
10998        if (ch == '\t') {
10999            if (tabsize > 0) {
11000                incr = tabsize - (line_pos % tabsize);
11001                line_pos += incr;
11002                while (incr--) {
11003                    PyUnicode_WRITE(kind, dest_data, j, ' ');
11004                    j++;
11005                }
11006            }
11007        }
11008        else {
11009            line_pos++;
11010            PyUnicode_WRITE(kind, dest_data, j, ch);
11011            j++;
11012            if (ch == '\n' || ch == '\r')
11013                line_pos = 0;
11014        }
11015    }
11016    assert (j == PyUnicode_GET_LENGTH(u));
11017#ifndef DONT_MAKE_RESULT_READY
11018    if (_PyUnicode_READY_REPLACE(&u)) {
11019        Py_DECREF(u);
11020        return NULL;
11021    }
11022#endif
11023    assert(_PyUnicode_CheckConsistency(u, 1));
11024    return u;
11025
11026  overflow:
11027    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11028    return NULL;
11029}
11030
11031PyDoc_STRVAR(find__doc__,
11032             "S.find(sub[, start[, end]]) -> int\n\
11033\n\
11034Return the lowest index in S where substring sub is found,\n\
11035such that sub is contained within S[start:end].  Optional\n\
11036arguments start and end are interpreted as in slice notation.\n\
11037\n\
11038Return -1 on failure.");
11039
11040static PyObject *
11041unicode_find(PyObject *self, PyObject *args)
11042{
11043    PyObject *substring;
11044    Py_ssize_t start;
11045    Py_ssize_t end;
11046    Py_ssize_t result;
11047
11048    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11049                                            &start, &end))
11050        return NULL;
11051
11052    if (PyUnicode_READY(self) == -1)
11053        return NULL;
11054    if (PyUnicode_READY(substring) == -1)
11055        return NULL;
11056
11057    result = any_find_slice(1, self, substring, start, end);
11058
11059    Py_DECREF(substring);
11060
11061    if (result == -2)
11062        return NULL;
11063
11064    return PyLong_FromSsize_t(result);
11065}
11066
11067static PyObject *
11068unicode_getitem(PyObject *self, Py_ssize_t index)
11069{
11070    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11071    if (ch == (Py_UCS4)-1)
11072        return NULL;
11073    return PyUnicode_FromOrdinal(ch);
11074}
11075
11076/* Believe it or not, this produces the same value for ASCII strings
11077   as bytes_hash(). */
11078static Py_hash_t
11079unicode_hash(PyObject *self)
11080{
11081    Py_ssize_t len;
11082    Py_uhash_t x;
11083
11084    if (_PyUnicode_HASH(self) != -1)
11085        return _PyUnicode_HASH(self);
11086    if (PyUnicode_READY(self) == -1)
11087        return -1;
11088    len = PyUnicode_GET_LENGTH(self);
11089
11090    /* The hash function as a macro, gets expanded three times below. */
11091#define HASH(P) \
11092    x = (Py_uhash_t)*P << 7; \
11093    while (--len >= 0) \
11094        x = (1000003*x) ^ (Py_uhash_t)*P++;
11095
11096    switch (PyUnicode_KIND(self)) {
11097    case PyUnicode_1BYTE_KIND: {
11098        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11099        HASH(c);
11100        break;
11101    }
11102    case PyUnicode_2BYTE_KIND: {
11103        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11104        HASH(s);
11105        break;
11106    }
11107    default: {
11108        Py_UCS4 *l;
11109        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11110               "Impossible switch case in unicode_hash");
11111        l = PyUnicode_4BYTE_DATA(self);
11112        HASH(l);
11113        break;
11114    }
11115    }
11116    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11117
11118    if (x == -1)
11119        x = -2;
11120    _PyUnicode_HASH(self) = x;
11121    return x;
11122}
11123#undef HASH
11124
11125PyDoc_STRVAR(index__doc__,
11126             "S.index(sub[, start[, end]]) -> int\n\
11127\n\
11128Like S.find() but raise ValueError when the substring is not found.");
11129
11130static PyObject *
11131unicode_index(PyObject *self, PyObject *args)
11132{
11133    Py_ssize_t result;
11134    PyObject *substring;
11135    Py_ssize_t start;
11136    Py_ssize_t end;
11137
11138    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11139                                            &start, &end))
11140        return NULL;
11141
11142    if (PyUnicode_READY(self) == -1)
11143        return NULL;
11144    if (PyUnicode_READY(substring) == -1)
11145        return NULL;
11146
11147    result = any_find_slice(1, self, substring, start, end);
11148
11149    Py_DECREF(substring);
11150
11151    if (result == -2)
11152        return NULL;
11153
11154    if (result < 0) {
11155        PyErr_SetString(PyExc_ValueError, "substring not found");
11156        return NULL;
11157    }
11158
11159    return PyLong_FromSsize_t(result);
11160}
11161
11162PyDoc_STRVAR(islower__doc__,
11163             "S.islower() -> bool\n\
11164\n\
11165Return True if all cased characters in S are lowercase and there is\n\
11166at least one cased character in S, False otherwise.");
11167
11168static PyObject*
11169unicode_islower(PyObject *self)
11170{
11171    Py_ssize_t i, length;
11172    int kind;
11173    void *data;
11174    int cased;
11175
11176    if (PyUnicode_READY(self) == -1)
11177        return NULL;
11178    length = PyUnicode_GET_LENGTH(self);
11179    kind = PyUnicode_KIND(self);
11180    data = PyUnicode_DATA(self);
11181
11182    /* Shortcut for single character strings */
11183    if (length == 1)
11184        return PyBool_FromLong(
11185            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11186
11187    /* Special case for empty strings */
11188    if (length == 0)
11189        return PyBool_FromLong(0);
11190
11191    cased = 0;
11192    for (i = 0; i < length; i++) {
11193        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11194
11195        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11196            return PyBool_FromLong(0);
11197        else if (!cased && Py_UNICODE_ISLOWER(ch))
11198            cased = 1;
11199    }
11200    return PyBool_FromLong(cased);
11201}
11202
11203PyDoc_STRVAR(isupper__doc__,
11204             "S.isupper() -> bool\n\
11205\n\
11206Return True if all cased characters in S are uppercase and there is\n\
11207at least one cased character in S, False otherwise.");
11208
11209static PyObject*
11210unicode_isupper(PyObject *self)
11211{
11212    Py_ssize_t i, length;
11213    int kind;
11214    void *data;
11215    int cased;
11216
11217    if (PyUnicode_READY(self) == -1)
11218        return NULL;
11219    length = PyUnicode_GET_LENGTH(self);
11220    kind = PyUnicode_KIND(self);
11221    data = PyUnicode_DATA(self);
11222
11223    /* Shortcut for single character strings */
11224    if (length == 1)
11225        return PyBool_FromLong(
11226            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11227
11228    /* Special case for empty strings */
11229    if (length == 0)
11230        return PyBool_FromLong(0);
11231
11232    cased = 0;
11233    for (i = 0; i < length; i++) {
11234        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11235
11236        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11237            return PyBool_FromLong(0);
11238        else if (!cased && Py_UNICODE_ISUPPER(ch))
11239            cased = 1;
11240    }
11241    return PyBool_FromLong(cased);
11242}
11243
11244PyDoc_STRVAR(istitle__doc__,
11245             "S.istitle() -> bool\n\
11246\n\
11247Return True if S is a titlecased string and there is at least one\n\
11248character in S, i.e. upper- and titlecase characters may only\n\
11249follow uncased characters and lowercase characters only cased ones.\n\
11250Return False otherwise.");
11251
11252static PyObject*
11253unicode_istitle(PyObject *self)
11254{
11255    Py_ssize_t i, length;
11256    int kind;
11257    void *data;
11258    int cased, previous_is_cased;
11259
11260    if (PyUnicode_READY(self) == -1)
11261        return NULL;
11262    length = PyUnicode_GET_LENGTH(self);
11263    kind = PyUnicode_KIND(self);
11264    data = PyUnicode_DATA(self);
11265
11266    /* Shortcut for single character strings */
11267    if (length == 1) {
11268        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11269        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11270                               (Py_UNICODE_ISUPPER(ch) != 0));
11271    }
11272
11273    /* Special case for empty strings */
11274    if (length == 0)
11275        return PyBool_FromLong(0);
11276
11277    cased = 0;
11278    previous_is_cased = 0;
11279    for (i = 0; i < length; i++) {
11280        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11281
11282        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11283            if (previous_is_cased)
11284                return PyBool_FromLong(0);
11285            previous_is_cased = 1;
11286            cased = 1;
11287        }
11288        else if (Py_UNICODE_ISLOWER(ch)) {
11289            if (!previous_is_cased)
11290                return PyBool_FromLong(0);
11291            previous_is_cased = 1;
11292            cased = 1;
11293        }
11294        else
11295            previous_is_cased = 0;
11296    }
11297    return PyBool_FromLong(cased);
11298}
11299
11300PyDoc_STRVAR(isspace__doc__,
11301             "S.isspace() -> bool\n\
11302\n\
11303Return True if all characters in S are whitespace\n\
11304and there is at least one character in S, False otherwise.");
11305
11306static PyObject*
11307unicode_isspace(PyObject *self)
11308{
11309    Py_ssize_t i, length;
11310    int kind;
11311    void *data;
11312
11313    if (PyUnicode_READY(self) == -1)
11314        return NULL;
11315    length = PyUnicode_GET_LENGTH(self);
11316    kind = PyUnicode_KIND(self);
11317    data = PyUnicode_DATA(self);
11318
11319    /* Shortcut for single character strings */
11320    if (length == 1)
11321        return PyBool_FromLong(
11322            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11323
11324    /* Special case for empty strings */
11325    if (length == 0)
11326        return PyBool_FromLong(0);
11327
11328    for (i = 0; i < length; i++) {
11329        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11330        if (!Py_UNICODE_ISSPACE(ch))
11331            return PyBool_FromLong(0);
11332    }
11333    return PyBool_FromLong(1);
11334}
11335
11336PyDoc_STRVAR(isalpha__doc__,
11337             "S.isalpha() -> bool\n\
11338\n\
11339Return True if all characters in S are alphabetic\n\
11340and there is at least one character in S, False otherwise.");
11341
11342static PyObject*
11343unicode_isalpha(PyObject *self)
11344{
11345    Py_ssize_t i, length;
11346    int kind;
11347    void *data;
11348
11349    if (PyUnicode_READY(self) == -1)
11350        return NULL;
11351    length = PyUnicode_GET_LENGTH(self);
11352    kind = PyUnicode_KIND(self);
11353    data = PyUnicode_DATA(self);
11354
11355    /* Shortcut for single character strings */
11356    if (length == 1)
11357        return PyBool_FromLong(
11358            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11359
11360    /* Special case for empty strings */
11361    if (length == 0)
11362        return PyBool_FromLong(0);
11363
11364    for (i = 0; i < length; i++) {
11365        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11366            return PyBool_FromLong(0);
11367    }
11368    return PyBool_FromLong(1);
11369}
11370
11371PyDoc_STRVAR(isalnum__doc__,
11372             "S.isalnum() -> bool\n\
11373\n\
11374Return True if all characters in S are alphanumeric\n\
11375and there is at least one character in S, False otherwise.");
11376
11377static PyObject*
11378unicode_isalnum(PyObject *self)
11379{
11380    int kind;
11381    void *data;
11382    Py_ssize_t len, i;
11383
11384    if (PyUnicode_READY(self) == -1)
11385        return NULL;
11386
11387    kind = PyUnicode_KIND(self);
11388    data = PyUnicode_DATA(self);
11389    len = PyUnicode_GET_LENGTH(self);
11390
11391    /* Shortcut for single character strings */
11392    if (len == 1) {
11393        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11394        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11395    }
11396
11397    /* Special case for empty strings */
11398    if (len == 0)
11399        return PyBool_FromLong(0);
11400
11401    for (i = 0; i < len; i++) {
11402        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11403        if (!Py_UNICODE_ISALNUM(ch))
11404            return PyBool_FromLong(0);
11405    }
11406    return PyBool_FromLong(1);
11407}
11408
11409PyDoc_STRVAR(isdecimal__doc__,
11410             "S.isdecimal() -> bool\n\
11411\n\
11412Return True if there are only decimal characters in S,\n\
11413False otherwise.");
11414
11415static PyObject*
11416unicode_isdecimal(PyObject *self)
11417{
11418    Py_ssize_t i, length;
11419    int kind;
11420    void *data;
11421
11422    if (PyUnicode_READY(self) == -1)
11423        return NULL;
11424    length = PyUnicode_GET_LENGTH(self);
11425    kind = PyUnicode_KIND(self);
11426    data = PyUnicode_DATA(self);
11427
11428    /* Shortcut for single character strings */
11429    if (length == 1)
11430        return PyBool_FromLong(
11431            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11432
11433    /* Special case for empty strings */
11434    if (length == 0)
11435        return PyBool_FromLong(0);
11436
11437    for (i = 0; i < length; i++) {
11438        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11439            return PyBool_FromLong(0);
11440    }
11441    return PyBool_FromLong(1);
11442}
11443
11444PyDoc_STRVAR(isdigit__doc__,
11445             "S.isdigit() -> bool\n\
11446\n\
11447Return True if all characters in S are digits\n\
11448and there is at least one character in S, False otherwise.");
11449
11450static PyObject*
11451unicode_isdigit(PyObject *self)
11452{
11453    Py_ssize_t i, length;
11454    int kind;
11455    void *data;
11456
11457    if (PyUnicode_READY(self) == -1)
11458        return NULL;
11459    length = PyUnicode_GET_LENGTH(self);
11460    kind = PyUnicode_KIND(self);
11461    data = PyUnicode_DATA(self);
11462
11463    /* Shortcut for single character strings */
11464    if (length == 1) {
11465        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11466        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11467    }
11468
11469    /* Special case for empty strings */
11470    if (length == 0)
11471        return PyBool_FromLong(0);
11472
11473    for (i = 0; i < length; i++) {
11474        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11475            return PyBool_FromLong(0);
11476    }
11477    return PyBool_FromLong(1);
11478}
11479
11480PyDoc_STRVAR(isnumeric__doc__,
11481             "S.isnumeric() -> bool\n\
11482\n\
11483Return True if there are only numeric characters in S,\n\
11484False otherwise.");
11485
11486static PyObject*
11487unicode_isnumeric(PyObject *self)
11488{
11489    Py_ssize_t i, length;
11490    int kind;
11491    void *data;
11492
11493    if (PyUnicode_READY(self) == -1)
11494        return NULL;
11495    length = PyUnicode_GET_LENGTH(self);
11496    kind = PyUnicode_KIND(self);
11497    data = PyUnicode_DATA(self);
11498
11499    /* Shortcut for single character strings */
11500    if (length == 1)
11501        return PyBool_FromLong(
11502            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11503
11504    /* Special case for empty strings */
11505    if (length == 0)
11506        return PyBool_FromLong(0);
11507
11508    for (i = 0; i < length; i++) {
11509        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11510            return PyBool_FromLong(0);
11511    }
11512    return PyBool_FromLong(1);
11513}
11514
11515int
11516PyUnicode_IsIdentifier(PyObject *self)
11517{
11518    int kind;
11519    void *data;
11520    Py_ssize_t i;
11521    Py_UCS4 first;
11522
11523    if (PyUnicode_READY(self) == -1) {
11524        Py_FatalError("identifier not ready");
11525        return 0;
11526    }
11527
11528    /* Special case for empty strings */
11529    if (PyUnicode_GET_LENGTH(self) == 0)
11530        return 0;
11531    kind = PyUnicode_KIND(self);
11532    data = PyUnicode_DATA(self);
11533
11534    /* PEP 3131 says that the first character must be in
11535       XID_Start and subsequent characters in XID_Continue,
11536       and for the ASCII range, the 2.x rules apply (i.e
11537       start with letters and underscore, continue with
11538       letters, digits, underscore). However, given the current
11539       definition of XID_Start and XID_Continue, it is sufficient
11540       to check just for these, except that _ must be allowed
11541       as starting an identifier.  */
11542    first = PyUnicode_READ(kind, data, 0);
11543    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11544        return 0;
11545
11546    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11547        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11548            return 0;
11549    return 1;
11550}
11551
11552PyDoc_STRVAR(isidentifier__doc__,
11553             "S.isidentifier() -> bool\n\
11554\n\
11555Return True if S is a valid identifier according\n\
11556to the language definition.");
11557
11558static PyObject*
11559unicode_isidentifier(PyObject *self)
11560{
11561    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11562}
11563
11564PyDoc_STRVAR(isprintable__doc__,
11565             "S.isprintable() -> bool\n\
11566\n\
11567Return True if all characters in S are considered\n\
11568printable in repr() or S is empty, False otherwise.");
11569
11570static PyObject*
11571unicode_isprintable(PyObject *self)
11572{
11573    Py_ssize_t i, length;
11574    int kind;
11575    void *data;
11576
11577    if (PyUnicode_READY(self) == -1)
11578        return NULL;
11579    length = PyUnicode_GET_LENGTH(self);
11580    kind = PyUnicode_KIND(self);
11581    data = PyUnicode_DATA(self);
11582
11583    /* Shortcut for single character strings */
11584    if (length == 1)
11585        return PyBool_FromLong(
11586            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11587
11588    for (i = 0; i < length; i++) {
11589        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11590            Py_RETURN_FALSE;
11591        }
11592    }
11593    Py_RETURN_TRUE;
11594}
11595
11596PyDoc_STRVAR(join__doc__,
11597             "S.join(iterable) -> str\n\
11598\n\
11599Return a string which is the concatenation of the strings in the\n\
11600iterable.  The separator between elements is S.");
11601
11602static PyObject*
11603unicode_join(PyObject *self, PyObject *data)
11604{
11605    return PyUnicode_Join(self, data);
11606}
11607
11608static Py_ssize_t
11609unicode_length(PyObject *self)
11610{
11611    if (PyUnicode_READY(self) == -1)
11612        return -1;
11613    return PyUnicode_GET_LENGTH(self);
11614}
11615
11616PyDoc_STRVAR(ljust__doc__,
11617             "S.ljust(width[, fillchar]) -> str\n\
11618\n\
11619Return S left-justified in a Unicode string of length width. Padding is\n\
11620done using the specified fill character (default is a space).");
11621
11622static PyObject *
11623unicode_ljust(PyObject *self, PyObject *args)
11624{
11625    Py_ssize_t width;
11626    Py_UCS4 fillchar = ' ';
11627
11628    if (PyUnicode_READY(self) == -1)
11629        return NULL;
11630
11631    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11632        return NULL;
11633
11634    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11635        Py_INCREF(self);
11636        return self;
11637    }
11638
11639    return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
11640}
11641
11642PyDoc_STRVAR(lower__doc__,
11643             "S.lower() -> str\n\
11644\n\
11645Return a copy of the string S converted to lowercase.");
11646
11647static PyObject*
11648unicode_lower(PyObject *self)
11649{
11650    return fixup(self, fixlower);
11651}
11652
/* Strip modes used by the strip helpers below.  The numeric values double
   as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above: one
   optional object argument, followed by the method name used in error
   messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the 3-character "|O:" prefix of the
   matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11661
11662/* externally visible for str.strip(unicode) */
11663PyObject *
11664_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11665{
11666    void *data;
11667    int kind;
11668    Py_ssize_t i, j, len;
11669    BLOOM_MASK sepmask;
11670
11671    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11672        return NULL;
11673
11674    kind = PyUnicode_KIND(self);
11675    data = PyUnicode_DATA(self);
11676    len = PyUnicode_GET_LENGTH(self);
11677    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11678                              PyUnicode_DATA(sepobj),
11679                              PyUnicode_GET_LENGTH(sepobj));
11680
11681    i = 0;
11682    if (striptype != RIGHTSTRIP) {
11683        while (i < len &&
11684               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11685            i++;
11686        }
11687    }
11688
11689    j = len;
11690    if (striptype != LEFTSTRIP) {
11691        do {
11692            j--;
11693        } while (j >= i &&
11694                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11695        j++;
11696    }
11697
11698    return PyUnicode_Substring(self, i, j);
11699}
11700
11701PyObject*
11702PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11703{
11704    unsigned char *data;
11705    int kind;
11706    Py_ssize_t length;
11707
11708    if (PyUnicode_READY(self) == -1)
11709        return NULL;
11710
11711    end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11712
11713    if (start == 0 && end == PyUnicode_GET_LENGTH(self))
11714    {
11715        if (PyUnicode_CheckExact(self)) {
11716            Py_INCREF(self);
11717            return self;
11718        }
11719        else
11720            return PyUnicode_Copy(self);
11721    }
11722
11723    length = end - start;
11724    if (length == 1)
11725        return unicode_getitem(self, start);
11726
11727    if (start < 0 || end < 0) {
11728        PyErr_SetString(PyExc_IndexError, "string index out of range");
11729        return NULL;
11730    }
11731
11732    if (PyUnicode_IS_ASCII(self)) {
11733        kind = PyUnicode_KIND(self);
11734        data = PyUnicode_1BYTE_DATA(self);
11735        return unicode_fromascii(data + start, length);
11736    }
11737    else {
11738        kind = PyUnicode_KIND(self);
11739        data = PyUnicode_1BYTE_DATA(self);
11740        return PyUnicode_FromKindAndData(kind,
11741                                         data + kind * start,
11742                                         length);
11743    }
11744}
11745
11746static PyObject *
11747do_strip(PyObject *self, int striptype)
11748{
11749    int kind;
11750    void *data;
11751    Py_ssize_t len, i, j;
11752
11753    if (PyUnicode_READY(self) == -1)
11754        return NULL;
11755
11756    kind = PyUnicode_KIND(self);
11757    data = PyUnicode_DATA(self);
11758    len = PyUnicode_GET_LENGTH(self);
11759
11760    i = 0;
11761    if (striptype != RIGHTSTRIP) {
11762        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11763            i++;
11764        }
11765    }
11766
11767    j = len;
11768    if (striptype != LEFTSTRIP) {
11769        do {
11770            j--;
11771        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11772        j++;
11773    }
11774
11775    return PyUnicode_Substring(self, i, j);
11776}
11777
11778
11779static PyObject *
11780do_argstrip(PyObject *self, int striptype, PyObject *args)
11781{
11782    PyObject *sep = NULL;
11783
11784    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11785        return NULL;
11786
11787    if (sep != NULL && sep != Py_None) {
11788        if (PyUnicode_Check(sep))
11789            return _PyUnicode_XStrip(self, striptype, sep);
11790        else {
11791            PyErr_Format(PyExc_TypeError,
11792                         "%s arg must be None or str",
11793                         STRIPNAME(striptype));
11794            return NULL;
11795        }
11796    }
11797
11798    return do_strip(self, striptype);
11799}
11800
11801
11802PyDoc_STRVAR(strip__doc__,
11803             "S.strip([chars]) -> str\n\
11804\n\
11805Return a copy of the string S with leading and trailing\n\
11806whitespace removed.\n\
11807If chars is given and not None, remove characters in chars instead.");
11808
11809static PyObject *
11810unicode_strip(PyObject *self, PyObject *args)
11811{
11812    if (PyTuple_GET_SIZE(args) == 0)
11813        return do_strip(self, BOTHSTRIP); /* Common case */
11814    else
11815        return do_argstrip(self, BOTHSTRIP, args);
11816}
11817
11818
11819PyDoc_STRVAR(lstrip__doc__,
11820             "S.lstrip([chars]) -> str\n\
11821\n\
11822Return a copy of the string S with leading whitespace removed.\n\
11823If chars is given and not None, remove characters in chars instead.");
11824
11825static PyObject *
11826unicode_lstrip(PyObject *self, PyObject *args)
11827{
11828    if (PyTuple_GET_SIZE(args) == 0)
11829        return do_strip(self, LEFTSTRIP); /* Common case */
11830    else
11831        return do_argstrip(self, LEFTSTRIP, args);
11832}
11833
11834
11835PyDoc_STRVAR(rstrip__doc__,
11836             "S.rstrip([chars]) -> str\n\
11837\n\
11838Return a copy of the string S with trailing whitespace removed.\n\
11839If chars is given and not None, remove characters in chars instead.");
11840
11841static PyObject *
11842unicode_rstrip(PyObject *self, PyObject *args)
11843{
11844    if (PyTuple_GET_SIZE(args) == 0)
11845        return do_strip(self, RIGHTSTRIP); /* Common case */
11846    else
11847        return do_argstrip(self, RIGHTSTRIP, args);
11848}
11849
11850
11851static PyObject*
11852unicode_repeat(PyObject *str, Py_ssize_t len)
11853{
11854    PyObject *u;
11855    Py_ssize_t nchars, n;
11856
11857    if (len < 1) {
11858        Py_INCREF(unicode_empty);
11859        return unicode_empty;
11860    }
11861
11862    if (len == 1 && PyUnicode_CheckExact(str)) {
11863        /* no repeat, return original string */
11864        Py_INCREF(str);
11865        return str;
11866    }
11867
11868    if (PyUnicode_READY(str) == -1)
11869        return NULL;
11870
11871    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11872        PyErr_SetString(PyExc_OverflowError,
11873                        "repeated string is too long");
11874        return NULL;
11875    }
11876    nchars = len * PyUnicode_GET_LENGTH(str);
11877
11878    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11879    if (!u)
11880        return NULL;
11881    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11882
11883    if (PyUnicode_GET_LENGTH(str) == 1) {
11884        const int kind = PyUnicode_KIND(str);
11885        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11886        void *to = PyUnicode_DATA(u);
11887        if (kind == PyUnicode_1BYTE_KIND)
11888            memset(to, (unsigned char)fill_char, len);
11889        else {
11890            for (n = 0; n < len; ++n)
11891                PyUnicode_WRITE(kind, to, n, fill_char);
11892        }
11893    }
11894    else {
11895        /* number of characters copied this far */
11896        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11897        const Py_ssize_t char_size = PyUnicode_KIND(str);
11898        char *to = (char *) PyUnicode_DATA(u);
11899        Py_MEMCPY(to, PyUnicode_DATA(str),
11900                  PyUnicode_GET_LENGTH(str) * char_size);
11901        while (done < nchars) {
11902            n = (done <= nchars-done) ? done : nchars-done;
11903            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11904            done += n;
11905        }
11906    }
11907
11908    assert(_PyUnicode_CheckConsistency(u, 1));
11909    return u;
11910}
11911
11912PyObject *
11913PyUnicode_Replace(PyObject *obj,
11914                  PyObject *subobj,
11915                  PyObject *replobj,
11916                  Py_ssize_t maxcount)
11917{
11918    PyObject *self;
11919    PyObject *str1;
11920    PyObject *str2;
11921    PyObject *result;
11922
11923    self = PyUnicode_FromObject(obj);
11924    if (self == NULL || PyUnicode_READY(self) == -1)
11925        return NULL;
11926    str1 = PyUnicode_FromObject(subobj);
11927    if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11928        Py_DECREF(self);
11929        return NULL;
11930    }
11931    str2 = PyUnicode_FromObject(replobj);
11932    if (str2 == NULL || PyUnicode_READY(str2)) {
11933        Py_DECREF(self);
11934        Py_DECREF(str1);
11935        return NULL;
11936    }
11937    result = replace(self, str1, str2, maxcount);
11938    Py_DECREF(self);
11939    Py_DECREF(str1);
11940    Py_DECREF(str2);
11941    return result;
11942}
11943
11944PyDoc_STRVAR(replace__doc__,
11945             "S.replace(old, new[, count]) -> str\n\
11946\n\
11947Return a copy of S with all occurrences of substring\n\
11948old replaced by new.  If the optional argument count is\n\
11949given, only the first count occurrences are replaced.");
11950
11951static PyObject*
11952unicode_replace(PyObject *self, PyObject *args)
11953{
11954    PyObject *str1;
11955    PyObject *str2;
11956    Py_ssize_t maxcount = -1;
11957    PyObject *result;
11958
11959    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11960        return NULL;
11961    if (!PyUnicode_READY(self) == -1)
11962        return NULL;
11963    str1 = PyUnicode_FromObject(str1);
11964    if (str1 == NULL || PyUnicode_READY(str1) == -1)
11965        return NULL;
11966    str2 = PyUnicode_FromObject(str2);
11967    if (str2 == NULL || PyUnicode_READY(str2) == -1) {
11968        Py_DECREF(str1);
11969        return NULL;
11970    }
11971
11972    result = replace(self, str1, str2, maxcount);
11973
11974    Py_DECREF(str1);
11975    Py_DECREF(str2);
11976    return result;
11977}
11978
11979static PyObject *
11980unicode_repr(PyObject *unicode)
11981{
11982    PyObject *repr;
11983    Py_ssize_t isize;
11984    Py_ssize_t osize, squote, dquote, i, o;
11985    Py_UCS4 max, quote;
11986    int ikind, okind;
11987    void *idata, *odata;
11988
11989    if (PyUnicode_READY(unicode) == -1)
11990        return NULL;
11991
11992    isize = PyUnicode_GET_LENGTH(unicode);
11993    idata = PyUnicode_DATA(unicode);
11994
11995    /* Compute length of output, quote characters, and
11996       maximum character */
11997    osize = 2; /* quotes */
11998    max = 127;
11999    squote = dquote = 0;
12000    ikind = PyUnicode_KIND(unicode);
12001    for (i = 0; i < isize; i++) {
12002        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12003        switch (ch) {
12004        case '\'': squote++; osize++; break;
12005        case '"':  dquote++; osize++; break;
12006        case '\\': case '\t': case '\r': case '\n':
12007            osize += 2; break;
12008        default:
12009            /* Fast-path ASCII */
12010            if (ch < ' ' || ch == 0x7f)
12011                osize += 4; /* \xHH */
12012            else if (ch < 0x7f)
12013                osize++;
12014            else if (Py_UNICODE_ISPRINTABLE(ch)) {
12015                osize++;
12016                max = ch > max ? ch : max;
12017            }
12018            else if (ch < 0x100)
12019                osize += 4; /* \xHH */
12020            else if (ch < 0x10000)
12021                osize += 6; /* \uHHHH */
12022            else
12023                osize += 10; /* \uHHHHHHHH */
12024        }
12025    }
12026
12027    quote = '\'';
12028    if (squote) {
12029        if (dquote)
12030            /* Both squote and dquote present. Use squote,
12031               and escape them */
12032            osize += squote;
12033        else
12034            quote = '"';
12035    }
12036
12037    repr = PyUnicode_New(osize, max);
12038    if (repr == NULL)
12039        return NULL;
12040    okind = PyUnicode_KIND(repr);
12041    odata = PyUnicode_DATA(repr);
12042
12043    PyUnicode_WRITE(okind, odata, 0, quote);
12044    PyUnicode_WRITE(okind, odata, osize-1, quote);
12045
12046    for (i = 0, o = 1; i < isize; i++) {
12047        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12048
12049        /* Escape quotes and backslashes */
12050        if ((ch == quote) || (ch == '\\')) {
12051            PyUnicode_WRITE(okind, odata, o++, '\\');
12052            PyUnicode_WRITE(okind, odata, o++, ch);
12053            continue;
12054        }
12055
12056        /* Map special whitespace to '\t', \n', '\r' */
12057        if (ch == '\t') {
12058            PyUnicode_WRITE(okind, odata, o++, '\\');
12059            PyUnicode_WRITE(okind, odata, o++, 't');
12060        }
12061        else if (ch == '\n') {
12062            PyUnicode_WRITE(okind, odata, o++, '\\');
12063            PyUnicode_WRITE(okind, odata, o++, 'n');
12064        }
12065        else if (ch == '\r') {
12066            PyUnicode_WRITE(okind, odata, o++, '\\');
12067            PyUnicode_WRITE(okind, odata, o++, 'r');
12068        }
12069
12070        /* Map non-printable US ASCII to '\xhh' */
12071        else if (ch < ' ' || ch == 0x7F) {
12072            PyUnicode_WRITE(okind, odata, o++, '\\');
12073            PyUnicode_WRITE(okind, odata, o++, 'x');
12074            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12075            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12076        }
12077
12078        /* Copy ASCII characters as-is */
12079        else if (ch < 0x7F) {
12080            PyUnicode_WRITE(okind, odata, o++, ch);
12081        }
12082
12083        /* Non-ASCII characters */
12084        else {
12085            /* Map Unicode whitespace and control characters
12086               (categories Z* and C* except ASCII space)
12087            */
12088            if (!Py_UNICODE_ISPRINTABLE(ch)) {
12089                /* Map 8-bit characters to '\xhh' */
12090                if (ch <= 0xff) {
12091                    PyUnicode_WRITE(okind, odata, o++, '\\');
12092                    PyUnicode_WRITE(okind, odata, o++, 'x');
12093                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12094                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12095                }
12096                /* Map 21-bit characters to '\U00xxxxxx' */
12097                else if (ch >= 0x10000) {
12098                    PyUnicode_WRITE(okind, odata, o++, '\\');
12099                    PyUnicode_WRITE(okind, odata, o++, 'U');
12100                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12101                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12102                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12103                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12104                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12105                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12106                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12107                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12108                }
12109                /* Map 16-bit characters to '\uxxxx' */
12110                else {
12111                    PyUnicode_WRITE(okind, odata, o++, '\\');
12112                    PyUnicode_WRITE(okind, odata, o++, 'u');
12113                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12114                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12115                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12116                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12117                }
12118            }
12119            /* Copy characters as-is */
12120            else {
12121                PyUnicode_WRITE(okind, odata, o++, ch);
12122            }
12123        }
12124    }
12125    /* Closing quote already added at the beginning */
12126    assert(_PyUnicode_CheckConsistency(repr, 1));
12127    return repr;
12128}
12129
12130PyDoc_STRVAR(rfind__doc__,
12131             "S.rfind(sub[, start[, end]]) -> int\n\
12132\n\
12133Return the highest index in S where substring sub is found,\n\
12134such that sub is contained within S[start:end].  Optional\n\
12135arguments start and end are interpreted as in slice notation.\n\
12136\n\
12137Return -1 on failure.");
12138
12139static PyObject *
12140unicode_rfind(PyObject *self, PyObject *args)
12141{
12142    PyObject *substring;
12143    Py_ssize_t start;
12144    Py_ssize_t end;
12145    Py_ssize_t result;
12146
12147    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12148                                            &start, &end))
12149        return NULL;
12150
12151    if (PyUnicode_READY(self) == -1)
12152        return NULL;
12153    if (PyUnicode_READY(substring) == -1)
12154        return NULL;
12155
12156    result = any_find_slice(-1, self, substring, start, end);
12157
12158    Py_DECREF(substring);
12159
12160    if (result == -2)
12161        return NULL;
12162
12163    return PyLong_FromSsize_t(result);
12164}
12165
12166PyDoc_STRVAR(rindex__doc__,
12167             "S.rindex(sub[, start[, end]]) -> int\n\
12168\n\
12169Like S.rfind() but raise ValueError when the substring is not found.");
12170
12171static PyObject *
12172unicode_rindex(PyObject *self, PyObject *args)
12173{
12174    PyObject *substring;
12175    Py_ssize_t start;
12176    Py_ssize_t end;
12177    Py_ssize_t result;
12178
12179    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12180                                            &start, &end))
12181        return NULL;
12182
12183    if (PyUnicode_READY(self) == -1)
12184        return NULL;
12185    if (PyUnicode_READY(substring) == -1)
12186        return NULL;
12187
12188    result = any_find_slice(-1, self, substring, start, end);
12189
12190    Py_DECREF(substring);
12191
12192    if (result == -2)
12193        return NULL;
12194
12195    if (result < 0) {
12196        PyErr_SetString(PyExc_ValueError, "substring not found");
12197        return NULL;
12198    }
12199
12200    return PyLong_FromSsize_t(result);
12201}
12202
12203PyDoc_STRVAR(rjust__doc__,
12204             "S.rjust(width[, fillchar]) -> str\n\
12205\n\
12206Return S right-justified in a string of length width. Padding is\n\
12207done using the specified fill character (default is a space).");
12208
12209static PyObject *
12210unicode_rjust(PyObject *self, PyObject *args)
12211{
12212    Py_ssize_t width;
12213    Py_UCS4 fillchar = ' ';
12214
12215    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12216        return NULL;
12217
12218    if (PyUnicode_READY(self) == -1)
12219        return NULL;
12220
12221    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
12222        Py_INCREF(self);
12223        return self;
12224    }
12225
12226    return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
12227}
12228
12229PyObject *
12230PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12231{
12232    PyObject *result;
12233
12234    s = PyUnicode_FromObject(s);
12235    if (s == NULL)
12236        return NULL;
12237    if (sep != NULL) {
12238        sep = PyUnicode_FromObject(sep);
12239        if (sep == NULL) {
12240            Py_DECREF(s);
12241            return NULL;
12242        }
12243    }
12244
12245    result = split(s, sep, maxsplit);
12246
12247    Py_DECREF(s);
12248    Py_XDECREF(sep);
12249    return result;
12250}
12251
12252PyDoc_STRVAR(split__doc__,
12253             "S.split([sep[, maxsplit]]) -> list of strings\n\
12254\n\
12255Return a list of the words in S, using sep as the\n\
12256delimiter string.  If maxsplit is given, at most maxsplit\n\
12257splits are done. If sep is not specified or is None, any\n\
12258whitespace string is a separator and empty strings are\n\
12259removed from the result.");
12260
12261static PyObject*
12262unicode_split(PyObject *self, PyObject *args)
12263{
12264    PyObject *substring = Py_None;
12265    Py_ssize_t maxcount = -1;
12266
12267    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
12268        return NULL;
12269
12270    if (substring == Py_None)
12271        return split(self, NULL, maxcount);
12272    else if (PyUnicode_Check(substring))
12273        return split(self, substring, maxcount);
12274    else
12275        return PyUnicode_Split(self, substring, maxcount);
12276}
12277
12278PyObject *
12279PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12280{
12281    PyObject* str_obj;
12282    PyObject* sep_obj;
12283    PyObject* out;
12284    int kind1, kind2, kind;
12285    void *buf1 = NULL, *buf2 = NULL;
12286    Py_ssize_t len1, len2;
12287
12288    str_obj = PyUnicode_FromObject(str_in);
12289    if (!str_obj || PyUnicode_READY(str_obj) == -1)
12290        return NULL;
12291    sep_obj = PyUnicode_FromObject(sep_in);
12292    if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
12293        Py_DECREF(str_obj);
12294        return NULL;
12295    }
12296
12297    kind1 = PyUnicode_KIND(str_obj);
12298    kind2 = PyUnicode_KIND(sep_obj);
12299    kind = Py_MAX(kind1, kind2);
12300    buf1 = PyUnicode_DATA(str_obj);
12301    if (kind1 != kind)
12302        buf1 = _PyUnicode_AsKind(str_obj, kind);
12303    if (!buf1)
12304        goto onError;
12305    buf2 = PyUnicode_DATA(sep_obj);
12306    if (kind2 != kind)
12307        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12308    if (!buf2)
12309        goto onError;
12310    len1 = PyUnicode_GET_LENGTH(str_obj);
12311    len2 = PyUnicode_GET_LENGTH(sep_obj);
12312
12313    switch(PyUnicode_KIND(str_obj)) {
12314    case PyUnicode_1BYTE_KIND:
12315        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12316            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12317        else
12318            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12319        break;
12320    case PyUnicode_2BYTE_KIND:
12321        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12322        break;
12323    case PyUnicode_4BYTE_KIND:
12324        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12325        break;
12326    default:
12327        assert(0);
12328        out = 0;
12329    }
12330
12331    Py_DECREF(sep_obj);
12332    Py_DECREF(str_obj);
12333    if (kind1 != kind)
12334        PyMem_Free(buf1);
12335    if (kind2 != kind)
12336        PyMem_Free(buf2);
12337
12338    return out;
12339  onError:
12340    Py_DECREF(sep_obj);
12341    Py_DECREF(str_obj);
12342    if (kind1 != kind && buf1)
12343        PyMem_Free(buf1);
12344    if (kind2 != kind && buf2)
12345        PyMem_Free(buf2);
12346    return NULL;
12347}
12348
12349
12350PyObject *
12351PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12352{
12353    PyObject* str_obj;
12354    PyObject* sep_obj;
12355    PyObject* out;
12356    int kind1, kind2, kind;
12357    void *buf1 = NULL, *buf2 = NULL;
12358    Py_ssize_t len1, len2;
12359
12360    str_obj = PyUnicode_FromObject(str_in);
12361    if (!str_obj)
12362        return NULL;
12363    sep_obj = PyUnicode_FromObject(sep_in);
12364    if (!sep_obj) {
12365        Py_DECREF(str_obj);
12366        return NULL;
12367    }
12368
12369    kind1 = PyUnicode_KIND(str_in);
12370    kind2 = PyUnicode_KIND(sep_obj);
12371    kind = Py_MAX(kind1, kind2);
12372    buf1 = PyUnicode_DATA(str_in);
12373    if (kind1 != kind)
12374        buf1 = _PyUnicode_AsKind(str_in, kind);
12375    if (!buf1)
12376        goto onError;
12377    buf2 = PyUnicode_DATA(sep_obj);
12378    if (kind2 != kind)
12379        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12380    if (!buf2)
12381        goto onError;
12382    len1 = PyUnicode_GET_LENGTH(str_obj);
12383    len2 = PyUnicode_GET_LENGTH(sep_obj);
12384
12385    switch(PyUnicode_KIND(str_in)) {
12386    case PyUnicode_1BYTE_KIND:
12387        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12388            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12389        else
12390            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12391        break;
12392    case PyUnicode_2BYTE_KIND:
12393        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12394        break;
12395    case PyUnicode_4BYTE_KIND:
12396        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12397        break;
12398    default:
12399        assert(0);
12400        out = 0;
12401    }
12402
12403    Py_DECREF(sep_obj);
12404    Py_DECREF(str_obj);
12405    if (kind1 != kind)
12406        PyMem_Free(buf1);
12407    if (kind2 != kind)
12408        PyMem_Free(buf2);
12409
12410    return out;
12411  onError:
12412    Py_DECREF(sep_obj);
12413    Py_DECREF(str_obj);
12414    if (kind1 != kind && buf1)
12415        PyMem_Free(buf1);
12416    if (kind2 != kind && buf2)
12417        PyMem_Free(buf2);
12418    return NULL;
12419}
12420
12421PyDoc_STRVAR(partition__doc__,
12422             "S.partition(sep) -> (head, sep, tail)\n\
12423\n\
12424Search for the separator sep in S, and return the part before it,\n\
12425the separator itself, and the part after it.  If the separator is not\n\
12426found, return S and two empty strings.");
12427
12428static PyObject*
12429unicode_partition(PyObject *self, PyObject *separator)
12430{
12431    return PyUnicode_Partition(self, separator);
12432}
12433
12434PyDoc_STRVAR(rpartition__doc__,
12435             "S.rpartition(sep) -> (head, sep, tail)\n\
12436\n\
12437Search for the separator sep in S, starting at the end of S, and return\n\
12438the part before it, the separator itself, and the part after it.  If the\n\
12439separator is not found, return two empty strings and S.");
12440
12441static PyObject*
12442unicode_rpartition(PyObject *self, PyObject *separator)
12443{
12444    return PyUnicode_RPartition(self, separator);
12445}
12446
12447PyObject *
12448PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12449{
12450    PyObject *result;
12451
12452    s = PyUnicode_FromObject(s);
12453    if (s == NULL)
12454        return NULL;
12455    if (sep != NULL) {
12456        sep = PyUnicode_FromObject(sep);
12457        if (sep == NULL) {
12458            Py_DECREF(s);
12459            return NULL;
12460        }
12461    }
12462
12463    result = rsplit(s, sep, maxsplit);
12464
12465    Py_DECREF(s);
12466    Py_XDECREF(sep);
12467    return result;
12468}
12469
12470PyDoc_STRVAR(rsplit__doc__,
12471             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
12472\n\
12473Return a list of the words in S, using sep as the\n\
12474delimiter string, starting at the end of the string and\n\
12475working to the front.  If maxsplit is given, at most maxsplit\n\
12476splits are done. If sep is not specified, any whitespace string\n\
12477is a separator.");
12478
12479static PyObject*
12480unicode_rsplit(PyObject *self, PyObject *args)
12481{
12482    PyObject *substring = Py_None;
12483    Py_ssize_t maxcount = -1;
12484
12485    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
12486        return NULL;
12487
12488    if (substring == Py_None)
12489        return rsplit(self, NULL, maxcount);
12490    else if (PyUnicode_Check(substring))
12491        return rsplit(self, substring, maxcount);
12492    else
12493        return PyUnicode_RSplit(self, substring, maxcount);
12494}
12495
12496PyDoc_STRVAR(splitlines__doc__,
12497             "S.splitlines([keepends]) -> list of strings\n\
12498\n\
12499Return a list of the lines in S, breaking at line boundaries.\n\
12500Line breaks are not included in the resulting list unless keepends\n\
12501is given and true.");
12502
12503static PyObject*
12504unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12505{
12506    static char *kwlist[] = {"keepends", 0};
12507    int keepends = 0;
12508
12509    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12510                                     kwlist, &keepends))
12511        return NULL;
12512
12513    return PyUnicode_Splitlines(self, keepends);
12514}
12515
12516static
12517PyObject *unicode_str(PyObject *self)
12518{
12519    if (PyUnicode_CheckExact(self)) {
12520        Py_INCREF(self);
12521        return self;
12522    } else
12523        /* Subtype -- return genuine unicode string with the same value. */
12524        return PyUnicode_Copy(self);
12525}
12526
12527PyDoc_STRVAR(swapcase__doc__,
12528             "S.swapcase() -> str\n\
12529\n\
12530Return a copy of S with uppercase characters converted to lowercase\n\
12531and vice versa.");
12532
12533static PyObject*
12534unicode_swapcase(PyObject *self)
12535{
12536    return fixup(self, fixswapcase);
12537}
12538
12539PyDoc_STRVAR(maketrans__doc__,
12540             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12541\n\
12542Return a translation table usable for str.translate().\n\
12543If there is only one argument, it must be a dictionary mapping Unicode\n\
12544ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12545Character keys will be then converted to ordinals.\n\
12546If there are two arguments, they must be strings of equal length, and\n\
12547in the resulting dictionary, each character in x will be mapped to the\n\
12548character at the same position in y. If there is a third argument, it\n\
12549must be a string, whose characters will be mapped to None in the result.");
12550
12551static PyObject*
12552unicode_maketrans(PyObject *null, PyObject *args)
12553{
12554    PyObject *x, *y = NULL, *z = NULL;
12555    PyObject *new = NULL, *key, *value;
12556    Py_ssize_t i = 0;
12557    int res;
12558
12559    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12560        return NULL;
12561    new = PyDict_New();
12562    if (!new)
12563        return NULL;
12564    if (y != NULL) {
12565        int x_kind, y_kind, z_kind;
12566        void *x_data, *y_data, *z_data;
12567
12568        /* x must be a string too, of equal length */
12569        if (!PyUnicode_Check(x)) {
12570            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12571                            "be a string if there is a second argument");
12572            goto err;
12573        }
12574        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12575            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12576                            "arguments must have equal length");
12577            goto err;
12578        }
12579        /* create entries for translating chars in x to those in y */
12580        x_kind = PyUnicode_KIND(x);
12581        y_kind = PyUnicode_KIND(y);
12582        x_data = PyUnicode_DATA(x);
12583        y_data = PyUnicode_DATA(y);
12584        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12585            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12586            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12587            if (!key || !value)
12588                goto err;
12589            res = PyDict_SetItem(new, key, value);
12590            Py_DECREF(key);
12591            Py_DECREF(value);
12592            if (res < 0)
12593                goto err;
12594        }
12595        /* create entries for deleting chars in z */
12596        if (z != NULL) {
12597            z_kind = PyUnicode_KIND(z);
12598            z_data = PyUnicode_DATA(z);
12599            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12600                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12601                if (!key)
12602                    goto err;
12603                res = PyDict_SetItem(new, key, Py_None);
12604                Py_DECREF(key);
12605                if (res < 0)
12606                    goto err;
12607            }
12608        }
12609    } else {
12610        int kind;
12611        void *data;
12612
12613        /* x must be a dict */
12614        if (!PyDict_CheckExact(x)) {
12615            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12616                            "to maketrans it must be a dict");
12617            goto err;
12618        }
12619        /* copy entries into the new dict, converting string keys to int keys */
12620        while (PyDict_Next(x, &i, &key, &value)) {
12621            if (PyUnicode_Check(key)) {
12622                /* convert string keys to integer keys */
12623                PyObject *newkey;
12624                if (PyUnicode_GET_LENGTH(key) != 1) {
12625                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12626                                    "table must be of length 1");
12627                    goto err;
12628                }
12629                kind = PyUnicode_KIND(key);
12630                data = PyUnicode_DATA(key);
12631                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12632                if (!newkey)
12633                    goto err;
12634                res = PyDict_SetItem(new, newkey, value);
12635                Py_DECREF(newkey);
12636                if (res < 0)
12637                    goto err;
12638            } else if (PyLong_Check(key)) {
12639                /* just keep integer keys */
12640                if (PyDict_SetItem(new, key, value) < 0)
12641                    goto err;
12642            } else {
12643                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12644                                "be strings or integers");
12645                goto err;
12646            }
12647        }
12648    }
12649    return new;
12650  err:
12651    Py_DECREF(new);
12652    return NULL;
12653}
12654
12655PyDoc_STRVAR(translate__doc__,
12656             "S.translate(table) -> str\n\
12657\n\
12658Return a copy of the string S, where all characters have been mapped\n\
12659through the given translation table, which must be a mapping of\n\
12660Unicode ordinals to Unicode ordinals, strings, or None.\n\
12661Unmapped characters are left untouched. Characters mapped to None\n\
12662are deleted.");
12663
12664static PyObject*
12665unicode_translate(PyObject *self, PyObject *table)
12666{
12667    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12668}
12669
12670PyDoc_STRVAR(upper__doc__,
12671             "S.upper() -> str\n\
12672\n\
12673Return a copy of S converted to uppercase.");
12674
12675static PyObject*
12676unicode_upper(PyObject *self)
12677{
12678    return fixup(self, fixupper);
12679}
12680
12681PyDoc_STRVAR(zfill__doc__,
12682             "S.zfill(width) -> str\n\
12683\n\
12684Pad a numeric string S with zeros on the left, to fill a field\n\
12685of the specified width. The string S is never truncated.");
12686
12687static PyObject *
12688unicode_zfill(PyObject *self, PyObject *args)
12689{
12690    Py_ssize_t fill;
12691    PyObject *u;
12692    Py_ssize_t width;
12693    int kind;
12694    void *data;
12695    Py_UCS4 chr;
12696
12697    if (PyUnicode_READY(self) == -1)
12698        return NULL;
12699
12700    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12701        return NULL;
12702
12703    if (PyUnicode_GET_LENGTH(self) >= width) {
12704        if (PyUnicode_CheckExact(self)) {
12705            Py_INCREF(self);
12706            return self;
12707        }
12708        else
12709            return PyUnicode_Copy(self);
12710    }
12711
12712    fill = width - _PyUnicode_LENGTH(self);
12713
12714    u = pad(self, fill, 0, '0');
12715
12716    if (u == NULL)
12717        return NULL;
12718
12719    kind = PyUnicode_KIND(u);
12720    data = PyUnicode_DATA(u);
12721    chr = PyUnicode_READ(kind, data, fill);
12722
12723    if (chr == '+' || chr == '-') {
12724        /* move sign to beginning of string */
12725        PyUnicode_WRITE(kind, data, 0, chr);
12726        PyUnicode_WRITE(kind, data, fill, '0');
12727    }
12728
12729    assert(_PyUnicode_CheckConsistency(u, 1));
12730    return u;
12731}
12732
#if 0
/* Compiled out by the surrounding "#if 0".  Forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12740
12741PyDoc_STRVAR(startswith__doc__,
12742             "S.startswith(prefix[, start[, end]]) -> bool\n\
12743\n\
12744Return True if S starts with the specified prefix, False otherwise.\n\
12745With optional start, test S beginning at that position.\n\
12746With optional end, stop comparing S at that position.\n\
12747prefix can also be a tuple of strings to try.");
12748
12749static PyObject *
12750unicode_startswith(PyObject *self,
12751                   PyObject *args)
12752{
12753    PyObject *subobj;
12754    PyObject *substring;
12755    Py_ssize_t start = 0;
12756    Py_ssize_t end = PY_SSIZE_T_MAX;
12757    int result;
12758
12759    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12760        return NULL;
12761    if (PyTuple_Check(subobj)) {
12762        Py_ssize_t i;
12763        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12764            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12765            if (substring == NULL)
12766                return NULL;
12767            result = tailmatch(self, substring, start, end, -1);
12768            Py_DECREF(substring);
12769            if (result) {
12770                Py_RETURN_TRUE;
12771            }
12772        }
12773        /* nothing matched */
12774        Py_RETURN_FALSE;
12775    }
12776    substring = PyUnicode_FromObject(subobj);
12777    if (substring == NULL) {
12778        if (PyErr_ExceptionMatches(PyExc_TypeError))
12779            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12780                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12781        return NULL;
12782    }
12783    result = tailmatch(self, substring, start, end, -1);
12784    Py_DECREF(substring);
12785    return PyBool_FromLong(result);
12786}
12787
12788
12789PyDoc_STRVAR(endswith__doc__,
12790             "S.endswith(suffix[, start[, end]]) -> bool\n\
12791\n\
12792Return True if S ends with the specified suffix, False otherwise.\n\
12793With optional start, test S beginning at that position.\n\
12794With optional end, stop comparing S at that position.\n\
12795suffix can also be a tuple of strings to try.");
12796
12797static PyObject *
12798unicode_endswith(PyObject *self,
12799                 PyObject *args)
12800{
12801    PyObject *subobj;
12802    PyObject *substring;
12803    Py_ssize_t start = 0;
12804    Py_ssize_t end = PY_SSIZE_T_MAX;
12805    int result;
12806
12807    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12808        return NULL;
12809    if (PyTuple_Check(subobj)) {
12810        Py_ssize_t i;
12811        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12812            substring = PyUnicode_FromObject(
12813                PyTuple_GET_ITEM(subobj, i));
12814            if (substring == NULL)
12815                return NULL;
12816            result = tailmatch(self, substring, start, end, +1);
12817            Py_DECREF(substring);
12818            if (result) {
12819                Py_RETURN_TRUE;
12820            }
12821        }
12822        Py_RETURN_FALSE;
12823    }
12824    substring = PyUnicode_FromObject(subobj);
12825    if (substring == NULL) {
12826        if (PyErr_ExceptionMatches(PyExc_TypeError))
12827            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12828                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12829        return NULL;
12830    }
12831    result = tailmatch(self, substring, start, end, +1);
12832    Py_DECREF(substring);
12833    return PyBool_FromLong(result);
12834}
12835
12836#include "stringlib/unicode_format.h"
12837
12838PyDoc_STRVAR(format__doc__,
12839             "S.format(*args, **kwargs) -> str\n\
12840\n\
12841Return a formatted version of S, using substitutions from args and kwargs.\n\
12842The substitutions are identified by braces ('{' and '}').");
12843
12844PyDoc_STRVAR(format_map__doc__,
12845             "S.format_map(mapping) -> str\n\
12846\n\
12847Return a formatted version of S, using substitutions from mapping.\n\
12848The substitutions are identified by braces ('{' and '}').");
12849
12850static PyObject *
12851unicode__format__(PyObject* self, PyObject* args)
12852{
12853    PyObject *format_spec, *out;
12854
12855    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12856        return NULL;
12857
12858    out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
12859                                     PyUnicode_GET_LENGTH(format_spec));
12860    return out;
12861}
12862
12863PyDoc_STRVAR(p_format__doc__,
12864             "S.__format__(format_spec) -> str\n\
12865\n\
12866Return a formatted version of S as described by format_spec.");
12867
12868static PyObject *
12869unicode__sizeof__(PyObject *v)
12870{
12871    Py_ssize_t size;
12872
12873    /* If it's a compact object, account for base structure +
12874       character data. */
12875    if (PyUnicode_IS_COMPACT_ASCII(v))
12876        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12877    else if (PyUnicode_IS_COMPACT(v))
12878        size = sizeof(PyCompactUnicodeObject) +
12879            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
12880    else {
12881        /* If it is a two-block object, account for base object, and
12882           for character block if present. */
12883        size = sizeof(PyUnicodeObject);
12884        if (_PyUnicode_DATA_ANY(v))
12885            size += (PyUnicode_GET_LENGTH(v) + 1) *
12886                PyUnicode_KIND(v);
12887    }
12888    /* If the wstr pointer is present, account for it unless it is shared
12889       with the data pointer. Check if the data is not shared. */
12890    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12891        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12892    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12893        size += PyUnicode_UTF8_LENGTH(v) + 1;
12894
12895    return PyLong_FromSsize_t(size);
12896}
12897
/* Docstring text for the "__sizeof__" entry of the str method table. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
12900
12901static PyObject *
12902unicode_getnewargs(PyObject *v)
12903{
12904    PyObject *copy = PyUnicode_Copy(v);
12905    if (!copy)
12906        return NULL;
12907    return Py_BuildValue("(N)", copy);
12908}
12909
/* Method table for the str type.

   Each entry gives the Python-level method name, the C function that
   implements it, the calling-convention flags (METH_NOARGS, METH_O,
   METH_VARARGS, optionally combined with METH_KEYWORDS or METH_STATIC)
   and the docstring.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only static method in the table. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
    /* capwords is compiled out. */
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* Registered without a docstring (the fourth field is omitted). */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}  /* sentinel */
};
12973
12974static PyObject *
12975unicode_mod(PyObject *v, PyObject *w)
12976{
12977    if (!PyUnicode_Check(v))
12978        Py_RETURN_NOTIMPLEMENTED;
12979    return PyUnicode_Format(v, w);
12980}
12981
/* Number protocol slots for str.  Only nb_remainder (the "%" operator,
   handled by unicode_mod) is filled in; the slots after it are not
   listed and are therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
12988
/* Sequence protocol slots for str: length, concatenation, repetition,
   item access and containment.  The slice and item-assignment slots
   are set to 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
12999
13000static PyObject*
13001unicode_subscript(PyObject* self, PyObject* item)
13002{
13003    if (PyUnicode_READY(self) == -1)
13004        return NULL;
13005
13006    if (PyIndex_Check(item)) {
13007        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13008        if (i == -1 && PyErr_Occurred())
13009            return NULL;
13010        if (i < 0)
13011            i += PyUnicode_GET_LENGTH(self);
13012        return unicode_getitem(self, i);
13013    } else if (PySlice_Check(item)) {
13014        Py_ssize_t start, stop, step, slicelength, cur, i;
13015        PyObject *result;
13016        void *src_data, *dest_data;
13017        int src_kind, dest_kind;
13018        Py_UCS4 ch, max_char, kind_limit;
13019
13020        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13021                                 &start, &stop, &step, &slicelength) < 0) {
13022            return NULL;
13023        }
13024
13025        if (slicelength <= 0) {
13026            return PyUnicode_New(0, 0);
13027        } else if (start == 0 && step == 1 &&
13028                   slicelength == PyUnicode_GET_LENGTH(self) &&
13029                   PyUnicode_CheckExact(self)) {
13030            Py_INCREF(self);
13031            return self;
13032        } else if (step == 1) {
13033            return PyUnicode_Substring(self,
13034                                       start, start + slicelength);
13035        }
13036        /* General case */
13037        src_kind = PyUnicode_KIND(self);
13038        src_data = PyUnicode_DATA(self);
13039        if (!PyUnicode_IS_ASCII(self)) {
13040            kind_limit = kind_maxchar_limit(src_kind);
13041            max_char = 0;
13042            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13043                ch = PyUnicode_READ(src_kind, src_data, cur);
13044                if (ch > max_char) {
13045                    max_char = ch;
13046                    if (max_char >= kind_limit)
13047                        break;
13048                }
13049            }
13050        }
13051        else
13052            max_char = 127;
13053        result = PyUnicode_New(slicelength, max_char);
13054        if (result == NULL)
13055            return NULL;
13056        dest_kind = PyUnicode_KIND(result);
13057        dest_data = PyUnicode_DATA(result);
13058
13059        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13060            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13061            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13062        }
13063        assert(_PyUnicode_CheckConsistency(result, 1));
13064        return result;
13065    } else {
13066        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13067        return NULL;
13068    }
13069}
13070
/* Mapping protocol slots for str: length and subscripting (integer
   indices and slices, see unicode_subscript).  Subscript assignment is
   not provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13076
13077
13078/* Helpers for PyUnicode_Format() */
13079
13080static PyObject *
13081getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
13082{
13083    Py_ssize_t argidx = *p_argidx;
13084    if (argidx < arglen) {
13085        (*p_argidx)++;
13086        if (arglen < 0)
13087            return args;
13088        else
13089            return PyTuple_GetItem(args, argidx);
13090    }
13091    PyErr_SetString(PyExc_TypeError,
13092                    "not enough arguments for format string");
13093    return NULL;
13094}
13095
13096/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13097
13098static PyObject *
13099formatfloat(PyObject *v, int flags, int prec, int type)
13100{
13101    char *p;
13102    PyObject *result;
13103    double x;
13104
13105    x = PyFloat_AsDouble(v);
13106    if (x == -1.0 && PyErr_Occurred())
13107        return NULL;
13108
13109    if (prec < 0)
13110        prec = 6;
13111
13112    p = PyOS_double_to_string(x, type, prec,
13113                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
13114    if (p == NULL)
13115        return NULL;
13116    result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
13117    PyMem_Free(p);
13118    return result;
13119}
13120
13121static PyObject*
13122formatlong(PyObject *val, int flags, int prec, int type)
13123{
13124    char *buf;
13125    int len;
13126    PyObject *str; /* temporary string object. */
13127    PyObject *result;
13128
13129    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13130    if (!str)
13131        return NULL;
13132    result = PyUnicode_DecodeASCII(buf, len, NULL);
13133    Py_DECREF(str);
13134    return result;
13135}
13136
13137static Py_UCS4
13138formatchar(PyObject *v)
13139{
13140    /* presume that the buffer is at least 3 characters long */
13141    if (PyUnicode_Check(v)) {
13142        if (PyUnicode_GET_LENGTH(v) == 1) {
13143            return PyUnicode_READ_CHAR(v, 0);
13144        }
13145        goto onError;
13146    }
13147    else {
13148        /* Integer input truncated to a character */
13149        long x;
13150        x = PyLong_AsLong(v);
13151        if (x == -1 && PyErr_Occurred())
13152            goto onError;
13153
13154        if (x < 0 || x > 0x10ffff) {
13155            PyErr_SetString(PyExc_OverflowError,
13156                            "%c arg not in range(0x110000)");
13157            return (Py_UCS4) -1;
13158        }
13159
13160        return (Py_UCS4) x;
13161    }
13162
13163  onError:
13164    PyErr_SetString(PyExc_TypeError,
13165                    "%c requires int or char");
13166    return (Py_UCS4) -1;
13167}
13168
13169static int
13170repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13171{
13172    int r;
13173    assert(count > 0);
13174    assert(PyUnicode_Check(obj));
13175    if (count > 5) {
13176        PyObject *repeated = unicode_repeat(obj, count);
13177        if (repeated == NULL)
13178            return -1;
13179        r = _PyAccu_Accumulate(acc, repeated);
13180        Py_DECREF(repeated);
13181        return r;
13182    }
13183    else {
13184        do {
13185            if (_PyAccu_Accumulate(acc, obj))
13186                return -1;
13187        } while (--count);
13188        return 0;
13189    }
13190}
13191
13192PyObject *
13193PyUnicode_Format(PyObject *format, PyObject *args)
13194{
13195    void *fmt;
13196    int fmtkind;
13197    PyObject *result;
13198    int kind;
13199    int r;
13200    Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
13201    int args_owned = 0;
13202    PyObject *dict = NULL;
13203    PyObject *temp = NULL;
13204    PyObject *second = NULL;
13205    PyObject *uformat;
13206    _PyAccu acc;
13207    static PyObject *plus, *minus, *blank, *zero, *percent;
13208
13209    if (!plus && !(plus = get_latin1_char('+')))
13210        return NULL;
13211    if (!minus && !(minus = get_latin1_char('-')))
13212        return NULL;
13213    if (!blank && !(blank = get_latin1_char(' ')))
13214        return NULL;
13215    if (!zero && !(zero = get_latin1_char('0')))
13216        return NULL;
13217    if (!percent && !(percent = get_latin1_char('%')))
13218        return NULL;
13219
13220    if (format == NULL || args == NULL) {
13221        PyErr_BadInternalCall();
13222        return NULL;
13223    }
13224    uformat = PyUnicode_FromObject(format);
13225    if (uformat == NULL || PyUnicode_READY(uformat) == -1)
13226        return NULL;
13227    if (_PyAccu_Init(&acc))
13228        goto onError;
13229    fmt = PyUnicode_DATA(uformat);
13230    fmtkind = PyUnicode_KIND(uformat);
13231    fmtcnt = PyUnicode_GET_LENGTH(uformat);
13232    fmtpos = 0;
13233
13234    if (PyTuple_Check(args)) {
13235        arglen = PyTuple_Size(args);
13236        argidx = 0;
13237    }
13238    else {
13239        arglen = -1;
13240        argidx = -2;
13241    }
13242    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
13243        !PyUnicode_Check(args))
13244        dict = args;
13245
13246    while (--fmtcnt >= 0) {
13247        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13248            PyObject *nonfmt;
13249            Py_ssize_t nonfmtpos;
13250            nonfmtpos = fmtpos++;
13251            while (fmtcnt >= 0 &&
13252                   PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13253                fmtpos++;
13254                fmtcnt--;
13255            }
13256            nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
13257            if (nonfmt == NULL)
13258                goto onError;
13259            r = _PyAccu_Accumulate(&acc, nonfmt);
13260            Py_DECREF(nonfmt);
13261            if (r)
13262                goto onError;
13263        }
13264        else {
13265            /* Got a format specifier */
13266            int flags = 0;
13267            Py_ssize_t width = -1;
13268            int prec = -1;
13269            Py_UCS4 c = '\0';
13270            Py_UCS4 fill, sign;
13271            int isnumok;
13272            PyObject *v = NULL;
13273            void *pbuf = NULL;
13274            Py_ssize_t pindex, len;
13275            PyObject *signobj = NULL, *fillobj = NULL;
13276
13277            fmtpos++;
13278            if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13279                Py_ssize_t keystart;
13280                Py_ssize_t keylen;
13281                PyObject *key;
13282                int pcount = 1;
13283
13284                if (dict == NULL) {
13285                    PyErr_SetString(PyExc_TypeError,
13286                                    "format requires a mapping");
13287                    goto onError;
13288                }
13289                ++fmtpos;
13290                --fmtcnt;
13291                keystart = fmtpos;
13292                /* Skip over balanced parentheses */
13293                while (pcount > 0 && --fmtcnt >= 0) {
13294                    if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
13295                        --pcount;
13296                    else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
13297                        ++pcount;
13298                    fmtpos++;
13299                }
13300                keylen = fmtpos - keystart - 1;
13301                if (fmtcnt < 0 || pcount > 0) {
13302                    PyErr_SetString(PyExc_ValueError,
13303                                    "incomplete format key");
13304                    goto onError;
13305                }
13306                key = PyUnicode_Substring(uformat,
13307                                          keystart, keystart + keylen);
13308                if (key == NULL)
13309                    goto onError;
13310                if (args_owned) {
13311                    Py_DECREF(args);
13312                    args_owned = 0;
13313                }
13314                args = PyObject_GetItem(dict, key);
13315                Py_DECREF(key);
13316                if (args == NULL) {
13317                    goto onError;
13318                }
13319                args_owned = 1;
13320                arglen = -1;
13321                argidx = -2;
13322            }
13323            while (--fmtcnt >= 0) {
13324                switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
13325                case '-': flags |= F_LJUST; continue;
13326                case '+': flags |= F_SIGN; continue;
13327                case ' ': flags |= F_BLANK; continue;
13328                case '#': flags |= F_ALT; continue;
13329                case '0': flags |= F_ZERO; continue;
13330                }
13331                break;
13332            }
13333            if (c == '*') {
13334                v = getnextarg(args, arglen, &argidx);
13335                if (v == NULL)
13336                    goto onError;
13337                if (!PyLong_Check(v)) {
13338                    PyErr_SetString(PyExc_TypeError,
13339                                    "* wants int");
13340                    goto onError;
13341                }
13342                width = PyLong_AsLong(v);
13343                if (width == -1 && PyErr_Occurred())
13344                    goto onError;
13345                if (width < 0) {
13346                    flags |= F_LJUST;
13347                    width = -width;
13348                }
13349                if (--fmtcnt >= 0)
13350                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13351            }
13352            else if (c >= '0' && c <= '9') {
13353                width = c - '0';
13354                while (--fmtcnt >= 0) {
13355                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13356                    if (c < '0' || c > '9')
13357                        break;
13358                    if ((width*10) / 10 != width) {
13359                        PyErr_SetString(PyExc_ValueError,
13360                                        "width too big");
13361                        goto onError;
13362                    }
13363                    width = width*10 + (c - '0');
13364                }
13365            }
13366            if (c == '.') {
13367                prec = 0;
13368                if (--fmtcnt >= 0)
13369                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13370                if (c == '*') {
13371                    v = getnextarg(args, arglen, &argidx);
13372                    if (v == NULL)
13373                        goto onError;
13374                    if (!PyLong_Check(v)) {
13375                        PyErr_SetString(PyExc_TypeError,
13376                                        "* wants int");
13377                        goto onError;
13378                    }
13379                    prec = PyLong_AsLong(v);
13380                    if (prec == -1 && PyErr_Occurred())
13381                        goto onError;
13382                    if (prec < 0)
13383                        prec = 0;
13384                    if (--fmtcnt >= 0)
13385                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13386                }
13387                else if (c >= '0' && c <= '9') {
13388                    prec = c - '0';
13389                    while (--fmtcnt >= 0) {
13390                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13391                        if (c < '0' || c > '9')
13392                            break;
13393                        if ((prec*10) / 10 != prec) {
13394                            PyErr_SetString(PyExc_ValueError,
13395                                            "prec too big");
13396                            goto onError;
13397                        }
13398                        prec = prec*10 + (c - '0');
13399                    }
13400                }
13401            } /* prec */
13402            if (fmtcnt >= 0) {
13403                if (c == 'h' || c == 'l' || c == 'L') {
13404                    if (--fmtcnt >= 0)
13405                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13406                }
13407            }
13408            if (fmtcnt < 0) {
13409                PyErr_SetString(PyExc_ValueError,
13410                                "incomplete format");
13411                goto onError;
13412            }
13413            if (c != '%') {
13414                v = getnextarg(args, arglen, &argidx);
13415                if (v == NULL)
13416                    goto onError;
13417            }
13418            sign = 0;
13419            fill = ' ';
13420            fillobj = blank;
13421            switch (c) {
13422
13423            case '%':
13424                _PyAccu_Accumulate(&acc, percent);
13425                continue;
13426
13427            case 's':
13428            case 'r':
13429            case 'a':
13430                if (PyUnicode_CheckExact(v) && c == 's') {
13431                    temp = v;
13432                    Py_INCREF(temp);
13433                }
13434                else {
13435                    if (c == 's')
13436                        temp = PyObject_Str(v);
13437                    else if (c == 'r')
13438                        temp = PyObject_Repr(v);
13439                    else
13440                        temp = PyObject_ASCII(v);
13441                    if (temp == NULL)
13442                        goto onError;
13443                    if (PyUnicode_Check(temp))
13444                        /* nothing to do */;
13445                    else {
13446                        Py_DECREF(temp);
13447                        PyErr_SetString(PyExc_TypeError,
13448                                        "%s argument has non-string str()");
13449                        goto onError;
13450                    }
13451                }
13452                if (PyUnicode_READY(temp) == -1) {
13453                    Py_CLEAR(temp);
13454                    goto onError;
13455                }
13456                pbuf = PyUnicode_DATA(temp);
13457                kind = PyUnicode_KIND(temp);
13458                len = PyUnicode_GET_LENGTH(temp);
13459                if (prec >= 0 && len > prec)
13460                    len = prec;
13461                break;
13462
13463            case 'i':
13464            case 'd':
13465            case 'u':
13466            case 'o':
13467            case 'x':
13468            case 'X':
13469                isnumok = 0;
13470                if (PyNumber_Check(v)) {
13471                    PyObject *iobj=NULL;
13472
13473                    if (PyLong_Check(v)) {
13474                        iobj = v;
13475                        Py_INCREF(iobj);
13476                    }
13477                    else {
13478                        iobj = PyNumber_Long(v);
13479                    }
13480                    if (iobj!=NULL) {
13481                        if (PyLong_Check(iobj)) {
13482                            isnumok = 1;
13483                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
13484                            Py_DECREF(iobj);
13485                            if (!temp)
13486                                goto onError;
13487                            if (PyUnicode_READY(temp) == -1) {
13488                                Py_CLEAR(temp);
13489                                goto onError;
13490                            }
13491                            pbuf = PyUnicode_DATA(temp);
13492                            kind = PyUnicode_KIND(temp);
13493                            len = PyUnicode_GET_LENGTH(temp);
13494                            sign = 1;
13495                        }
13496                        else {
13497                            Py_DECREF(iobj);
13498                        }
13499                    }
13500                }
13501                if (!isnumok) {
13502                    PyErr_Format(PyExc_TypeError,
13503                                 "%%%c format: a number is required, "
13504                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13505                    goto onError;
13506                }
13507                if (flags & F_ZERO) {
13508                    fill = '0';
13509                    fillobj = zero;
13510                }
13511                break;
13512
13513            case 'e':
13514            case 'E':
13515            case 'f':
13516            case 'F':
13517            case 'g':
13518            case 'G':
13519                temp = formatfloat(v, flags, prec, c);
13520                if (!temp)
13521                    goto onError;
13522                if (PyUnicode_READY(temp) == -1) {
13523                    Py_CLEAR(temp);
13524                    goto onError;
13525                }
13526                pbuf = PyUnicode_DATA(temp);
13527                kind = PyUnicode_KIND(temp);
13528                len = PyUnicode_GET_LENGTH(temp);
13529                sign = 1;
13530                if (flags & F_ZERO) {
13531                    fill = '0';
13532                    fillobj = zero;
13533                }
13534                break;
13535
13536            case 'c':
13537            {
13538                Py_UCS4 ch = formatchar(v);
13539                if (ch == (Py_UCS4) -1)
13540                    goto onError;
13541                temp = _PyUnicode_FromUCS4(&ch, 1);
13542                if (temp == NULL)
13543                    goto onError;
13544                pbuf = PyUnicode_DATA(temp);
13545                kind = PyUnicode_KIND(temp);
13546                len = PyUnicode_GET_LENGTH(temp);
13547                break;
13548            }
13549
13550            default:
13551                PyErr_Format(PyExc_ValueError,
13552                             "unsupported format character '%c' (0x%x) "
13553                             "at index %zd",
13554                             (31<=c && c<=126) ? (char)c : '?',
13555                             (int)c,
13556                             fmtpos - 1);
13557                goto onError;
13558            }
13559            /* pbuf is initialized here. */
13560            pindex = 0;
13561            if (sign) {
13562                if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13563                    signobj = minus;
13564                    len--;
13565                    pindex++;
13566                }
13567                else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13568                    signobj = plus;
13569                    len--;
13570                    pindex++;
13571                }
13572                else if (flags & F_SIGN)
13573                    signobj = plus;
13574                else if (flags & F_BLANK)
13575                    signobj = blank;
13576                else
13577                    sign = 0;
13578            }
13579            if (width < len)
13580                width = len;
13581            if (sign) {
13582                if (fill != ' ') {
13583                    assert(signobj != NULL);
13584                    if (_PyAccu_Accumulate(&acc, signobj))
13585                        goto onError;
13586                }
13587                if (width > len)
13588                    width--;
13589            }
13590            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13591                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13592                assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
13593                if (fill != ' ') {
13594                    second = get_latin1_char(
13595                        PyUnicode_READ(kind, pbuf, pindex + 1));
13596                    pindex += 2;
13597                    if (second == NULL ||
13598                        _PyAccu_Accumulate(&acc, zero) ||
13599                        _PyAccu_Accumulate(&acc, second))
13600                        goto onError;
13601                    Py_CLEAR(second);
13602                }
13603                width -= 2;
13604                if (width < 0)
13605                    width = 0;
13606                len -= 2;
13607            }
13608            if (width > len && !(flags & F_LJUST)) {
13609                assert(fillobj != NULL);
13610                if (repeat_accumulate(&acc, fillobj, width - len))
13611                    goto onError;
13612                width = len;
13613            }
13614            if (fill == ' ') {
13615                if (sign) {
13616                    assert(signobj != NULL);
13617                    if (_PyAccu_Accumulate(&acc, signobj))
13618                        goto onError;
13619                }
13620                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13621                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13622                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13623                    second = get_latin1_char(
13624                        PyUnicode_READ(kind, pbuf, pindex + 1));
13625                    pindex += 2;
13626                    if (second == NULL ||
13627                        _PyAccu_Accumulate(&acc, zero) ||
13628                        _PyAccu_Accumulate(&acc, second))
13629                        goto onError;
13630                    Py_CLEAR(second);
13631                }
13632            }
13633            /* Copy all characters, preserving len */
13634            if (temp != NULL) {
13635                assert(pbuf == PyUnicode_DATA(temp));
13636                v = PyUnicode_Substring(temp, pindex, pindex + len);
13637            }
13638            else {
13639                const char *p = (const char *) pbuf;
13640                assert(pbuf != NULL);
13641                p += kind * pindex;
13642                v = PyUnicode_FromKindAndData(kind, p, len);
13643            }
13644            if (v == NULL)
13645                goto onError;
13646            r = _PyAccu_Accumulate(&acc, v);
13647            Py_DECREF(v);
13648            if (r)
13649                goto onError;
13650            if (width > len && repeat_accumulate(&acc, blank, width - len))
13651                goto onError;
13652            if (dict && (argidx < arglen) && c != '%') {
13653                PyErr_SetString(PyExc_TypeError,
13654                                "not all arguments converted during string formatting");
13655                goto onError;
13656            }
13657            Py_CLEAR(temp);
13658        } /* '%' */
13659    } /* until end */
13660    if (argidx < arglen && !dict) {
13661        PyErr_SetString(PyExc_TypeError,
13662                        "not all arguments converted during string formatting");
13663        goto onError;
13664    }
13665
13666    result = _PyAccu_Finish(&acc);
13667    if (args_owned) {
13668        Py_DECREF(args);
13669    }
13670    Py_DECREF(uformat);
13671    Py_XDECREF(temp);
13672    Py_XDECREF(second);
13673    return result;
13674
13675  onError:
13676    Py_DECREF(uformat);
13677    Py_XDECREF(temp);
13678    Py_XDECREF(second);
13679    _PyAccu_Destroy(&acc);
13680    if (args_owned) {
13681        Py_DECREF(args);
13682    }
13683    return NULL;
13684}
13685
13686static PyObject *
13687unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13688
13689static PyObject *
13690unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13691{
13692    PyObject *x = NULL;
13693    static char *kwlist[] = {"object", "encoding", "errors", 0};
13694    char *encoding = NULL;
13695    char *errors = NULL;
13696
13697    if (type != &PyUnicode_Type)
13698        return unicode_subtype_new(type, args, kwds);
13699    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
13700                                     kwlist, &x, &encoding, &errors))
13701        return NULL;
13702    if (x == NULL)
13703        return PyUnicode_New(0, 0);
13704    if (encoding == NULL && errors == NULL)
13705        return PyObject_Str(x);
13706    else
13707        return PyUnicode_FromEncodedObject(x, encoding, errors);
13708}
13709
13710static PyObject *
13711unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13712{
13713    PyObject *unicode, *self;
13714    Py_ssize_t length, char_size;
13715    int share_wstr, share_utf8;
13716    unsigned int kind;
13717    void *data;
13718
13719    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13720
13721    unicode = unicode_new(&PyUnicode_Type, args, kwds);
13722    if (unicode == NULL)
13723        return NULL;
13724    assert(_PyUnicode_CHECK(unicode));
13725    if (PyUnicode_READY(unicode))
13726        return NULL;
13727
13728    self = type->tp_alloc(type, 0);
13729    if (self == NULL) {
13730        Py_DECREF(unicode);
13731        return NULL;
13732    }
13733    kind = PyUnicode_KIND(unicode);
13734    length = PyUnicode_GET_LENGTH(unicode);
13735
13736    _PyUnicode_LENGTH(self) = length;
13737#ifdef Py_DEBUG
13738    _PyUnicode_HASH(self) = -1;
13739#else
13740    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13741#endif
13742    _PyUnicode_STATE(self).interned = 0;
13743    _PyUnicode_STATE(self).kind = kind;
13744    _PyUnicode_STATE(self).compact = 0;
13745    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13746    _PyUnicode_STATE(self).ready = 1;
13747    _PyUnicode_WSTR(self) = NULL;
13748    _PyUnicode_UTF8_LENGTH(self) = 0;
13749    _PyUnicode_UTF8(self) = NULL;
13750    _PyUnicode_WSTR_LENGTH(self) = 0;
13751    _PyUnicode_DATA_ANY(self) = NULL;
13752
13753    share_utf8 = 0;
13754    share_wstr = 0;
13755    if (kind == PyUnicode_1BYTE_KIND) {
13756        char_size = 1;
13757        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13758            share_utf8 = 1;
13759    }
13760    else if (kind == PyUnicode_2BYTE_KIND) {
13761        char_size = 2;
13762        if (sizeof(wchar_t) == 2)
13763            share_wstr = 1;
13764    }
13765    else {
13766        assert(kind == PyUnicode_4BYTE_KIND);
13767        char_size = 4;
13768        if (sizeof(wchar_t) == 4)
13769            share_wstr = 1;
13770    }
13771
13772    /* Ensure we won't overflow the length. */
13773    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13774        PyErr_NoMemory();
13775        goto onError;
13776    }
13777    data = PyObject_MALLOC((length + 1) * char_size);
13778    if (data == NULL) {
13779        PyErr_NoMemory();
13780        goto onError;
13781    }
13782
13783    _PyUnicode_DATA_ANY(self) = data;
13784    if (share_utf8) {
13785        _PyUnicode_UTF8_LENGTH(self) = length;
13786        _PyUnicode_UTF8(self) = data;
13787    }
13788    if (share_wstr) {
13789        _PyUnicode_WSTR_LENGTH(self) = length;
13790        _PyUnicode_WSTR(self) = (wchar_t *)data;
13791    }
13792
13793    Py_MEMCPY(data, PyUnicode_DATA(unicode),
13794              kind * (length + 1));
13795    assert(_PyUnicode_CheckConsistency(self, 1));
13796#ifdef Py_DEBUG
13797    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13798#endif
13799    Py_DECREF(unicode);
13800    return self;
13801
13802onError:
13803    Py_DECREF(unicode);
13804    Py_DECREF(self);
13805    return NULL;
13806}
13807
/* Docstring for the str type; referenced by the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
13814
13815static PyObject *unicode_iter(PyObject *seq);
13816
/* Type object for str.  Instances are created through unicode_new and
   released through unicode_dealloc; the type allows subclassing
   (Py_TPFLAGS_BASETYPE) and is flagged as a unicode subclass. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
13860
13861/* Initialize the Unicode implementation */
13862
13863int _PyUnicode_Init(void)
13864{
13865    int i;
13866
13867    /* XXX - move this array to unicodectype.c ? */
13868    Py_UCS2 linebreak[] = {
13869        0x000A, /* LINE FEED */
13870        0x000D, /* CARRIAGE RETURN */
13871        0x001C, /* FILE SEPARATOR */
13872        0x001D, /* GROUP SEPARATOR */
13873        0x001E, /* RECORD SEPARATOR */
13874        0x0085, /* NEXT LINE */
13875        0x2028, /* LINE SEPARATOR */
13876        0x2029, /* PARAGRAPH SEPARATOR */
13877    };
13878
13879    /* Init the implementation */
13880    unicode_empty = PyUnicode_New(0, 0);
13881    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
13882    if (!unicode_empty)
13883        Py_FatalError("Can't create empty string");
13884
13885    for (i = 0; i < 256; i++)
13886        unicode_latin1[i] = NULL;
13887    if (PyType_Ready(&PyUnicode_Type) < 0)
13888        Py_FatalError("Can't initialize 'unicode'");
13889
13890    /* initialize the linebreak bloom filter */
13891    bloom_linebreak = make_bloom_mask(
13892        PyUnicode_2BYTE_KIND, linebreak,
13893        Py_ARRAY_LENGTH(linebreak));
13894
13895    PyType_Ready(&EncodingMapType);
13896
13897#ifdef HAVE_MBCS
13898    winver.dwOSVersionInfoSize = sizeof(winver);
13899    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13900        PyErr_SetFromWindowsErr(0);
13901        return -1;
13902    }
13903#endif
13904    return 0;
13905}
13906
13907/* Finalize the Unicode implementation */
13908
/* Free-list clearing entry point.  This implementation performs no work
   and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13914
13915void
13916_PyUnicode_Fini(void)
13917{
13918    int i;
13919
13920    Py_XDECREF(unicode_empty);
13921    unicode_empty = NULL;
13922
13923    for (i = 0; i < 256; i++) {
13924        if (unicode_latin1[i]) {
13925            Py_DECREF(unicode_latin1[i]);
13926            unicode_latin1[i] = NULL;
13927        }
13928    }
13929    _PyUnicode_ClearStaticStrings();
13930    (void)PyUnicode_ClearFreeList();
13931}
13932
13933void
13934PyUnicode_InternInPlace(PyObject **p)
13935{
13936    register PyObject *s = *p;
13937    PyObject *t;
13938#ifdef Py_DEBUG
13939    assert(s != NULL);
13940    assert(_PyUnicode_CHECK(s));
13941#else
13942    if (s == NULL || !PyUnicode_Check(s))
13943        return;
13944#endif
13945    /* If it's a subclass, we don't really know what putting
13946       it in the interned dict might do. */
13947    if (!PyUnicode_CheckExact(s))
13948        return;
13949    if (PyUnicode_CHECK_INTERNED(s))
13950        return;
13951    if (_PyUnicode_READY_REPLACE(p)) {
13952        assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
13953        return;
13954    }
13955    s = *p;
13956    if (interned == NULL) {
13957        interned = PyDict_New();
13958        if (interned == NULL) {
13959            PyErr_Clear(); /* Don't leave an exception */
13960            return;
13961        }
13962    }
13963    /* It might be that the GetItem call fails even
13964       though the key is present in the dictionary,
13965       namely when this happens during a stack overflow. */
13966    Py_ALLOW_RECURSION
13967    t = PyDict_GetItem(interned, s);
13968    Py_END_ALLOW_RECURSION
13969
13970        if (t) {
13971            Py_INCREF(t);
13972            Py_DECREF(*p);
13973            *p = t;
13974            return;
13975        }
13976
13977    PyThreadState_GET()->recursion_critical = 1;
13978    if (PyDict_SetItem(interned, s, s) < 0) {
13979        PyErr_Clear();
13980        PyThreadState_GET()->recursion_critical = 0;
13981        return;
13982    }
13983    PyThreadState_GET()->recursion_critical = 0;
13984    /* The two references in interned are not counted by refcnt.
13985       The deallocator will take care of this */
13986    Py_REFCNT(s) -= 2;
13987    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
13988}
13989
13990void
13991PyUnicode_InternImmortal(PyObject **p)
13992{
13993    PyUnicode_InternInPlace(p);
13994    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
13995        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
13996        Py_INCREF(*p);
13997    }
13998}
13999
14000PyObject *
14001PyUnicode_InternFromString(const char *cp)
14002{
14003    PyObject *s = PyUnicode_FromString(cp);
14004    if (s == NULL)
14005        return NULL;
14006    PyUnicode_InternInPlace(&s);
14007    return s;
14008}
14009
14010void
14011_Py_ReleaseInternedUnicodeStrings(void)
14012{
14013    PyObject *keys;
14014    PyObject *s;
14015    Py_ssize_t i, n;
14016    Py_ssize_t immortal_size = 0, mortal_size = 0;
14017
14018    if (interned == NULL || !PyDict_Check(interned))
14019        return;
14020    keys = PyDict_Keys(interned);
14021    if (keys == NULL || !PyList_Check(keys)) {
14022        PyErr_Clear();
14023        return;
14024    }
14025
14026    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14027       detector, interned unicode strings are not forcibly deallocated;
14028       rather, we give them their stolen references back, and then clear
14029       and DECREF the interned dict. */
14030
14031    n = PyList_GET_SIZE(keys);
14032    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
14033            n);
14034    for (i = 0; i < n; i++) {
14035        s = PyList_GET_ITEM(keys, i);
14036        if (PyUnicode_READY(s) == -1) {
14037            assert(0 && "could not ready string");
14038            fprintf(stderr, "could not ready string\n");
14039        }
14040        switch (PyUnicode_CHECK_INTERNED(s)) {
14041        case SSTATE_NOT_INTERNED:
14042            /* XXX Shouldn't happen */
14043            break;
14044        case SSTATE_INTERNED_IMMORTAL:
14045            Py_REFCNT(s) += 1;
14046            immortal_size += PyUnicode_GET_LENGTH(s);
14047            break;
14048        case SSTATE_INTERNED_MORTAL:
14049            Py_REFCNT(s) += 2;
14050            mortal_size += PyUnicode_GET_LENGTH(s);
14051            break;
14052        default:
14053            Py_FatalError("Inconsistent interned string state.");
14054        }
14055        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
14056    }
14057    fprintf(stderr, "total size of all interned strings: "
14058            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14059            "mortal/immortal\n", mortal_size, immortal_size);
14060    Py_DECREF(keys);
14061    PyDict_Clear(interned);
14062    Py_DECREF(interned);
14063    interned = NULL;
14064}
14065
14066
14067/********************* Unicode Iterator **************************/
14068
/* State of a str iterator (instances of PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to produce */
    PyObject *it_seq;    /* String being iterated; set to NULL when the
                            iterator is exhausted */
} unicodeiterobject;
14074
14075static void
14076unicodeiter_dealloc(unicodeiterobject *it)
14077{
14078    _PyObject_GC_UNTRACK(it);
14079    Py_XDECREF(it->it_seq);
14080    PyObject_GC_Del(it);
14081}
14082
14083static int
14084unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14085{
14086    Py_VISIT(it->it_seq);
14087    return 0;
14088}
14089
14090static PyObject *
14091unicodeiter_next(unicodeiterobject *it)
14092{
14093    PyObject *seq, *item;
14094
14095    assert(it != NULL);
14096    seq = it->it_seq;
14097    if (seq == NULL)
14098        return NULL;
14099    assert(_PyUnicode_CHECK(seq));
14100
14101    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14102        int kind = PyUnicode_KIND(seq);
14103        void *data = PyUnicode_DATA(seq);
14104        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14105        item = PyUnicode_FromOrdinal(chr);
14106        if (item != NULL)
14107            ++it->it_index;
14108        return item;
14109    }
14110
14111    Py_DECREF(seq);
14112    it->it_seq = NULL;
14113    return NULL;
14114}
14115
14116static PyObject *
14117unicodeiter_len(unicodeiterobject *it)
14118{
14119    Py_ssize_t len = 0;
14120    if (it->it_seq)
14121        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14122    return PyLong_FromSsize_t(len);
14123}
14124
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type; it exposes only
   __length_hint__, implemented by unicodeiter_len. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
14132
/* Type object for str iterators ("str_iterator").  Instances are
   GC-enabled (Py_TPFLAGS_HAVE_GC) and are their own iterator
   (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
14165
14166static PyObject *
14167unicode_iter(PyObject *seq)
14168{
14169    unicodeiterobject *it;
14170
14171    if (!PyUnicode_Check(seq)) {
14172        PyErr_BadInternalCall();
14173        return NULL;
14174    }
14175    if (PyUnicode_READY(seq) == -1)
14176        return NULL;
14177    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14178    if (it == NULL)
14179        return NULL;
14180    it->it_index = 0;
14181    Py_INCREF(seq);
14182    it->it_seq = seq;
14183    _PyObject_GC_TRACK(it);
14184    return (PyObject *)it;
14185}
14186
14187
14188size_t
14189Py_UNICODE_strlen(const Py_UNICODE *u)
14190{
14191    int res = 0;
14192    while(*u++)
14193        res++;
14194    return res;
14195}
14196
14197Py_UNICODE*
14198Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14199{
14200    Py_UNICODE *u = s1;
14201    while ((*u++ = *s2++));
14202    return s1;
14203}
14204
14205Py_UNICODE*
14206Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14207{
14208    Py_UNICODE *u = s1;
14209    while ((*u++ = *s2++))
14210        if (n-- == 0)
14211            break;
14212    return s1;
14213}
14214
14215Py_UNICODE*
14216Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14217{
14218    Py_UNICODE *u1 = s1;
14219    u1 += Py_UNICODE_strlen(u1);
14220    Py_UNICODE_strcpy(u1, s2);
14221    return s1;
14222}
14223
14224int
14225Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14226{
14227    while (*s1 && *s2 && *s1 == *s2)
14228        s1++, s2++;
14229    if (*s1 && *s2)
14230        return (*s1 < *s2) ? -1 : +1;
14231    if (*s1)
14232        return 1;
14233    if (*s2)
14234        return -1;
14235    return 0;
14236}
14237
14238int
14239Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14240{
14241    register Py_UNICODE u1, u2;
14242    for (; n != 0; n--) {
14243        u1 = *s1;
14244        u2 = *s2;
14245        if (u1 != u2)
14246            return (u1 < u2) ? -1 : +1;
14247        if (u1 == '\0')
14248            return 0;
14249        s1++;
14250        s2++;
14251    }
14252    return 0;
14253}
14254
14255Py_UNICODE*
14256Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14257{
14258    const Py_UNICODE *p;
14259    for (p = s; *p; p++)
14260        if (*p == c)
14261            return (Py_UNICODE*)p;
14262    return NULL;
14263}
14264
14265Py_UNICODE*
14266Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14267{
14268    const Py_UNICODE *p;
14269    p = s + Py_UNICODE_strlen(s);
14270    while (p != s) {
14271        p--;
14272        if (*p == c)
14273            return (Py_UNICODE*)p;
14274    }
14275    return NULL;
14276}
14277
14278Py_UNICODE*
14279PyUnicode_AsUnicodeCopy(PyObject *unicode)
14280{
14281    Py_UNICODE *u, *copy;
14282    Py_ssize_t len, size;
14283
14284    if (!PyUnicode_Check(unicode)) {
14285        PyErr_BadArgument();
14286        return NULL;
14287    }
14288    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14289    if (u == NULL)
14290        return NULL;
14291    /* Ensure we won't overflow the size. */
14292    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14293        PyErr_NoMemory();
14294        return NULL;
14295    }
14296    size = len + 1; /* copy the null character */
14297    size *= sizeof(Py_UNICODE);
14298    copy = PyMem_Malloc(size);
14299    if (copy == NULL) {
14300        PyErr_NoMemory();
14301        return NULL;
14302    }
14303    memcpy(copy, u, size);
14304    return copy;
14305}
14306
14307/* A _string module, to export formatter_parser and formatter_field_name_split
14308   to the string.Formatter class implemented in Python. */
14309
/* Method table of the _string module.  Both entries are METH_O functions,
   i.e. each receives exactly one argument object. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
14317
/* Module definition for _string; passed to PyModule_Create() by
   PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_reload */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
14329
14330PyMODINIT_FUNC
14331PyInit__string(void)
14332{
14333    return PyModule_Create(&_string_module);
14334}
14335
14336
14337#ifdef __cplusplus
14338}
14339#endif
14340