unicodeobject.c revision 12be46ca8418593fb2716234912b6a8a8d262966
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44
45#ifdef MS_WINDOWS
46#include <windows.h>
47#endif
48
/* Debug builds define DONT_MAKE_RESULT_READY.
   NOTE(review): the code that tests this switch is not visible in this part
   of the file -- confirm its effect where it is used. */
#ifdef Py_DEBUG
#  define DONT_MAKE_RESULT_READY
#endif

/* Endianness switches; defaults to little endian */

/* Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by the WORDS_BIGENDIAN configuration macro. */
#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
60
61/* --- Globals ------------------------------------------------------------
62
63   The globals are initialized by the _PyUnicode_Init() API and should
64   not be used before calling that API.
65
66*/
67
68
69#ifdef __cplusplus
70extern "C" {
71#endif
72
/* Type check used by the accessor macros below.  Debug builds run the
   structural consistency checker (without content checks); release builds
   only verify the object type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the utf8 field of a PyCompactUnicodeObject (no checks;
   usable as an lvalue). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 buffer: for compact ASCII strings this is the
   memory directly following the PyASCIIObject header, otherwise it is the
   utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field (no checks; usable as an lvalue). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: compact ASCII strings report the length field of
   the PyASCIIObject header, everything else reports utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw, unchecked field accessors (all usable as lvalues). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked accessors for state.kind and length; unlike PyUnicode_UTF8()
   they do not assert that the string is ready. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
113
/* File-local replacement for PyUnicode_READY: evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(op).
   The argument is evaluated more than once. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
120
/* Ready the string referenced through the pointer-to-pointer p_obj.
   Evaluates to 0 when *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace(p_obj).
   p_obj is fully parenthesized so that expression arguments such as
   `items + 1` dereference the intended slot.  The argument is evaluated
   more than once, so it must not have side effects. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
125
/* True if the utf8 pointer of op aliases its canonical data buffer.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op aliases its canonical data buffer.
   The macro only refers to its own parameter, so it works regardless of
   what the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string is
   not ready yet or wstr differs from the data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written.

   Each source element is converted with a plain cast to to_type; the caller
   must provide room for (end - begin) elements at "to".  The main loop is
   unrolled four elements at a time and a tail loop handles the remainder.
   Every pointer argument is evaluated exactly once and "to" is parenthesized
   before the cast, so expression arguments (e.g. buf + offset) are safe. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *) (to);                \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
173
/* The Unicode string has been modified: reset the cached hash to -1. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)

/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance.  It may still
   be NULL; the allocators in this file test for that before handing it out. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well. */
static PyObject *unicode_latin1[256];
196
/* Lookup table for the most common whitespace characters: entry i is 1 when
   the 7-bit code point i is whitespace, 0 otherwise.  Sixteen entries per
   row; the row's first code point is given on the left.

   Marked code points:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
227
/* forward declarations of file-local helpers that are used before their
   definitions appear */

/* Allocates a non-compact (wstr based) string object of the given length. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
/* NOTE(review): presumably returns the shared one-character string for ch
   (see unicode_latin1) -- definition not visible here. */
static PyObject* get_latin1_char(unsigned char ch);
/* Used in this file to copy how_many characters from `from` (starting at
   from_start) into `to` (starting at to_start). */
static void copy_characters(
    PyObject *to, Py_ssize_t to_start,
    PyObject *from, Py_ssize_t from_start,
    Py_ssize_t how_many);
#ifdef Py_DEBUG
/* Debug-only predicate consulted by _PyUnicode_CheckConsistency(). */
static int unicode_is_singleton(PyObject *unicode);
#endif

/* Constructors from raw character buffers of the named width.
   NOTE(review): exact semantics live with the definitions further down. */
static PyObject *
unicode_fromascii(const unsigned char *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error-handling helpers (defined later in the file). */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
260
/* Lookup table for line break characters: entry i is 1 when the 7-bit code
   point i is a line break, 0 otherwise.  Sixteen entries per row; the row's
   first code point is given on the left.

   Marked code points:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR */
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
288
289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290   This function is kept for backward compatibility with the old API. */
291Py_UNICODE
292PyUnicode_GetMax(void)
293{
294#ifdef Py_UNICODE_WIDE
295    return 0x10FFFF;
296#else
297    /* This is actually an illegal character, so it should
298       not be passed to unichr. */
299    return 0xFFFF;
300#endif
301}
302
#ifdef Py_DEBUG
/* Debug-only structural self check of a unicode object.

   op            -- object to verify; must pass PyUnicode_Check().
   check_content -- when non-zero, additionally scan the characters to verify
                    that the narrowest possible kind is in use, and require
                    non-singleton strings to have an unset hash (-1).

   Every violated invariant trips an assert().  The function always returns
   1 so that callers can wrap the call itself in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the PyASCIIObject header exists, so none of the
       PyCompactUnicodeObject fields may be inspected. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: characters follow the compact header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        } else {
            /* Non-compact (legacy) object: characters live in data.any. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr representation exists. */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* A legacy ASCII string shares its data buffer as UTF-8;
                   any other legacy string must not. */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* wstr must alias the data buffer exactly when the kind has the
           same width as wchar_t, and must not alias it otherwise. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* Absent caches must report a zero length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        /* Find the largest code point actually stored. */
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND)
            assert(maxchar >= 0x100);
        else
            assert(maxchar >= 0x10000);
    }
    /* Only singletons may carry a cached hash when content is checked. */
    if (check_content && !unicode_is_singleton(op))
        assert(ascii->hash == -1);
    return 1;
}
#endif
407
#ifdef HAVE_MBCS
/* Windows version information; only declared when HAVE_MBCS is defined.
   NOTE(review): presumably filled in during module initialisation -- the
   code that sets and reads it is not visible in this part of the file. */
static OSVERSIONINFOEX winver;
#endif
411
/* --- Bloom Filters ----------------------------------------------------- */

/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the bit index is taken
   from the low-order bits of each Unicode character, i.e.
   ch & (BLOOM_WIDTH - 1). */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of mask bits used: the largest of 128, 64 or 32
   that does not exceed LONG_BIT.  Platforms with LONG_BIT below 32 are
   rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Storage type of a bloom mask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue);
   BLOOM tests it and yields non-zero when the bit is set.  A set bit only
   means "possibly present": several characters map to the same bit. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: code points below 128 are answered from the
   ascii_linebreak table; others are pre-filtered through bloom_linebreak
   before calling Py_UNICODE_ISLINEBREAK().  ch is evaluated more than
   once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
440
441Py_LOCAL_INLINE(BLOOM_MASK)
442make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
443{
444    /* calculate simple bloom-style bitmask for a given unicode string */
445
446    BLOOM_MASK mask;
447    Py_ssize_t i;
448
449    mask = 0;
450    for (i = 0; i < len; i++)
451        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
452
453    return mask;
454}
455
/* Membership test for chr in the string object str: the bloom mask gives a
   cheap "possibly present" answer, which is then confirmed with
   PyUnicode_FindChar() over the whole string.  chr and str are evaluated
   more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
459
460/* Compilation of templated routines */
461
462#include "stringlib/asciilib.h"
463#include "stringlib/fastsearch.h"
464#include "stringlib/partition.h"
465#include "stringlib/split.h"
466#include "stringlib/count.h"
467#include "stringlib/find.h"
468#include "stringlib/find_max_char.h"
469#include "stringlib/localeutil.h"
470#include "stringlib/undef.h"
471
472#include "stringlib/ucs1lib.h"
473#include "stringlib/fastsearch.h"
474#include "stringlib/partition.h"
475#include "stringlib/split.h"
476#include "stringlib/count.h"
477#include "stringlib/find.h"
478#include "stringlib/find_max_char.h"
479#include "stringlib/localeutil.h"
480#include "stringlib/undef.h"
481
482#include "stringlib/ucs2lib.h"
483#include "stringlib/fastsearch.h"
484#include "stringlib/partition.h"
485#include "stringlib/split.h"
486#include "stringlib/count.h"
487#include "stringlib/find.h"
488#include "stringlib/find_max_char.h"
489#include "stringlib/localeutil.h"
490#include "stringlib/undef.h"
491
492#include "stringlib/ucs4lib.h"
493#include "stringlib/fastsearch.h"
494#include "stringlib/partition.h"
495#include "stringlib/split.h"
496#include "stringlib/count.h"
497#include "stringlib/find.h"
498#include "stringlib/find_max_char.h"
499#include "stringlib/localeutil.h"
500#include "stringlib/undef.h"
501
502#include "stringlib/unicodedefs.h"
503#include "stringlib/fastsearch.h"
504#include "stringlib/count.h"
505#include "stringlib/find.h"
506
507/* --- Unicode Object ----------------------------------------------------- */
508
/* Forward declaration.  fixfct is a callback that receives a string object
   and returns a Py_UCS4.
   NOTE(review): the definition is not visible in this part of the file --
   see it for the exact contract of the callback and the return value. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
511
512Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
513                                     Py_ssize_t size, Py_UCS4 ch,
514                                     int direction)
515{
516    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
517
518    switch (kind) {
519    case PyUnicode_1BYTE_KIND:
520        {
521            Py_UCS1 ch1 = (Py_UCS1) ch;
522            if (ch1 == ch)
523                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
524            else
525                return -1;
526        }
527    case PyUnicode_2BYTE_KIND:
528        {
529            Py_UCS2 ch2 = (Py_UCS2) ch;
530            if (ch2 == ch)
531                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
532            else
533                return -1;
534        }
535    case PyUnicode_4BYTE_KIND:
536        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
537    default:
538        assert(0);
539        return -1;
540    }
541}
542
543static PyObject*
544resize_compact(PyObject *unicode, Py_ssize_t length)
545{
546    Py_ssize_t char_size;
547    Py_ssize_t struct_size;
548    Py_ssize_t new_size;
549    int share_wstr;
550
551    assert(PyUnicode_IS_READY(unicode));
552    char_size = PyUnicode_KIND(unicode);
553    if (PyUnicode_IS_COMPACT_ASCII(unicode))
554        struct_size = sizeof(PyASCIIObject);
555    else
556        struct_size = sizeof(PyCompactUnicodeObject);
557    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
558
559    _Py_DEC_REFTOTAL;
560    _Py_ForgetReference(unicode);
561
562    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
563        PyErr_NoMemory();
564        return NULL;
565    }
566    new_size = (struct_size + (length + 1) * char_size);
567
568    unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
569    if (unicode == NULL) {
570        PyObject_Del(unicode);
571        PyErr_NoMemory();
572        return NULL;
573    }
574    _Py_NewReference(unicode);
575    _PyUnicode_LENGTH(unicode) = length;
576    if (share_wstr) {
577        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
578        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
579            _PyUnicode_WSTR_LENGTH(unicode) = length;
580    }
581    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
582                    length, 0);
583    return unicode;
584}
585
/* Resize a non-compact string object in place to `length` characters.

   unicode -- non-compact string with a reference count of 1 (both asserted)
   length  -- new number of characters

   The object address does not change; only the separately allocated
   buffers (data and/or wstr) are reallocated.  The cached hash is reset.
   Returns 0 on success, -1 with MemoryError set on failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    _PyUnicode_DIRTY(unicode);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        Py_ssize_t new_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);
        assert(data != NULL);
        char_size = PyUnicode_KIND(unicode);
        /* Remember which secondary pointers alias the data buffer so they
           can be re-pointed after the realloc. */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
        /* A separately allocated UTF-8 buffer is dropped. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* check for integer overflow of (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* On failure only the local `data` is lost; the object still
           references the original buffer. */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* null-terminate the character data */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        /* Done unless a separately allocated wstr buffer also exists; in
           that case fall through and resize it as well. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    /* Reallocate through a local so the object keeps its old wstr pointer
       if the realloc fails. */
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
659
660static PyObject*
661resize_copy(PyObject *unicode, Py_ssize_t length)
662{
663    Py_ssize_t copy_length;
664    if (PyUnicode_IS_COMPACT(unicode)) {
665        PyObject *copy;
666        assert(PyUnicode_IS_READY(unicode));
667
668        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
669        if (copy == NULL)
670            return NULL;
671
672        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
673        copy_characters(copy, 0, unicode, 0, copy_length);
674        return copy;
675    }
676    else {
677        PyObject *w;
678        assert(_PyUnicode_WSTR(unicode) != NULL);
679        assert(_PyUnicode_DATA_ANY(unicode) == NULL);
680        w = (PyObject*)_PyUnicode_New(length);
681        if (w == NULL)
682            return NULL;
683        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
684        copy_length = Py_MIN(copy_length, length);
685        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
686                        copy_length);
687        return w;
688    }
689}
690
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
699
#ifdef Py_DEBUG
/* Debug-build counter: number of objects allocated through the legacy
   _PyUnicode_New() constructor. */
static int unicode_old_new_calls = 0;
#endif
703
704static PyUnicodeObject *
705_PyUnicode_New(Py_ssize_t length)
706{
707    register PyUnicodeObject *unicode;
708    size_t new_size;
709
710    /* Optimization for empty strings */
711    if (length == 0 && unicode_empty != NULL) {
712        Py_INCREF(unicode_empty);
713        return (PyUnicodeObject*)unicode_empty;
714    }
715
716    /* Ensure we won't overflow the size. */
717    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
718        return (PyUnicodeObject *)PyErr_NoMemory();
719    }
720    if (length < 0) {
721        PyErr_SetString(PyExc_SystemError,
722                        "Negative size passed to _PyUnicode_New");
723        return NULL;
724    }
725
726#ifdef Py_DEBUG
727    ++unicode_old_new_calls;
728#endif
729
730    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
731    if (unicode == NULL)
732        return NULL;
733    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
734    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
735    if (!_PyUnicode_WSTR(unicode)) {
736        PyErr_NoMemory();
737        goto onError;
738    }
739
740    /* Initialize the first element to guard against cases where
741     * the caller fails before initializing str -- unicode_resize()
742     * reads str[0], and the Keep-Alive optimization can keep memory
743     * allocated for str alive across a call to unicode_dealloc(unicode).
744     * We don't want unicode_resize to read uninitialized memory in
745     * that case.
746     */
747    _PyUnicode_WSTR(unicode)[0] = 0;
748    _PyUnicode_WSTR(unicode)[length] = 0;
749    _PyUnicode_WSTR_LENGTH(unicode) = length;
750    _PyUnicode_HASH(unicode) = -1;
751    _PyUnicode_STATE(unicode).interned = 0;
752    _PyUnicode_STATE(unicode).kind = 0;
753    _PyUnicode_STATE(unicode).compact = 0;
754    _PyUnicode_STATE(unicode).ready = 0;
755    _PyUnicode_STATE(unicode).ascii = 0;
756    _PyUnicode_DATA_ANY(unicode) = NULL;
757    _PyUnicode_LENGTH(unicode) = 0;
758    _PyUnicode_UTF8(unicode) = NULL;
759    _PyUnicode_UTF8_LENGTH(unicode) = 0;
760    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
761    return unicode;
762
763  onError:
764    /* XXX UNREF/NEWREF interface should be more symmetrical */
765    _Py_DEC_REFTOTAL;
766    _Py_ForgetReference((PyObject *)unicode);
767    PyObject_Del(unicode);
768    return NULL;
769}
770
771static const char*
772unicode_kind_name(PyObject *unicode)
773{
774    /* don't check consistency: unicode_kind_name() is called from
775       _PyUnicode_Dump() */
776    if (!PyUnicode_IS_COMPACT(unicode))
777    {
778        if (!PyUnicode_IS_READY(unicode))
779            return "wstr";
780        switch(PyUnicode_KIND(unicode))
781        {
782        case PyUnicode_1BYTE_KIND:
783            if (PyUnicode_IS_ASCII(unicode))
784                return "legacy ascii";
785            else
786                return "legacy latin1";
787        case PyUnicode_2BYTE_KIND:
788            return "legacy UCS2";
789        case PyUnicode_4BYTE_KIND:
790            return "legacy UCS4";
791        default:
792            return "<legacy invalid kind>";
793        }
794    }
795    assert(PyUnicode_IS_READY(unicode));
796    switch(PyUnicode_KIND(unicode))
797    {
798    case PyUnicode_1BYTE_KIND:
799        if (PyUnicode_IS_ASCII(unicode))
800            return "ascii";
801        else
802            return "latin1";
803    case PyUnicode_2BYTE_KIND:
804        return "UCS2";
805    case PyUnicode_4BYTE_KIND:
806        return "UCS4";
807    default:
808        return "<invalid compact kind>";
809    }
810}
811
812#ifdef Py_DEBUG
/* Debug-build counter: number of calls to PyUnicode_New() that went past
   the empty-string shortcut. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Function form of the PyUnicode_DATA() macro.  Before returning the data
   pointer it prints the object address, its compact flags and the candidate
   data addresses to stdout. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
832
833void
834_PyUnicode_Dump(PyObject *op)
835{
836    PyASCIIObject *ascii = (PyASCIIObject *)op;
837    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
838    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
839    void *data;
840
841    if (ascii->state.compact)
842    {
843        if (ascii->state.ascii)
844            data = (ascii + 1);
845        else
846            data = (compact + 1);
847    }
848    else
849        data = unicode->data.any;
850    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
851
852    if (ascii->wstr == data)
853        printf("shared ");
854    printf("wstr=%p", ascii->wstr);
855
856    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
857        printf(" (%zu), ", compact->wstr_length);
858        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
859            printf("shared ");
860        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
861    }
862    printf(", data=%p\n", data);
863}
864#endif
865
866PyObject *
867PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
868{
869    PyObject *obj;
870    PyCompactUnicodeObject *unicode;
871    void *data;
872    int kind_state;
873    int is_sharing, is_ascii;
874    Py_ssize_t char_size;
875    Py_ssize_t struct_size;
876
877    /* Optimization for empty strings */
878    if (size == 0 && unicode_empty != NULL) {
879        Py_INCREF(unicode_empty);
880        return unicode_empty;
881    }
882
883#ifdef Py_DEBUG
884    ++unicode_new_new_calls;
885#endif
886
887    is_ascii = 0;
888    is_sharing = 0;
889    struct_size = sizeof(PyCompactUnicodeObject);
890    if (maxchar < 128) {
891        kind_state = PyUnicode_1BYTE_KIND;
892        char_size = 1;
893        is_ascii = 1;
894        struct_size = sizeof(PyASCIIObject);
895    }
896    else if (maxchar < 256) {
897        kind_state = PyUnicode_1BYTE_KIND;
898        char_size = 1;
899    }
900    else if (maxchar < 65536) {
901        kind_state = PyUnicode_2BYTE_KIND;
902        char_size = 2;
903        if (sizeof(wchar_t) == 2)
904            is_sharing = 1;
905    }
906    else {
907        kind_state = PyUnicode_4BYTE_KIND;
908        char_size = 4;
909        if (sizeof(wchar_t) == 4)
910            is_sharing = 1;
911    }
912
913    /* Ensure we won't overflow the size. */
914    if (size < 0) {
915        PyErr_SetString(PyExc_SystemError,
916                        "Negative size passed to PyUnicode_New");
917        return NULL;
918    }
919    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
920        return PyErr_NoMemory();
921
922    /* Duplicated allocation code from _PyObject_New() instead of a call to
923     * PyObject_New() so we are able to allocate space for the object and
924     * it's data buffer.
925     */
926    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
927    if (obj == NULL)
928        return PyErr_NoMemory();
929    obj = PyObject_INIT(obj, &PyUnicode_Type);
930    if (obj == NULL)
931        return NULL;
932
933    unicode = (PyCompactUnicodeObject *)obj;
934    if (is_ascii)
935        data = ((PyASCIIObject*)obj) + 1;
936    else
937        data = unicode + 1;
938    _PyUnicode_LENGTH(unicode) = size;
939    _PyUnicode_HASH(unicode) = -1;
940    _PyUnicode_STATE(unicode).interned = 0;
941    _PyUnicode_STATE(unicode).kind = kind_state;
942    _PyUnicode_STATE(unicode).compact = 1;
943    _PyUnicode_STATE(unicode).ready = 1;
944    _PyUnicode_STATE(unicode).ascii = is_ascii;
945    if (is_ascii) {
946        ((char*)data)[size] = 0;
947        _PyUnicode_WSTR(unicode) = NULL;
948    }
949    else if (kind_state == PyUnicode_1BYTE_KIND) {
950        ((char*)data)[size] = 0;
951        _PyUnicode_WSTR(unicode) = NULL;
952        _PyUnicode_WSTR_LENGTH(unicode) = 0;
953        unicode->utf8 = NULL;
954        unicode->utf8_length = 0;
955        }
956    else {
957        unicode->utf8 = NULL;
958        unicode->utf8_length = 0;
959        if (kind_state == PyUnicode_2BYTE_KIND)
960            ((Py_UCS2*)data)[size] = 0;
961        else /* kind_state == PyUnicode_4BYTE_KIND */
962            ((Py_UCS4*)data)[size] = 0;
963        if (is_sharing) {
964            _PyUnicode_WSTR_LENGTH(unicode) = size;
965            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
966        }
967        else {
968            _PyUnicode_WSTR_LENGTH(unicode) = 0;
969            _PyUnicode_WSTR(unicode) = NULL;
970        }
971    }
972    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
973    return obj;
974}
975
976#if SIZEOF_WCHAR_T == 2
977/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
978   will decode surrogate pairs, the other conversions are implemented as macros
979   for efficiency.
980
981   This function assumes that unicode can hold one more code point than wstr
982   characters for a terminating null character. */
983static void
984unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
985                              PyObject *unicode)
986{
987    const wchar_t *iter;
988    Py_UCS4 *ucs4_out;
989
990    assert(unicode != NULL);
991    assert(_PyUnicode_CHECK(unicode));
992    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
993    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
994
995    for (iter = begin; iter < end; ) {
996        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
997                           _PyUnicode_GET_LENGTH(unicode)));
998        if (*iter >= 0xD800 && *iter <= 0xDBFF
999            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1000        {
1001            *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
1002            iter += 2;
1003        }
1004        else {
1005            *ucs4_out++ = *iter;
1006            iter++;
1007        }
1008    }
1009    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1010                        _PyUnicode_GET_LENGTH(unicode)));
1011
1012}
1013#endif
1014
1015static int
1016_PyUnicode_Dirty(PyObject *unicode)
1017{
1018    assert(_PyUnicode_CHECK(unicode));
1019    if (Py_REFCNT(unicode) != 1) {
1020        PyErr_SetString(PyExc_SystemError,
1021                        "Cannot modify a string having more than 1 reference");
1022        return -1;
1023    }
1024    _PyUnicode_DIRTY(unicode);
1025    return 0;
1026}
1027
1028static int
1029_copy_characters(PyObject *to, Py_ssize_t to_start,
1030                 PyObject *from, Py_ssize_t from_start,
1031                 Py_ssize_t how_many, int check_maxchar)
1032{
1033    unsigned int from_kind, to_kind;
1034    void *from_data, *to_data;
1035    int fast;
1036
1037    assert(PyUnicode_Check(from));
1038    assert(PyUnicode_Check(to));
1039    assert(PyUnicode_IS_READY(from));
1040    assert(PyUnicode_IS_READY(to));
1041
1042    assert(PyUnicode_GET_LENGTH(from) >= how_many);
1043    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1044    assert(0 <= how_many);
1045
1046    if (how_many == 0)
1047        return 0;
1048
1049    from_kind = PyUnicode_KIND(from);
1050    from_data = PyUnicode_DATA(from);
1051    to_kind = PyUnicode_KIND(to);
1052    to_data = PyUnicode_DATA(to);
1053
1054#ifdef Py_DEBUG
1055    if (!check_maxchar
1056        && (from_kind > to_kind
1057            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
1058    {
1059        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1060        Py_UCS4 ch;
1061        Py_ssize_t i;
1062        for (i=0; i < how_many; i++) {
1063            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1064            assert(ch <= to_maxchar);
1065        }
1066    }
1067#endif
1068    fast = (from_kind == to_kind);
1069    if (check_maxchar
1070        && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1071    {
1072        /* deny latin1 => ascii */
1073        fast = 0;
1074    }
1075
1076    if (fast) {
1077        Py_MEMCPY((char*)to_data + to_kind * to_start,
1078                  (char*)from_data + from_kind * from_start,
1079                  to_kind * how_many);
1080    }
1081    else if (from_kind == PyUnicode_1BYTE_KIND
1082             && to_kind == PyUnicode_2BYTE_KIND)
1083    {
1084        _PyUnicode_CONVERT_BYTES(
1085            Py_UCS1, Py_UCS2,
1086            PyUnicode_1BYTE_DATA(from) + from_start,
1087            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1088            PyUnicode_2BYTE_DATA(to) + to_start
1089            );
1090    }
1091    else if (from_kind == PyUnicode_1BYTE_KIND
1092             && to_kind == PyUnicode_4BYTE_KIND)
1093    {
1094        _PyUnicode_CONVERT_BYTES(
1095            Py_UCS1, Py_UCS4,
1096            PyUnicode_1BYTE_DATA(from) + from_start,
1097            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1098            PyUnicode_4BYTE_DATA(to) + to_start
1099            );
1100    }
1101    else if (from_kind == PyUnicode_2BYTE_KIND
1102             && to_kind == PyUnicode_4BYTE_KIND)
1103    {
1104        _PyUnicode_CONVERT_BYTES(
1105            Py_UCS2, Py_UCS4,
1106            PyUnicode_2BYTE_DATA(from) + from_start,
1107            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1108            PyUnicode_4BYTE_DATA(to) + to_start
1109            );
1110    }
1111    else {
1112        /* check if max_char(from substring) <= max_char(to) */
1113        if (from_kind > to_kind
1114                /* latin1 => ascii */
1115            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1116        {
1117            /* slow path to check for character overflow */
1118            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1119            Py_UCS4 ch;
1120            Py_ssize_t i;
1121
1122#ifdef Py_DEBUG
1123            for (i=0; i < how_many; i++) {
1124                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1125                assert(ch <= to_maxchar);
1126                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1127            }
1128#else
1129            if (!check_maxchar) {
1130                for (i=0; i < how_many; i++) {
1131                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1132                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1133                }
1134            }
1135            else {
1136                for (i=0; i < how_many; i++) {
1137                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1138                    if (ch > to_maxchar)
1139                        return 1;
1140                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1141                }
1142            }
1143#endif
1144        }
1145        else {
1146            assert(0 && "inconsistent state");
1147            return 1;
1148        }
1149    }
1150    return 0;
1151}
1152
1153static void
1154copy_characters(PyObject *to, Py_ssize_t to_start,
1155                       PyObject *from, Py_ssize_t from_start,
1156                       Py_ssize_t how_many)
1157{
1158    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1159}
1160
1161Py_ssize_t
1162PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1163                         PyObject *from, Py_ssize_t from_start,
1164                         Py_ssize_t how_many)
1165{
1166    int err;
1167
1168    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1169        PyErr_BadInternalCall();
1170        return -1;
1171    }
1172
1173    if (PyUnicode_READY(from))
1174        return -1;
1175    if (PyUnicode_READY(to))
1176        return -1;
1177
1178    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1179    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1180        PyErr_Format(PyExc_SystemError,
1181                     "Cannot write %zi characters at %zi "
1182                     "in a string of %zi characters",
1183                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1184        return -1;
1185    }
1186
1187    if (how_many == 0)
1188        return 0;
1189
1190    if (_PyUnicode_Dirty(to))
1191        return -1;
1192
1193    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1194    if (err) {
1195        PyErr_Format(PyExc_SystemError,
1196                     "Cannot copy %s characters "
1197                     "into a string of %s characters",
1198                     unicode_kind_name(from),
1199                     unicode_kind_name(to));
1200        return -1;
1201    }
1202    return how_many;
1203}
1204
1205/* Find the maximum code point and count the number of surrogate pairs so a
1206   correct string length can be computed before converting a string to UCS4.
1207   This function counts single surrogates as a character and not as a pair.
1208
1209   Return 0 on success, or -1 on error. */
1210static int
1211find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1212                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1213{
1214    const wchar_t *iter;
1215
1216    assert(num_surrogates != NULL && maxchar != NULL);
1217    *num_surrogates = 0;
1218    *maxchar = 0;
1219
1220    for (iter = begin; iter < end; ) {
1221        if (*iter > *maxchar) {
1222            *maxchar = *iter;
1223#if SIZEOF_WCHAR_T != 2
1224            if (*maxchar >= 0x10000)
1225                return 0;
1226#endif
1227        }
1228#if SIZEOF_WCHAR_T == 2
1229        if (*iter >= 0xD800 && *iter <= 0xDBFF
1230            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1231        {
1232            Py_UCS4 surrogate_val;
1233            surrogate_val = (((iter[0] & 0x3FF)<<10)
1234                             | (iter[1] & 0x3FF)) + 0x10000;
1235            ++(*num_surrogates);
1236            if (surrogate_val > *maxchar)
1237                *maxchar = surrogate_val;
1238            iter += 2;
1239        }
1240        else
1241            iter++;
1242#else
1243        iter++;
1244#endif
1245    }
1246    return 0;
1247}
1248
1249#ifdef Py_DEBUG
1250static int unicode_ready_calls = 0;
1251#endif
1252
1253static int
1254unicode_ready(PyObject **p_obj, int replace)
1255{
1256    PyObject *unicode;
1257    wchar_t *end;
1258    Py_UCS4 maxchar = 0;
1259    Py_ssize_t num_surrogates;
1260#if SIZEOF_WCHAR_T == 2
1261    Py_ssize_t length_wo_surrogates;
1262#endif
1263
1264    assert(p_obj != NULL);
1265    unicode = *p_obj;
1266
1267    /* _PyUnicode_Ready() is only intended for old-style API usage where
1268       strings were created using _PyObject_New() and where no canonical
1269       representation (the str field) has been set yet aka strings
1270       which are not yet ready. */
1271    assert(_PyUnicode_CHECK(unicode));
1272    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1273    assert(_PyUnicode_WSTR(unicode) != NULL);
1274    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1275    assert(_PyUnicode_UTF8(unicode) == NULL);
1276    /* Actually, it should neither be interned nor be anything else: */
1277    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1278
1279#ifdef Py_DEBUG
1280    ++unicode_ready_calls;
1281#endif
1282
1283#ifdef Py_DEBUG
1284    assert(!replace || Py_REFCNT(unicode) == 1);
1285#else
1286    if (replace && Py_REFCNT(unicode) != 1)
1287        replace = 0;
1288#endif
1289    if (replace) {
1290        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1291        wchar_t *wstr = _PyUnicode_WSTR(unicode);
1292        /* Optimization for empty strings */
1293        if (len == 0) {
1294            Py_INCREF(unicode_empty);
1295            Py_DECREF(*p_obj);
1296            *p_obj = unicode_empty;
1297            return 0;
1298        }
1299        if (len == 1 && wstr[0] < 256) {
1300            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1301            if (latin1_char == NULL)
1302                return -1;
1303            Py_DECREF(*p_obj);
1304            *p_obj = latin1_char;
1305            return 0;
1306        }
1307    }
1308
1309    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1310    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1311                                &maxchar, &num_surrogates) == -1)
1312        return -1;
1313
1314    if (maxchar < 256) {
1315        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1316        if (!_PyUnicode_DATA_ANY(unicode)) {
1317            PyErr_NoMemory();
1318            return -1;
1319        }
1320        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1321                                _PyUnicode_WSTR(unicode), end,
1322                                PyUnicode_1BYTE_DATA(unicode));
1323        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1324        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1325        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1326        if (maxchar < 128) {
1327            _PyUnicode_STATE(unicode).ascii = 1;
1328            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1329            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1330        }
1331        else {
1332            _PyUnicode_STATE(unicode).ascii = 0;
1333            _PyUnicode_UTF8(unicode) = NULL;
1334            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1335        }
1336        PyObject_FREE(_PyUnicode_WSTR(unicode));
1337        _PyUnicode_WSTR(unicode) = NULL;
1338        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339    }
1340    /* In this case we might have to convert down from 4-byte native
1341       wchar_t to 2-byte unicode. */
1342    else if (maxchar < 65536) {
1343        assert(num_surrogates == 0 &&
1344               "FindMaxCharAndNumSurrogatePairs() messed up");
1345
1346#if SIZEOF_WCHAR_T == 2
1347        /* We can share representations and are done. */
1348        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1349        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1350        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1351        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1352        _PyUnicode_UTF8(unicode) = NULL;
1353        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1354#else
1355        /* sizeof(wchar_t) == 4 */
1356        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1357            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1358        if (!_PyUnicode_DATA_ANY(unicode)) {
1359            PyErr_NoMemory();
1360            return -1;
1361        }
1362        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1363                                _PyUnicode_WSTR(unicode), end,
1364                                PyUnicode_2BYTE_DATA(unicode));
1365        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1366        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1367        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1368        _PyUnicode_UTF8(unicode) = NULL;
1369        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1370        PyObject_FREE(_PyUnicode_WSTR(unicode));
1371        _PyUnicode_WSTR(unicode) = NULL;
1372        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1373#endif
1374    }
1375    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1376    else {
1377#if SIZEOF_WCHAR_T == 2
1378        /* in case the native representation is 2-bytes, we need to allocate a
1379           new normalized 4-byte version. */
1380        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1381        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1382        if (!_PyUnicode_DATA_ANY(unicode)) {
1383            PyErr_NoMemory();
1384            return -1;
1385        }
1386        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1387        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1388        _PyUnicode_UTF8(unicode) = NULL;
1389        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1390        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1391        _PyUnicode_STATE(unicode).ready = 1;
1392        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1393        PyObject_FREE(_PyUnicode_WSTR(unicode));
1394        _PyUnicode_WSTR(unicode) = NULL;
1395        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1396#else
1397        assert(num_surrogates == 0);
1398
1399        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1400        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1401        _PyUnicode_UTF8(unicode) = NULL;
1402        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1403        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1404#endif
1405        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1406    }
1407    _PyUnicode_STATE(unicode).ready = 1;
1408    assert(_PyUnicode_CheckConsistency(unicode, 1));
1409    return 0;
1410}
1411
1412int
1413_PyUnicode_ReadyReplace(PyObject **op)
1414{
1415    return unicode_ready(op, 1);
1416}
1417
1418int
1419_PyUnicode_Ready(PyObject *op)
1420{
1421    return unicode_ready(&op, 0);
1422}
1423
1424static void
1425unicode_dealloc(register PyObject *unicode)
1426{
1427    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1428    case SSTATE_NOT_INTERNED:
1429        break;
1430
1431    case SSTATE_INTERNED_MORTAL:
1432        /* revive dead object temporarily for DelItem */
1433        Py_REFCNT(unicode) = 3;
1434        if (PyDict_DelItem(interned, unicode) != 0)
1435            Py_FatalError(
1436                "deletion of interned string failed");
1437        break;
1438
1439    case SSTATE_INTERNED_IMMORTAL:
1440        Py_FatalError("Immortal interned string died.");
1441
1442    default:
1443        Py_FatalError("Inconsistent interned string state.");
1444    }
1445
1446    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1447        PyObject_DEL(_PyUnicode_WSTR(unicode));
1448    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1449        PyObject_DEL(_PyUnicode_UTF8(unicode));
1450
1451    if (PyUnicode_IS_COMPACT(unicode)) {
1452        Py_TYPE(unicode)->tp_free(unicode);
1453    }
1454    else {
1455        if (_PyUnicode_DATA_ANY(unicode))
1456            PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1457        Py_TYPE(unicode)->tp_free(unicode);
1458    }
1459}
1460
1461#ifdef Py_DEBUG
1462static int
1463unicode_is_singleton(PyObject *unicode)
1464{
1465    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1466    if (unicode == unicode_empty)
1467        return 1;
1468    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1469    {
1470        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1471        if (ch < 256 && unicode_latin1[ch] == unicode)
1472            return 1;
1473    }
1474    return 0;
1475}
1476#endif
1477
1478static int
1479unicode_resizable(PyObject *unicode)
1480{
1481    if (Py_REFCNT(unicode) != 1)
1482        return 0;
1483    if (PyUnicode_CHECK_INTERNED(unicode))
1484        return 0;
1485#ifdef Py_DEBUG
1486    /* singleton refcount is greater than 1 */
1487    assert(!unicode_is_singleton(unicode));
1488#endif
1489    return 1;
1490}
1491
1492static int
1493unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1494{
1495    PyObject *unicode;
1496    Py_ssize_t old_length;
1497
1498    assert(p_unicode != NULL);
1499    unicode = *p_unicode;
1500
1501    assert(unicode != NULL);
1502    assert(PyUnicode_Check(unicode));
1503    assert(0 <= length);
1504
1505    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1506        old_length = PyUnicode_WSTR_LENGTH(unicode);
1507    else
1508        old_length = PyUnicode_GET_LENGTH(unicode);
1509    if (old_length == length)
1510        return 0;
1511
1512    if (!unicode_resizable(unicode)) {
1513        PyObject *copy = resize_copy(unicode, length);
1514        if (copy == NULL)
1515            return -1;
1516        Py_DECREF(*p_unicode);
1517        *p_unicode = copy;
1518        return 0;
1519    }
1520
1521    if (PyUnicode_IS_COMPACT(unicode)) {
1522        *p_unicode = resize_compact(unicode, length);
1523        if (*p_unicode == NULL)
1524            return -1;
1525        assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
1526        return 0;
1527    }
1528    return resize_inplace(unicode, length);
1529}
1530
1531int
1532PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1533{
1534    PyObject *unicode;
1535    if (p_unicode == NULL) {
1536        PyErr_BadInternalCall();
1537        return -1;
1538    }
1539    unicode = *p_unicode;
1540    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1541        || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1542    {
1543        PyErr_BadInternalCall();
1544        return -1;
1545    }
1546    return unicode_resize(p_unicode, length);
1547}
1548
1549static PyObject*
1550get_latin1_char(unsigned char ch)
1551{
1552    PyObject *unicode = unicode_latin1[ch];
1553    if (!unicode) {
1554        unicode = PyUnicode_New(1, ch);
1555        if (!unicode)
1556            return NULL;
1557        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1558        assert(_PyUnicode_CheckConsistency(unicode, 1));
1559        unicode_latin1[ch] = unicode;
1560    }
1561    Py_INCREF(unicode);
1562    return unicode;
1563}
1564
1565PyObject *
1566PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1567{
1568    PyObject *unicode;
1569    Py_UCS4 maxchar = 0;
1570    Py_ssize_t num_surrogates;
1571
1572    if (u == NULL)
1573        return (PyObject*)_PyUnicode_New(size);
1574
1575    /* If the Unicode data is known at construction time, we can apply
1576       some optimizations which share commonly used objects. */
1577
1578    /* Optimization for empty strings */
1579    if (size == 0 && unicode_empty != NULL) {
1580        Py_INCREF(unicode_empty);
1581        return unicode_empty;
1582    }
1583
1584    /* Single character Unicode objects in the Latin-1 range are
1585       shared when using this constructor */
1586    if (size == 1 && *u < 256)
1587        return get_latin1_char((unsigned char)*u);
1588
1589    /* If not empty and not single character, copy the Unicode data
1590       into the new object */
1591    if (find_maxchar_surrogates(u, u + size,
1592                                &maxchar, &num_surrogates) == -1)
1593        return NULL;
1594
1595    unicode = PyUnicode_New(size - num_surrogates,
1596                                                maxchar);
1597    if (!unicode)
1598        return NULL;
1599
1600    switch (PyUnicode_KIND(unicode)) {
1601    case PyUnicode_1BYTE_KIND:
1602        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1603                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1604        break;
1605    case PyUnicode_2BYTE_KIND:
1606#if Py_UNICODE_SIZE == 2
1607        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1608#else
1609        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1610                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1611#endif
1612        break;
1613    case PyUnicode_4BYTE_KIND:
1614#if SIZEOF_WCHAR_T == 2
1615        /* This is the only case which has to process surrogates, thus
1616           a simple copy loop is not enough and we need a function. */
1617        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1618#else
1619        assert(num_surrogates == 0);
1620        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1621#endif
1622        break;
1623    default:
1624        assert(0 && "Impossible state");
1625    }
1626
1627    assert(_PyUnicode_CheckConsistency(unicode, 1));
1628    return unicode;
1629}
1630
1631PyObject *
1632PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1633{
1634    if (size < 0) {
1635        PyErr_SetString(PyExc_SystemError,
1636                        "Negative size passed to PyUnicode_FromStringAndSize");
1637        return NULL;
1638    }
1639
1640    /* If the Unicode data is known at construction time, we can apply
1641       some optimizations which share commonly used objects.
1642       Also, this means the input must be UTF-8, so fall back to the
1643       UTF-8 decoder at the end. */
1644    if (u != NULL) {
1645
1646        /* Optimization for empty strings */
1647        if (size == 0 && unicode_empty != NULL) {
1648            Py_INCREF(unicode_empty);
1649            return unicode_empty;
1650        }
1651
1652        /* Single characters are shared when using this constructor.
1653           Restrict to ASCII, since the input must be UTF-8. */
1654        if (size == 1 && (unsigned char)*u < 128)
1655            return get_latin1_char((unsigned char)*u);
1656
1657        return PyUnicode_DecodeUTF8(u, size, NULL);
1658    }
1659
1660    return (PyObject *)_PyUnicode_New(size);
1661}
1662
1663PyObject *
1664PyUnicode_FromString(const char *u)
1665{
1666    size_t size = strlen(u);
1667    if (size > PY_SSIZE_T_MAX) {
1668        PyErr_SetString(PyExc_OverflowError, "input too long");
1669        return NULL;
1670    }
1671
1672    return PyUnicode_FromStringAndSize(u, size);
1673}
1674
1675PyObject *
1676_PyUnicode_FromId(_Py_Identifier *id)
1677{
1678    if (!id->object) {
1679        id->object = PyUnicode_FromString(id->string);
1680        if (!id->object)
1681            return NULL;
1682        PyUnicode_InternInPlace(&id->object);
1683        assert(!id->next);
1684        id->next = static_strings;
1685        static_strings = id;
1686    }
1687    Py_INCREF(id->object);
1688    return id->object;
1689}
1690
1691void
1692_PyUnicode_ClearStaticStrings()
1693{
1694    _Py_Identifier *i;
1695    for (i = static_strings; i; i = i->next) {
1696        Py_DECREF(i->object);
1697        i->object = NULL;
1698        i->next = NULL;
1699    }
1700}
1701
1702static PyObject*
1703unicode_fromascii(const unsigned char* s, Py_ssize_t size)
1704{
1705    PyObject *res;
1706#ifdef Py_DEBUG
1707    const unsigned char *p;
1708    const unsigned char *end = s + size;
1709    for (p=s; p < end; p++) {
1710        assert(*p < 128);
1711    }
1712#endif
1713    if (size == 1)
1714        return get_latin1_char(s[0]);
1715    res = PyUnicode_New(size, 127);
1716    if (!res)
1717        return NULL;
1718    memcpy(PyUnicode_1BYTE_DATA(res), s, size);
1719    return res;
1720}
1721
1722static Py_UCS4
1723kind_maxchar_limit(unsigned int kind)
1724{
1725    switch(kind) {
1726    case PyUnicode_1BYTE_KIND:
1727        return 0x80;
1728    case PyUnicode_2BYTE_KIND:
1729        return 0x100;
1730    case PyUnicode_4BYTE_KIND:
1731        return 0x10000;
1732    default:
1733        assert(0 && "invalid kind");
1734        return 0x10ffff;
1735    }
1736}
1737
1738static PyObject*
1739_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1740{
1741    PyObject *res;
1742    unsigned char max_char = 127;
1743
1744    assert(size >= 0);
1745    if (size == 1)
1746        return get_latin1_char(u[0]);
1747    max_char = ucs1lib_find_max_char(u, u + size);
1748    res = PyUnicode_New(size, max_char);
1749    if (!res)
1750        return NULL;
1751    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1752    assert(_PyUnicode_CheckConsistency(res, 1));
1753    return res;
1754}
1755
1756static PyObject*
1757_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1758{
1759    PyObject *res;
1760    Py_UCS2 max_char = 0;
1761
1762    assert(size >= 0);
1763    if (size == 1 && u[0] < 256)
1764        return get_latin1_char((unsigned char)u[0]);
1765    max_char = ucs2lib_find_max_char(u, u + size);
1766    res = PyUnicode_New(size, max_char);
1767    if (!res)
1768        return NULL;
1769    if (max_char >= 256)
1770        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1771    else {
1772        _PyUnicode_CONVERT_BYTES(
1773            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1774    }
1775    assert(_PyUnicode_CheckConsistency(res, 1));
1776    return res;
1777}
1778
1779static PyObject*
1780_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1781{
1782    PyObject *res;
1783    Py_UCS4 max_char = 0;
1784
1785    assert(size >= 0);
1786    if (size == 1 && u[0] < 256)
1787        return get_latin1_char(u[0]);
1788    max_char = ucs4lib_find_max_char(u, u + size);
1789    res = PyUnicode_New(size, max_char);
1790    if (!res)
1791        return NULL;
1792    if (max_char < 256)
1793        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1794                                 PyUnicode_1BYTE_DATA(res));
1795    else if (max_char < 0x10000)
1796        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1797                                 PyUnicode_2BYTE_DATA(res));
1798    else
1799        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1800    assert(_PyUnicode_CheckConsistency(res, 1));
1801    return res;
1802}
1803
1804PyObject*
1805PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1806{
1807    switch(kind) {
1808    case PyUnicode_1BYTE_KIND:
1809        return _PyUnicode_FromUCS1(buffer, size);
1810    case PyUnicode_2BYTE_KIND:
1811        return _PyUnicode_FromUCS2(buffer, size);
1812    case PyUnicode_4BYTE_KIND:
1813        return _PyUnicode_FromUCS4(buffer, size);
1814    default:
1815        assert(0 && "invalid kind");
1816        PyErr_SetString(PyExc_SystemError, "invalid kind");
1817        return NULL;
1818    }
1819}
1820
1821/* Ensure that a string uses the most efficient storage, if it is not the
1822   case: create a new string with of the right kind. Write NULL into *p_unicode
1823   on error. */
1824static void
1825unicode_adjust_maxchar(PyObject **p_unicode)
1826{
1827    PyObject *unicode, *copy;
1828    Py_UCS4 max_char;
1829    Py_ssize_t len;
1830    unsigned int kind;
1831
1832    assert(p_unicode != NULL);
1833    unicode = *p_unicode;
1834    assert(PyUnicode_IS_READY(unicode));
1835    if (PyUnicode_IS_ASCII(unicode))
1836        return;
1837
1838    len = PyUnicode_GET_LENGTH(unicode);
1839    kind = PyUnicode_KIND(unicode);
1840    if (kind == PyUnicode_1BYTE_KIND) {
1841        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
1842        max_char = ucs1lib_find_max_char(u, u + len);
1843        if (max_char >= 128)
1844            return;
1845    }
1846    else if (kind == PyUnicode_2BYTE_KIND) {
1847        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
1848        max_char = ucs2lib_find_max_char(u, u + len);
1849        if (max_char >= 256)
1850            return;
1851    }
1852    else {
1853        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
1854        assert(kind == PyUnicode_4BYTE_KIND);
1855        max_char = ucs4lib_find_max_char(u, u + len);
1856        if (max_char >= 0x10000)
1857            return;
1858    }
1859    copy = PyUnicode_New(len, max_char);
1860    copy_characters(copy, 0, unicode, 0, len);
1861    Py_DECREF(unicode);
1862    *p_unicode = copy;
1863}
1864
1865PyObject*
1866PyUnicode_Copy(PyObject *unicode)
1867{
1868    Py_ssize_t size;
1869    PyObject *copy;
1870    void *data;
1871
1872    if (!PyUnicode_Check(unicode)) {
1873        PyErr_BadInternalCall();
1874        return NULL;
1875    }
1876    if (PyUnicode_READY(unicode))
1877        return NULL;
1878
1879    size = PyUnicode_GET_LENGTH(unicode);
1880    copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1881    if (!copy)
1882        return NULL;
1883    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1884
1885    data = PyUnicode_DATA(unicode);
1886    switch (PyUnicode_KIND(unicode))
1887    {
1888    case PyUnicode_1BYTE_KIND:
1889        memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1890        break;
1891    case PyUnicode_2BYTE_KIND:
1892        memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1893        break;
1894    case PyUnicode_4BYTE_KIND:
1895        memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1896        break;
1897    default:
1898        assert(0);
1899        break;
1900    }
1901    assert(_PyUnicode_CheckConsistency(copy, 1));
1902    return copy;
1903}
1904
1905
1906/* Widen Unicode objects to larger buffers. Don't write terminating null
1907   character. Return NULL on error. */
1908
1909void*
1910_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1911{
1912    Py_ssize_t len;
1913    void *result;
1914    unsigned int skind;
1915
1916    if (PyUnicode_READY(s))
1917        return NULL;
1918
1919    len = PyUnicode_GET_LENGTH(s);
1920    skind = PyUnicode_KIND(s);
1921    if (skind >= kind) {
1922        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
1923        return NULL;
1924    }
1925    switch(kind) {
1926    case PyUnicode_2BYTE_KIND:
1927        result = PyMem_Malloc(len * sizeof(Py_UCS2));
1928        if (!result)
1929            return PyErr_NoMemory();
1930        assert(skind == PyUnicode_1BYTE_KIND);
1931        _PyUnicode_CONVERT_BYTES(
1932            Py_UCS1, Py_UCS2,
1933            PyUnicode_1BYTE_DATA(s),
1934            PyUnicode_1BYTE_DATA(s) + len,
1935            result);
1936        return result;
1937    case PyUnicode_4BYTE_KIND:
1938        result = PyMem_Malloc(len * sizeof(Py_UCS4));
1939        if (!result)
1940            return PyErr_NoMemory();
1941        if (skind == PyUnicode_2BYTE_KIND) {
1942            _PyUnicode_CONVERT_BYTES(
1943                Py_UCS2, Py_UCS4,
1944                PyUnicode_2BYTE_DATA(s),
1945                PyUnicode_2BYTE_DATA(s) + len,
1946                result);
1947        }
1948        else {
1949            assert(skind == PyUnicode_1BYTE_KIND);
1950            _PyUnicode_CONVERT_BYTES(
1951                Py_UCS1, Py_UCS4,
1952                PyUnicode_1BYTE_DATA(s),
1953                PyUnicode_1BYTE_DATA(s) + len,
1954                result);
1955        }
1956        return result;
1957    default:
1958        break;
1959    }
1960    PyErr_SetString(PyExc_SystemError, "invalid kind");
1961    return NULL;
1962}
1963
1964static Py_UCS4*
1965as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1966        int copy_null)
1967{
1968    int kind;
1969    void *data;
1970    Py_ssize_t len, targetlen;
1971    if (PyUnicode_READY(string) == -1)
1972        return NULL;
1973    kind = PyUnicode_KIND(string);
1974    data = PyUnicode_DATA(string);
1975    len = PyUnicode_GET_LENGTH(string);
1976    targetlen = len;
1977    if (copy_null)
1978        targetlen++;
1979    if (!target) {
1980        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1981            PyErr_NoMemory();
1982            return NULL;
1983        }
1984        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1985        if (!target) {
1986            PyErr_NoMemory();
1987            return NULL;
1988        }
1989    }
1990    else {
1991        if (targetsize < targetlen) {
1992            PyErr_Format(PyExc_SystemError,
1993                         "string is longer than the buffer");
1994            if (copy_null && 0 < targetsize)
1995                target[0] = 0;
1996            return NULL;
1997        }
1998    }
1999    if (kind == PyUnicode_1BYTE_KIND) {
2000        Py_UCS1 *start = (Py_UCS1 *) data;
2001        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2002    }
2003    else if (kind == PyUnicode_2BYTE_KIND) {
2004        Py_UCS2 *start = (Py_UCS2 *) data;
2005        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2006    }
2007    else {
2008        assert(kind == PyUnicode_4BYTE_KIND);
2009        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2010    }
2011    if (copy_null)
2012        target[len] = 0;
2013    return target;
2014}
2015
2016Py_UCS4*
2017PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2018                 int copy_null)
2019{
2020    if (target == NULL || targetsize < 1) {
2021        PyErr_BadInternalCall();
2022        return NULL;
2023    }
2024    return as_ucs4(string, target, targetsize, copy_null);
2025}
2026
2027Py_UCS4*
2028PyUnicode_AsUCS4Copy(PyObject *string)
2029{
2030    return as_ucs4(string, NULL, 0, 1);
2031}
2032
2033#ifdef HAVE_WCHAR_H
2034
2035PyObject *
2036PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2037{
2038    if (w == NULL) {
2039        if (size == 0)
2040            return PyUnicode_New(0, 0);
2041        PyErr_BadInternalCall();
2042        return NULL;
2043    }
2044
2045    if (size == -1) {
2046        size = wcslen(w);
2047    }
2048
2049    return PyUnicode_FromUnicode(w, size);
2050}
2051
2052#endif /* HAVE_WCHAR_H */
2053
2054static void
2055makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2056        int zeropad, int width, int precision, char c)
2057{
2058    *fmt++ = '%';
2059    if (width) {
2060        if (zeropad)
2061            *fmt++ = '0';
2062        fmt += sprintf(fmt, "%d", width);
2063    }
2064    if (precision)
2065        fmt += sprintf(fmt, ".%d", precision);
2066    if (longflag)
2067        *fmt++ = 'l';
2068    else if (longlongflag) {
2069        /* longlongflag should only ever be nonzero on machines with
2070           HAVE_LONG_LONG defined */
2071#ifdef HAVE_LONG_LONG
2072        char *f = PY_FORMAT_LONG_LONG;
2073        while (*f)
2074            *fmt++ = *f++;
2075#else
2076        /* we shouldn't ever get here */
2077        assert(0);
2078        *fmt++ = 'l';
2079#endif
2080    }
2081    else if (size_tflag) {
2082        char *f = PY_FORMAT_SIZE_T;
2083        while (*f)
2084            *fmt++ = *f++;
2085    }
2086    *fmt++ = c;
2087    *fmt = '\0';
2088}
2089
/* helper for PyUnicode_FromFormatV()

   `f` points to the '%' starting a format specification.  Parse the
   optional width, the optional ".precision" and the optional length
   modifier ("l", "ll" or "z"; only recognized when directly followed by
   'd', 'u' or 'i'), and return a pointer to the character following them,
   normally the conversion character.

   Each output pointer may be NULL if the caller is not interested in the
   value.  Missing width and precision are reported as 0. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%', then parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    /* report the values the caller asked for */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2154
2155/* maximum number of characters required for output of %ld.  21 characters
2156   allows for 64-bit integers (in decimal) and an optional sign. */
2157#define MAX_LONG_CHARS 21
2158/* maximum number of characters required for output of %lld.
2159   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2160   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2161#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2162
2163PyObject *
2164PyUnicode_FromFormatV(const char *format, va_list vargs)
2165{
2166    va_list count;
2167    Py_ssize_t callcount = 0;
2168    PyObject **callresults = NULL;
2169    PyObject **callresult = NULL;
2170    Py_ssize_t n = 0;
2171    int width = 0;
2172    int precision = 0;
2173    int zeropad;
2174    const char* f;
2175    PyObject *string;
2176    /* used by sprintf */
2177    char fmt[61]; /* should be enough for %0width.precisionlld */
2178    Py_UCS4 maxchar = 127; /* result is ASCII by default */
2179    Py_UCS4 argmaxchar;
2180    Py_ssize_t numbersize = 0;
2181    char *numberresults = NULL;
2182    char *numberresult = NULL;
2183    Py_ssize_t i;
2184    int kind;
2185    void *data;
2186
2187    Py_VA_COPY(count, vargs);
2188    /* step 1: count the number of %S/%R/%A/%s format specifications
2189     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2190     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2191     * result in an array)
2192     * also estimate a upper bound for all the number formats in the string,
2193     * numbers will be formatted in step 3 and be kept in a '\0'-separated
2194     * buffer before putting everything together. */
2195    for (f = format; *f; f++) {
2196        if (*f == '%') {
2197            int longlongflag;
2198            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2199            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2200            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2201                ++callcount;
2202
2203            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
2204#ifdef HAVE_LONG_LONG
2205                if (longlongflag) {
2206                    if (width < MAX_LONG_LONG_CHARS)
2207                        width = MAX_LONG_LONG_CHARS;
2208                }
2209                else
2210#endif
2211                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2212                       including sign.  Decimal takes the most space.  This
2213                       isn't enough for octal.  If a width is specified we
2214                       need more (which we allocate later). */
2215                    if (width < MAX_LONG_CHARS)
2216                        width = MAX_LONG_CHARS;
2217
2218                /* account for the size + '\0' to separate numbers
2219                   inside of the numberresults buffer */
2220                numbersize += (width + 1);
2221            }
2222        }
2223        else if ((unsigned char)*f > 127) {
2224            PyErr_Format(PyExc_ValueError,
2225                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2226                "string, got a non-ASCII byte: 0x%02x",
2227                (unsigned char)*f);
2228            return NULL;
2229        }
2230    }
2231    /* step 2: allocate memory for the results of
2232     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2233    if (callcount) {
2234        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2235        if (!callresults) {
2236            PyErr_NoMemory();
2237            return NULL;
2238        }
2239        callresult = callresults;
2240    }
2241    /* step 2.5: allocate memory for the results of formating numbers */
2242    if (numbersize) {
2243        numberresults = PyObject_Malloc(numbersize);
2244        if (!numberresults) {
2245            PyErr_NoMemory();
2246            goto fail;
2247        }
2248        numberresult = numberresults;
2249    }
2250
2251    /* step 3: format numbers and figure out how large a buffer we need */
2252    for (f = format; *f; f++) {
2253        if (*f == '%') {
2254            const char* p;
2255            int longflag;
2256            int longlongflag;
2257            int size_tflag;
2258            int numprinted;
2259
2260            p = f;
2261            zeropad = (f[1] == '0');
2262            f = parse_format_flags(f, &width, &precision,
2263                                   &longflag, &longlongflag, &size_tflag);
2264            switch (*f) {
2265            case 'c':
2266            {
2267                Py_UCS4 ordinal = va_arg(count, int);
2268                maxchar = Py_MAX(maxchar, ordinal);
2269                n++;
2270                break;
2271            }
2272            case '%':
2273                n++;
2274                break;
2275            case 'i':
2276            case 'd':
2277                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2278                        width, precision, *f);
2279                if (longflag)
2280                    numprinted = sprintf(numberresult, fmt,
2281                                         va_arg(count, long));
2282#ifdef HAVE_LONG_LONG
2283                else if (longlongflag)
2284                    numprinted = sprintf(numberresult, fmt,
2285                                         va_arg(count, PY_LONG_LONG));
2286#endif
2287                else if (size_tflag)
2288                    numprinted = sprintf(numberresult, fmt,
2289                                         va_arg(count, Py_ssize_t));
2290                else
2291                    numprinted = sprintf(numberresult, fmt,
2292                                         va_arg(count, int));
2293                n += numprinted;
2294                /* advance by +1 to skip over the '\0' */
2295                numberresult += (numprinted + 1);
2296                assert(*(numberresult - 1) == '\0');
2297                assert(*(numberresult - 2) != '\0');
2298                assert(numprinted >= 0);
2299                assert(numberresult <= numberresults + numbersize);
2300                break;
2301            case 'u':
2302                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2303                        width, precision, 'u');
2304                if (longflag)
2305                    numprinted = sprintf(numberresult, fmt,
2306                                         va_arg(count, unsigned long));
2307#ifdef HAVE_LONG_LONG
2308                else if (longlongflag)
2309                    numprinted = sprintf(numberresult, fmt,
2310                                         va_arg(count, unsigned PY_LONG_LONG));
2311#endif
2312                else if (size_tflag)
2313                    numprinted = sprintf(numberresult, fmt,
2314                                         va_arg(count, size_t));
2315                else
2316                    numprinted = sprintf(numberresult, fmt,
2317                                         va_arg(count, unsigned int));
2318                n += numprinted;
2319                numberresult += (numprinted + 1);
2320                assert(*(numberresult - 1) == '\0');
2321                assert(*(numberresult - 2) != '\0');
2322                assert(numprinted >= 0);
2323                assert(numberresult <= numberresults + numbersize);
2324                break;
2325            case 'x':
2326                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2327                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2328                n += numprinted;
2329                numberresult += (numprinted + 1);
2330                assert(*(numberresult - 1) == '\0');
2331                assert(*(numberresult - 2) != '\0');
2332                assert(numprinted >= 0);
2333                assert(numberresult <= numberresults + numbersize);
2334                break;
2335            case 'p':
2336                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2337                /* %p is ill-defined:  ensure leading 0x. */
2338                if (numberresult[1] == 'X')
2339                    numberresult[1] = 'x';
2340                else if (numberresult[1] != 'x') {
2341                    memmove(numberresult + 2, numberresult,
2342                            strlen(numberresult) + 1);
2343                    numberresult[0] = '0';
2344                    numberresult[1] = 'x';
2345                    numprinted += 2;
2346                }
2347                n += numprinted;
2348                numberresult += (numprinted + 1);
2349                assert(*(numberresult - 1) == '\0');
2350                assert(*(numberresult - 2) != '\0');
2351                assert(numprinted >= 0);
2352                assert(numberresult <= numberresults + numbersize);
2353                break;
2354            case 's':
2355            {
2356                /* UTF-8 */
2357                const char *s = va_arg(count, const char*);
2358                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2359                if (!str)
2360                    goto fail;
2361                /* since PyUnicode_DecodeUTF8 returns already flexible
2362                   unicode objects, there is no need to call ready on them */
2363                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2364                maxchar = Py_MAX(maxchar, argmaxchar);
2365                n += PyUnicode_GET_LENGTH(str);
2366                /* Remember the str and switch to the next slot */
2367                *callresult++ = str;
2368                break;
2369            }
2370            case 'U':
2371            {
2372                PyObject *obj = va_arg(count, PyObject *);
2373                assert(obj && _PyUnicode_CHECK(obj));
2374                if (PyUnicode_READY(obj) == -1)
2375                    goto fail;
2376                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2377                maxchar = Py_MAX(maxchar, argmaxchar);
2378                n += PyUnicode_GET_LENGTH(obj);
2379                break;
2380            }
2381            case 'V':
2382            {
2383                PyObject *obj = va_arg(count, PyObject *);
2384                const char *str = va_arg(count, const char *);
2385                PyObject *str_obj;
2386                assert(obj || str);
2387                assert(!obj || _PyUnicode_CHECK(obj));
2388                if (obj) {
2389                    if (PyUnicode_READY(obj) == -1)
2390                        goto fail;
2391                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2392                    maxchar = Py_MAX(maxchar, argmaxchar);
2393                    n += PyUnicode_GET_LENGTH(obj);
2394                    *callresult++ = NULL;
2395                }
2396                else {
2397                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2398                    if (!str_obj)
2399                        goto fail;
2400                    if (PyUnicode_READY(str_obj)) {
2401                        Py_DECREF(str_obj);
2402                        goto fail;
2403                    }
2404                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2405                    maxchar = Py_MAX(maxchar, argmaxchar);
2406                    n += PyUnicode_GET_LENGTH(str_obj);
2407                    *callresult++ = str_obj;
2408                }
2409                break;
2410            }
2411            case 'S':
2412            {
2413                PyObject *obj = va_arg(count, PyObject *);
2414                PyObject *str;
2415                assert(obj);
2416                str = PyObject_Str(obj);
2417                if (!str || PyUnicode_READY(str) == -1)
2418                    goto fail;
2419                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2420                maxchar = Py_MAX(maxchar, argmaxchar);
2421                n += PyUnicode_GET_LENGTH(str);
2422                /* Remember the str and switch to the next slot */
2423                *callresult++ = str;
2424                break;
2425            }
2426            case 'R':
2427            {
2428                PyObject *obj = va_arg(count, PyObject *);
2429                PyObject *repr;
2430                assert(obj);
2431                repr = PyObject_Repr(obj);
2432                if (!repr || PyUnicode_READY(repr) == -1)
2433                    goto fail;
2434                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2435                maxchar = Py_MAX(maxchar, argmaxchar);
2436                n += PyUnicode_GET_LENGTH(repr);
2437                /* Remember the repr and switch to the next slot */
2438                *callresult++ = repr;
2439                break;
2440            }
2441            case 'A':
2442            {
2443                PyObject *obj = va_arg(count, PyObject *);
2444                PyObject *ascii;
2445                assert(obj);
2446                ascii = PyObject_ASCII(obj);
2447                if (!ascii || PyUnicode_READY(ascii) == -1)
2448                    goto fail;
2449                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2450                maxchar = Py_MAX(maxchar, argmaxchar);
2451                n += PyUnicode_GET_LENGTH(ascii);
2452                /* Remember the repr and switch to the next slot */
2453                *callresult++ = ascii;
2454                break;
2455            }
2456            default:
2457                /* if we stumble upon an unknown
2458                   formatting code, copy the rest of
2459                   the format string to the output
2460                   string. (we cannot just skip the
2461                   code, since there's no way to know
2462                   what's in the argument list) */
2463                n += strlen(p);
2464                goto expand;
2465            }
2466        } else
2467            n++;
2468    }
2469  expand:
2470    /* step 4: fill the buffer */
2471    /* Since we've analyzed how much space we need,
2472       we don't have to resize the string.
2473       There can be no errors beyond this point. */
2474    string = PyUnicode_New(n, maxchar);
2475    if (!string)
2476        goto fail;
2477    kind = PyUnicode_KIND(string);
2478    data = PyUnicode_DATA(string);
2479    callresult = callresults;
2480    numberresult = numberresults;
2481
2482    for (i = 0, f = format; *f; f++) {
2483        if (*f == '%') {
2484            const char* p;
2485
2486            p = f;
2487            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2488            /* checking for == because the last argument could be a empty
2489               string, which causes i to point to end, the assert at the end of
2490               the loop */
2491            assert(i <= PyUnicode_GET_LENGTH(string));
2492
2493            switch (*f) {
2494            case 'c':
2495            {
2496                const int ordinal = va_arg(vargs, int);
2497                PyUnicode_WRITE(kind, data, i++, ordinal);
2498                break;
2499            }
2500            case 'i':
2501            case 'd':
2502            case 'u':
2503            case 'x':
2504            case 'p':
2505                /* unused, since we already have the result */
2506                if (*f == 'p')
2507                    (void) va_arg(vargs, void *);
2508                else
2509                    (void) va_arg(vargs, int);
2510                /* extract the result from numberresults and append. */
2511                for (; *numberresult; ++i, ++numberresult)
2512                    PyUnicode_WRITE(kind, data, i, *numberresult);
2513                /* skip over the separating '\0' */
2514                assert(*numberresult == '\0');
2515                numberresult++;
2516                assert(numberresult <= numberresults + numbersize);
2517                break;
2518            case 's':
2519            {
2520                /* unused, since we already have the result */
2521                Py_ssize_t size;
2522                (void) va_arg(vargs, char *);
2523                size = PyUnicode_GET_LENGTH(*callresult);
2524                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2525                copy_characters(string, i, *callresult, 0, size);
2526                i += size;
2527                /* We're done with the unicode()/repr() => forget it */
2528                Py_DECREF(*callresult);
2529                /* switch to next unicode()/repr() result */
2530                ++callresult;
2531                break;
2532            }
2533            case 'U':
2534            {
2535                PyObject *obj = va_arg(vargs, PyObject *);
2536                Py_ssize_t size;
2537                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2538                size = PyUnicode_GET_LENGTH(obj);
2539                copy_characters(string, i, obj, 0, size);
2540                i += size;
2541                break;
2542            }
2543            case 'V':
2544            {
2545                Py_ssize_t size;
2546                PyObject *obj = va_arg(vargs, PyObject *);
2547                va_arg(vargs, const char *);
2548                if (obj) {
2549                    size = PyUnicode_GET_LENGTH(obj);
2550                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2551                    copy_characters(string, i, obj, 0, size);
2552                    i += size;
2553                } else {
2554                    size = PyUnicode_GET_LENGTH(*callresult);
2555                    assert(PyUnicode_KIND(*callresult) <=
2556                           PyUnicode_KIND(string));
2557                    copy_characters(string, i, *callresult, 0, size);
2558                    i += size;
2559                    Py_DECREF(*callresult);
2560                }
2561                ++callresult;
2562                break;
2563            }
2564            case 'S':
2565            case 'R':
2566            case 'A':
2567            {
2568                Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2569                /* unused, since we already have the result */
2570                (void) va_arg(vargs, PyObject *);
2571                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2572                copy_characters(string, i, *callresult, 0,  size);
2573                i += size;
2574                /* We're done with the unicode()/repr() => forget it */
2575                Py_DECREF(*callresult);
2576                /* switch to next unicode()/repr() result */
2577                ++callresult;
2578                break;
2579            }
2580            case '%':
2581                PyUnicode_WRITE(kind, data, i++, '%');
2582                break;
2583            default:
2584                for (; *p; ++p, ++i)
2585                    PyUnicode_WRITE(kind, data, i, *p);
2586                assert(i == PyUnicode_GET_LENGTH(string));
2587                goto end;
2588            }
2589        }
2590        else {
2591            assert(i < PyUnicode_GET_LENGTH(string));
2592            PyUnicode_WRITE(kind, data, i++, *f);
2593        }
2594    }
2595    assert(i == PyUnicode_GET_LENGTH(string));
2596
2597  end:
2598    if (callresults)
2599        PyObject_Free(callresults);
2600    if (numberresults)
2601        PyObject_Free(numberresults);
2602    assert(_PyUnicode_CheckConsistency(string, 1));
2603    return string;
2604  fail:
2605    if (callresults) {
2606        PyObject **callresult2 = callresults;
2607        while (callresult2 < callresult) {
2608            Py_XDECREF(*callresult2);
2609            ++callresult2;
2610        }
2611        PyObject_Free(callresults);
2612    }
2613    if (numberresults)
2614        PyObject_Free(numberresults);
2615    return NULL;
2616}
2617
2618PyObject *
2619PyUnicode_FromFormat(const char *format, ...)
2620{
2621    PyObject* ret;
2622    va_list vargs;
2623
2624#ifdef HAVE_STDARG_PROTOTYPES
2625    va_start(vargs, format);
2626#else
2627    va_start(vargs);
2628#endif
2629    ret = PyUnicode_FromFormatV(format, vargs);
2630    va_end(vargs);
2631    return ret;
2632}
2633
2634#ifdef HAVE_WCHAR_H
2635
2636/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2637   convert a Unicode object to a wide character string.
2638
2639   - If w is NULL: return the number of wide characters (including the null
2640     character) required to convert the unicode object. Ignore size argument.
2641
2642   - Otherwise: return the number of wide characters (excluding the null
2643     character) written into w. Write at most size wide characters (including
2644     the null character). */
2645static Py_ssize_t
2646unicode_aswidechar(PyObject *unicode,
2647                   wchar_t *w,
2648                   Py_ssize_t size)
2649{
2650    Py_ssize_t res;
2651    const wchar_t *wstr;
2652
2653    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2654    if (wstr == NULL)
2655        return -1;
2656
2657    if (w != NULL) {
2658        if (size > res)
2659            size = res + 1;
2660        else
2661            res = size;
2662        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2663        return res;
2664    }
2665    else
2666        return res + 1;
2667}
2668
2669Py_ssize_t
2670PyUnicode_AsWideChar(PyObject *unicode,
2671                     wchar_t *w,
2672                     Py_ssize_t size)
2673{
2674    if (unicode == NULL) {
2675        PyErr_BadInternalCall();
2676        return -1;
2677    }
2678    return unicode_aswidechar(unicode, w, size);
2679}
2680
2681wchar_t*
2682PyUnicode_AsWideCharString(PyObject *unicode,
2683                           Py_ssize_t *size)
2684{
2685    wchar_t* buffer;
2686    Py_ssize_t buflen;
2687
2688    if (unicode == NULL) {
2689        PyErr_BadInternalCall();
2690        return NULL;
2691    }
2692
2693    buflen = unicode_aswidechar(unicode, NULL, 0);
2694    if (buflen == -1)
2695        return NULL;
2696    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2697        PyErr_NoMemory();
2698        return NULL;
2699    }
2700
2701    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2702    if (buffer == NULL) {
2703        PyErr_NoMemory();
2704        return NULL;
2705    }
2706    buflen = unicode_aswidechar(unicode, buffer, buflen);
2707    if (buflen == -1)
2708        return NULL;
2709    if (size != NULL)
2710        *size = buflen;
2711    return buffer;
2712}
2713
2714#endif /* HAVE_WCHAR_H */
2715
2716PyObject *
2717PyUnicode_FromOrdinal(int ordinal)
2718{
2719    PyObject *v;
2720    if (ordinal < 0 || ordinal > 0x10ffff) {
2721        PyErr_SetString(PyExc_ValueError,
2722                        "chr() arg not in range(0x110000)");
2723        return NULL;
2724    }
2725
2726    if (ordinal < 256)
2727        return get_latin1_char(ordinal);
2728
2729    v = PyUnicode_New(1, ordinal);
2730    if (v == NULL)
2731        return NULL;
2732    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2733    assert(_PyUnicode_CheckConsistency(v, 1));
2734    return v;
2735}
2736
2737PyObject *
2738PyUnicode_FromObject(register PyObject *obj)
2739{
2740    /* XXX Perhaps we should make this API an alias of
2741       PyObject_Str() instead ?! */
2742    if (PyUnicode_CheckExact(obj)) {
2743        if (PyUnicode_READY(obj))
2744            return NULL;
2745        Py_INCREF(obj);
2746        return obj;
2747    }
2748    if (PyUnicode_Check(obj)) {
2749        /* For a Unicode subtype that's not a Unicode object,
2750           return a true Unicode object with the same data. */
2751        return PyUnicode_Copy(obj);
2752    }
2753    PyErr_Format(PyExc_TypeError,
2754                 "Can't convert '%.100s' object to str implicitly",
2755                 Py_TYPE(obj)->tp_name);
2756    return NULL;
2757}
2758
2759PyObject *
2760PyUnicode_FromEncodedObject(register PyObject *obj,
2761                            const char *encoding,
2762                            const char *errors)
2763{
2764    Py_buffer buffer;
2765    PyObject *v;
2766
2767    if (obj == NULL) {
2768        PyErr_BadInternalCall();
2769        return NULL;
2770    }
2771
2772    /* Decoding bytes objects is the most common case and should be fast */
2773    if (PyBytes_Check(obj)) {
2774        if (PyBytes_GET_SIZE(obj) == 0) {
2775            Py_INCREF(unicode_empty);
2776            v = unicode_empty;
2777        }
2778        else {
2779            v = PyUnicode_Decode(
2780                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2781                    encoding, errors);
2782        }
2783        return v;
2784    }
2785
2786    if (PyUnicode_Check(obj)) {
2787        PyErr_SetString(PyExc_TypeError,
2788                        "decoding str is not supported");
2789        return NULL;
2790    }
2791
2792    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2793    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2794        PyErr_Format(PyExc_TypeError,
2795                     "coercing to str: need bytes, bytearray "
2796                     "or buffer-like object, %.80s found",
2797                     Py_TYPE(obj)->tp_name);
2798        return NULL;
2799    }
2800
2801    if (buffer.len == 0) {
2802        Py_INCREF(unicode_empty);
2803        v = unicode_empty;
2804    }
2805    else
2806        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2807
2808    PyBuffer_Release(&buffer);
2809    return v;
2810}
2811
/* Write a normalized copy of an encoding name into `lower`: upper-case
   letters (per Py_ISUPPER) are lower-cased and '_' becomes '-', so that
   e.g. UTF_8 is caught. A NULL encoding yields "utf-8".
   Return 1 on success, 0 on error (encoding is longer than lower_len-1). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *last;

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    dst = lower;
    /* Last usable slot: reserved for the terminating null. */
    last = &lower[lower_len - 1];
    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (dst == last)
            return 0;
        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2848
2849PyObject *
2850PyUnicode_Decode(const char *s,
2851                 Py_ssize_t size,
2852                 const char *encoding,
2853                 const char *errors)
2854{
2855    PyObject *buffer = NULL, *unicode;
2856    Py_buffer info;
2857    char lower[11];  /* Enough for any encoding shortcut */
2858
2859    /* Shortcuts for common default encodings */
2860    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2861        if ((strcmp(lower, "utf-8") == 0) ||
2862            (strcmp(lower, "utf8") == 0))
2863            return PyUnicode_DecodeUTF8(s, size, errors);
2864        else if ((strcmp(lower, "latin-1") == 0) ||
2865                 (strcmp(lower, "latin1") == 0) ||
2866                 (strcmp(lower, "iso-8859-1") == 0))
2867            return PyUnicode_DecodeLatin1(s, size, errors);
2868#ifdef HAVE_MBCS
2869        else if (strcmp(lower, "mbcs") == 0)
2870            return PyUnicode_DecodeMBCS(s, size, errors);
2871#endif
2872        else if (strcmp(lower, "ascii") == 0)
2873            return PyUnicode_DecodeASCII(s, size, errors);
2874        else if (strcmp(lower, "utf-16") == 0)
2875            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2876        else if (strcmp(lower, "utf-32") == 0)
2877            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2878    }
2879
2880    /* Decode via the codec registry */
2881    buffer = NULL;
2882    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2883        goto onError;
2884    buffer = PyMemoryView_FromBuffer(&info);
2885    if (buffer == NULL)
2886        goto onError;
2887    unicode = PyCodec_Decode(buffer, encoding, errors);
2888    if (unicode == NULL)
2889        goto onError;
2890    if (!PyUnicode_Check(unicode)) {
2891        PyErr_Format(PyExc_TypeError,
2892                     "decoder did not return a str object (type=%.400s)",
2893                     Py_TYPE(unicode)->tp_name);
2894        Py_DECREF(unicode);
2895        goto onError;
2896    }
2897    Py_DECREF(buffer);
2898#ifndef DONT_MAKE_RESULT_READY
2899    if (_PyUnicode_READY_REPLACE(&unicode)) {
2900        Py_DECREF(unicode);
2901        return NULL;
2902    }
2903#endif
2904    assert(_PyUnicode_CheckConsistency(unicode, 1));
2905    return unicode;
2906
2907  onError:
2908    Py_XDECREF(buffer);
2909    return NULL;
2910}
2911
2912PyObject *
2913PyUnicode_AsDecodedObject(PyObject *unicode,
2914                          const char *encoding,
2915                          const char *errors)
2916{
2917    PyObject *v;
2918
2919    if (!PyUnicode_Check(unicode)) {
2920        PyErr_BadArgument();
2921        goto onError;
2922    }
2923
2924    if (encoding == NULL)
2925        encoding = PyUnicode_GetDefaultEncoding();
2926
2927    /* Decode via the codec registry */
2928    v = PyCodec_Decode(unicode, encoding, errors);
2929    if (v == NULL)
2930        goto onError;
2931    assert(_PyUnicode_CheckConsistency(v, 1));
2932    return v;
2933
2934  onError:
2935    return NULL;
2936}
2937
2938PyObject *
2939PyUnicode_AsDecodedUnicode(PyObject *unicode,
2940                           const char *encoding,
2941                           const char *errors)
2942{
2943    PyObject *v;
2944
2945    if (!PyUnicode_Check(unicode)) {
2946        PyErr_BadArgument();
2947        goto onError;
2948    }
2949
2950    if (encoding == NULL)
2951        encoding = PyUnicode_GetDefaultEncoding();
2952
2953    /* Decode via the codec registry */
2954    v = PyCodec_Decode(unicode, encoding, errors);
2955    if (v == NULL)
2956        goto onError;
2957    if (!PyUnicode_Check(v)) {
2958        PyErr_Format(PyExc_TypeError,
2959                     "decoder did not return a str object (type=%.400s)",
2960                     Py_TYPE(v)->tp_name);
2961        Py_DECREF(v);
2962        goto onError;
2963    }
2964    assert(_PyUnicode_CheckConsistency(v, 1));
2965    return v;
2966
2967  onError:
2968    return NULL;
2969}
2970
2971PyObject *
2972PyUnicode_Encode(const Py_UNICODE *s,
2973                 Py_ssize_t size,
2974                 const char *encoding,
2975                 const char *errors)
2976{
2977    PyObject *v, *unicode;
2978
2979    unicode = PyUnicode_FromUnicode(s, size);
2980    if (unicode == NULL)
2981        return NULL;
2982    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2983    Py_DECREF(unicode);
2984    return v;
2985}
2986
2987PyObject *
2988PyUnicode_AsEncodedObject(PyObject *unicode,
2989                          const char *encoding,
2990                          const char *errors)
2991{
2992    PyObject *v;
2993
2994    if (!PyUnicode_Check(unicode)) {
2995        PyErr_BadArgument();
2996        goto onError;
2997    }
2998
2999    if (encoding == NULL)
3000        encoding = PyUnicode_GetDefaultEncoding();
3001
3002    /* Encode via the codec registry */
3003    v = PyCodec_Encode(unicode, encoding, errors);
3004    if (v == NULL)
3005        goto onError;
3006    return v;
3007
3008  onError:
3009    return NULL;
3010}
3011
3012PyObject *
3013PyUnicode_EncodeFSDefault(PyObject *unicode)
3014{
3015#ifdef HAVE_MBCS
3016    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3017                                PyUnicode_GET_SIZE(unicode),
3018                                NULL);
3019#elif defined(__APPLE__)
3020    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3021#else
3022    PyInterpreterState *interp = PyThreadState_GET()->interp;
3023    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3024       cannot use it to encode and decode filenames before it is loaded. Load
3025       the Python codec requires to encode at least its own filename. Use the C
3026       version of the locale codec until the codec registry is initialized and
3027       the Python codec is loaded.
3028
3029       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3030       cannot only rely on it: check also interp->fscodec_initialized for
3031       subinterpreters. */
3032    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3033        return PyUnicode_AsEncodedString(unicode,
3034                                         Py_FileSystemDefaultEncoding,
3035                                         "surrogateescape");
3036    }
3037    else {
3038        /* locale encoding with surrogateescape */
3039        wchar_t *wchar;
3040        char *bytes;
3041        PyObject *bytes_obj;
3042        size_t error_pos;
3043
3044        wchar = PyUnicode_AsWideCharString(unicode, NULL);
3045        if (wchar == NULL)
3046            return NULL;
3047        bytes = _Py_wchar2char(wchar, &error_pos);
3048        if (bytes == NULL) {
3049            if (error_pos != (size_t)-1) {
3050                char *errmsg = strerror(errno);
3051                PyObject *exc = NULL;
3052                if (errmsg == NULL)
3053                    errmsg = "Py_wchar2char() failed";
3054                raise_encode_exception(&exc,
3055                    "filesystemencoding", unicode,
3056                    error_pos, error_pos+1,
3057                    errmsg);
3058                Py_XDECREF(exc);
3059            }
3060            else
3061                PyErr_NoMemory();
3062            PyMem_Free(wchar);
3063            return NULL;
3064        }
3065        PyMem_Free(wchar);
3066
3067        bytes_obj = PyBytes_FromString(bytes);
3068        PyMem_Free(bytes);
3069        return bytes_obj;
3070    }
3071#endif
3072}
3073
3074PyObject *
3075PyUnicode_AsEncodedString(PyObject *unicode,
3076                          const char *encoding,
3077                          const char *errors)
3078{
3079    PyObject *v;
3080    char lower[11];  /* Enough for any encoding shortcut */
3081
3082    if (!PyUnicode_Check(unicode)) {
3083        PyErr_BadArgument();
3084        return NULL;
3085    }
3086
3087    /* Shortcuts for common default encodings */
3088    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3089        if ((strcmp(lower, "utf-8") == 0) ||
3090            (strcmp(lower, "utf8") == 0))
3091        {
3092            if (errors == NULL || strcmp(errors, "strict") == 0)
3093                return _PyUnicode_AsUTF8String(unicode, NULL);
3094            else
3095                return _PyUnicode_AsUTF8String(unicode, errors);
3096        }
3097        else if ((strcmp(lower, "latin-1") == 0) ||
3098                 (strcmp(lower, "latin1") == 0) ||
3099                 (strcmp(lower, "iso-8859-1") == 0))
3100            return _PyUnicode_AsLatin1String(unicode, errors);
3101#ifdef HAVE_MBCS
3102        else if (strcmp(lower, "mbcs") == 0)
3103            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3104                                        PyUnicode_GET_SIZE(unicode),
3105                                        errors);
3106#endif
3107        else if (strcmp(lower, "ascii") == 0)
3108            return _PyUnicode_AsASCIIString(unicode, errors);
3109    }
3110
3111    /* Encode via the codec registry */
3112    v = PyCodec_Encode(unicode, encoding, errors);
3113    if (v == NULL)
3114        return NULL;
3115
3116    /* The normal path */
3117    if (PyBytes_Check(v))
3118        return v;
3119
3120    /* If the codec returns a buffer, raise a warning and convert to bytes */
3121    if (PyByteArray_Check(v)) {
3122        int error;
3123        PyObject *b;
3124
3125        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3126            "encoder %s returned bytearray instead of bytes",
3127            encoding);
3128        if (error) {
3129            Py_DECREF(v);
3130            return NULL;
3131        }
3132
3133        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3134        Py_DECREF(v);
3135        return b;
3136    }
3137
3138    PyErr_Format(PyExc_TypeError,
3139                 "encoder did not return a bytes object (type=%.400s)",
3140                 Py_TYPE(v)->tp_name);
3141    Py_DECREF(v);
3142    return NULL;
3143}
3144
3145PyObject *
3146PyUnicode_AsEncodedUnicode(PyObject *unicode,
3147                           const char *encoding,
3148                           const char *errors)
3149{
3150    PyObject *v;
3151
3152    if (!PyUnicode_Check(unicode)) {
3153        PyErr_BadArgument();
3154        goto onError;
3155    }
3156
3157    if (encoding == NULL)
3158        encoding = PyUnicode_GetDefaultEncoding();
3159
3160    /* Encode via the codec registry */
3161    v = PyCodec_Encode(unicode, encoding, errors);
3162    if (v == NULL)
3163        goto onError;
3164    if (!PyUnicode_Check(v)) {
3165        PyErr_Format(PyExc_TypeError,
3166                     "encoder did not return an str object (type=%.400s)",
3167                     Py_TYPE(v)->tp_name);
3168        Py_DECREF(v);
3169        goto onError;
3170    }
3171    return v;
3172
3173  onError:
3174    return NULL;
3175}
3176
3177PyObject*
3178PyUnicode_DecodeFSDefault(const char *s) {
3179    Py_ssize_t size = (Py_ssize_t)strlen(s);
3180    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3181}
3182
3183PyObject*
3184PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3185{
3186#ifdef HAVE_MBCS
3187    return PyUnicode_DecodeMBCS(s, size, NULL);
3188#elif defined(__APPLE__)
3189    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3190#else
3191    PyInterpreterState *interp = PyThreadState_GET()->interp;
3192    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3193       cannot use it to encode and decode filenames before it is loaded. Load
3194       the Python codec requires to encode at least its own filename. Use the C
3195       version of the locale codec until the codec registry is initialized and
3196       the Python codec is loaded.
3197
3198       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3199       cannot only rely on it: check also interp->fscodec_initialized for
3200       subinterpreters. */
3201    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3202        return PyUnicode_Decode(s, size,
3203                                Py_FileSystemDefaultEncoding,
3204                                "surrogateescape");
3205    }
3206    else {
3207        /* locale encoding with surrogateescape */
3208        wchar_t *wchar;
3209        PyObject *unicode;
3210        size_t len;
3211
3212        if (s[size] != '\0' || size != strlen(s)) {
3213            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3214            return NULL;
3215        }
3216
3217        wchar = _Py_char2wchar(s, &len);
3218        if (wchar == NULL)
3219            return PyErr_NoMemory();
3220
3221        unicode = PyUnicode_FromWideChar(wchar, len);
3222        PyMem_Free(wchar);
3223        return unicode;
3224    }
3225#endif
3226}
3227
3228
3229int
3230PyUnicode_FSConverter(PyObject* arg, void* addr)
3231{
3232    PyObject *output = NULL;
3233    Py_ssize_t size;
3234    void *data;
3235    if (arg == NULL) {
3236        Py_DECREF(*(PyObject**)addr);
3237        return 1;
3238    }
3239    if (PyBytes_Check(arg)) {
3240        output = arg;
3241        Py_INCREF(output);
3242    }
3243    else {
3244        arg = PyUnicode_FromObject(arg);
3245        if (!arg)
3246            return 0;
3247        output = PyUnicode_EncodeFSDefault(arg);
3248        Py_DECREF(arg);
3249        if (!output)
3250            return 0;
3251        if (!PyBytes_Check(output)) {
3252            Py_DECREF(output);
3253            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3254            return 0;
3255        }
3256    }
3257    size = PyBytes_GET_SIZE(output);
3258    data = PyBytes_AS_STRING(output);
3259    if (size != strlen(data)) {
3260        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3261        Py_DECREF(output);
3262        return 0;
3263    }
3264    *(PyObject**)addr = output;
3265    return Py_CLEANUP_SUPPORTED;
3266}
3267
3268
3269int
3270PyUnicode_FSDecoder(PyObject* arg, void* addr)
3271{
3272    PyObject *output = NULL;
3273    if (arg == NULL) {
3274        Py_DECREF(*(PyObject**)addr);
3275        return 1;
3276    }
3277    if (PyUnicode_Check(arg)) {
3278        if (PyUnicode_READY(arg))
3279            return 0;
3280        output = arg;
3281        Py_INCREF(output);
3282    }
3283    else {
3284        arg = PyBytes_FromObject(arg);
3285        if (!arg)
3286            return 0;
3287        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3288                                                  PyBytes_GET_SIZE(arg));
3289        Py_DECREF(arg);
3290        if (!output)
3291            return 0;
3292        if (!PyUnicode_Check(output)) {
3293            Py_DECREF(output);
3294            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3295            return 0;
3296        }
3297    }
3298    if (PyUnicode_READY(output) < 0) {
3299        Py_DECREF(output);
3300        return 0;
3301    }
3302    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3303                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3304        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3305        Py_DECREF(output);
3306        return 0;
3307    }
3308    *(PyObject**)addr = output;
3309    return Py_CLEANUP_SUPPORTED;
3310}
3311
3312
3313char*
3314PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3315{
3316    PyObject *bytes;
3317
3318    if (!PyUnicode_Check(unicode)) {
3319        PyErr_BadArgument();
3320        return NULL;
3321    }
3322    if (PyUnicode_READY(unicode) == -1)
3323        return NULL;
3324
3325    if (PyUnicode_UTF8(unicode) == NULL) {
3326        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3327        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3328        if (bytes == NULL)
3329            return NULL;
3330        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3331        if (_PyUnicode_UTF8(unicode) == NULL) {
3332            Py_DECREF(bytes);
3333            return NULL;
3334        }
3335        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3336        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3337                  PyBytes_AS_STRING(bytes),
3338                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3339        Py_DECREF(bytes);
3340    }
3341
3342    if (psize)
3343        *psize = PyUnicode_UTF8_LENGTH(unicode);
3344    return PyUnicode_UTF8(unicode);
3345}
3346
3347char*
3348PyUnicode_AsUTF8(PyObject *unicode)
3349{
3350    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3351}
3352
3353#ifdef Py_DEBUG
3354static int unicode_as_unicode_calls = 0;
3355#endif
3356
3357
3358Py_UNICODE *
3359PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3360{
3361    const unsigned char *one_byte;
3362#if SIZEOF_WCHAR_T == 4
3363    const Py_UCS2 *two_bytes;
3364#else
3365    const Py_UCS4 *four_bytes;
3366    const Py_UCS4 *ucs4_end;
3367    Py_ssize_t num_surrogates;
3368#endif
3369    wchar_t *w;
3370    wchar_t *wchar_end;
3371
3372    if (!PyUnicode_Check(unicode)) {
3373        PyErr_BadArgument();
3374        return NULL;
3375    }
3376    if (_PyUnicode_WSTR(unicode) == NULL) {
3377        /* Non-ASCII compact unicode object */
3378        assert(_PyUnicode_KIND(unicode) != 0);
3379        assert(PyUnicode_IS_READY(unicode));
3380
3381#ifdef Py_DEBUG
3382        ++unicode_as_unicode_calls;
3383#endif
3384
3385        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3386#if SIZEOF_WCHAR_T == 2
3387            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3388            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3389            num_surrogates = 0;
3390
3391            for (; four_bytes < ucs4_end; ++four_bytes) {
3392                if (*four_bytes > 0xFFFF)
3393                    ++num_surrogates;
3394            }
3395
3396            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3397                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3398            if (!_PyUnicode_WSTR(unicode)) {
3399                PyErr_NoMemory();
3400                return NULL;
3401            }
3402            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3403
3404            w = _PyUnicode_WSTR(unicode);
3405            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3406            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3407            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3408                if (*four_bytes > 0xFFFF) {
3409                    /* encode surrogate pair in this case */
3410                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3411                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3412                }
3413                else
3414                    *w = *four_bytes;
3415
3416                if (w > wchar_end) {
3417                    assert(0 && "Miscalculated string end");
3418                }
3419            }
3420            *w = 0;
3421#else
3422            /* sizeof(wchar_t) == 4 */
3423            Py_FatalError("Impossible unicode object state, wstr and str "
3424                          "should share memory already.");
3425            return NULL;
3426#endif
3427        }
3428        else {
3429            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3430                                                  (_PyUnicode_LENGTH(unicode) + 1));
3431            if (!_PyUnicode_WSTR(unicode)) {
3432                PyErr_NoMemory();
3433                return NULL;
3434            }
3435            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3436                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3437            w = _PyUnicode_WSTR(unicode);
3438            wchar_end = w + _PyUnicode_LENGTH(unicode);
3439
3440            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3441                one_byte = PyUnicode_1BYTE_DATA(unicode);
3442                for (; w < wchar_end; ++one_byte, ++w)
3443                    *w = *one_byte;
3444                /* null-terminate the wstr */
3445                *w = 0;
3446            }
3447            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3448#if SIZEOF_WCHAR_T == 4
3449                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3450                for (; w < wchar_end; ++two_bytes, ++w)
3451                    *w = *two_bytes;
3452                /* null-terminate the wstr */
3453                *w = 0;
3454#else
3455                /* sizeof(wchar_t) == 2 */
3456                PyObject_FREE(_PyUnicode_WSTR(unicode));
3457                _PyUnicode_WSTR(unicode) = NULL;
3458                Py_FatalError("Impossible unicode object state, wstr "
3459                              "and str should share memory already.");
3460                return NULL;
3461#endif
3462            }
3463            else {
3464                assert(0 && "This should never happen.");
3465            }
3466        }
3467    }
3468    if (size != NULL)
3469        *size = PyUnicode_WSTR_LENGTH(unicode);
3470    return _PyUnicode_WSTR(unicode);
3471}
3472
3473Py_UNICODE *
3474PyUnicode_AsUnicode(PyObject *unicode)
3475{
3476    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3477}
3478
3479
3480Py_ssize_t
3481PyUnicode_GetSize(PyObject *unicode)
3482{
3483    if (!PyUnicode_Check(unicode)) {
3484        PyErr_BadArgument();
3485        goto onError;
3486    }
3487    return PyUnicode_GET_SIZE(unicode);
3488
3489  onError:
3490    return -1;
3491}
3492
3493Py_ssize_t
3494PyUnicode_GetLength(PyObject *unicode)
3495{
3496    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3497        PyErr_BadArgument();
3498        return -1;
3499    }
3500
3501    return PyUnicode_GET_LENGTH(unicode);
3502}
3503
3504Py_UCS4
3505PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3506{
3507    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3508        PyErr_BadArgument();
3509        return (Py_UCS4)-1;
3510    }
3511    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3512        PyErr_SetString(PyExc_IndexError, "string index out of range");
3513        return (Py_UCS4)-1;
3514    }
3515    return PyUnicode_READ_CHAR(unicode, index);
3516}
3517
3518int
3519PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3520{
3521    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3522        PyErr_BadArgument();
3523        return -1;
3524    }
3525    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3526        PyErr_SetString(PyExc_IndexError, "string index out of range");
3527        return -1;
3528    }
3529    if (_PyUnicode_Dirty(unicode))
3530        return -1;
3531    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3532                    index, ch);
3533    return 0;
3534}
3535
const char *
PyUnicode_GetDefaultEncoding(void)
{
    /* The default encoding is a fixed string. */
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3541
3542/* create or adjust a UnicodeDecodeError */
3543static void
3544make_decode_exception(PyObject **exceptionObject,
3545                      const char *encoding,
3546                      const char *input, Py_ssize_t length,
3547                      Py_ssize_t startpos, Py_ssize_t endpos,
3548                      const char *reason)
3549{
3550    if (*exceptionObject == NULL) {
3551        *exceptionObject = PyUnicodeDecodeError_Create(
3552            encoding, input, length, startpos, endpos, reason);
3553    }
3554    else {
3555        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3556            goto onError;
3557        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3558            goto onError;
3559        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3560            goto onError;
3561    }
3562    return;
3563
3564onError:
3565    Py_DECREF(*exceptionObject);
3566    *exceptionObject = NULL;
3567}
3568
3569/* error handling callback helper:
3570   build arguments, call the callback and check the arguments,
3571   if no exception occurred, copy the replacement to the output
3572   and adjust various state variables.
3573   return 0 on success, -1 on error
3574*/
3575
3576static int
3577unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3578                                 const char *encoding, const char *reason,
3579                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3580                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3581                                 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
3582{
3583    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3584
3585    PyObject *restuple = NULL;
3586    PyObject *repunicode = NULL;
3587    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
3588    Py_ssize_t insize;
3589    Py_ssize_t requiredsize;
3590    Py_ssize_t newpos;
3591    const Py_UNICODE *repptr;
3592    PyObject *inputobj = NULL;
3593    Py_ssize_t repsize;
3594    int res = -1;
3595
3596    if (*errorHandler == NULL) {
3597        *errorHandler = PyCodec_LookupError(errors);
3598        if (*errorHandler == NULL)
3599            goto onError;
3600    }
3601
3602    make_decode_exception(exceptionObject,
3603        encoding,
3604        *input, *inend - *input,
3605        *startinpos, *endinpos,
3606        reason);
3607    if (*exceptionObject == NULL)
3608        goto onError;
3609
3610    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3611    if (restuple == NULL)
3612        goto onError;
3613    if (!PyTuple_Check(restuple)) {
3614        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3615        goto onError;
3616    }
3617    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3618        goto onError;
3619
3620    /* Copy back the bytes variables, which might have been modified by the
3621       callback */
3622    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3623    if (!inputobj)
3624        goto onError;
3625    if (!PyBytes_Check(inputobj)) {
3626        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3627    }
3628    *input = PyBytes_AS_STRING(inputobj);
3629    insize = PyBytes_GET_SIZE(inputobj);
3630    *inend = *input + insize;
3631    /* we can DECREF safely, as the exception has another reference,
3632       so the object won't go away. */
3633    Py_DECREF(inputobj);
3634
3635    if (newpos<0)
3636        newpos = insize+newpos;
3637    if (newpos<0 || newpos>insize) {
3638        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3639        goto onError;
3640    }
3641
3642    /* need more space? (at least enough for what we
3643       have+the replacement+the rest of the string (starting
3644       at the new input position), so we won't have to check space
3645       when there are no errors in the rest of the string) */
3646    repptr = PyUnicode_AS_UNICODE(repunicode);
3647    repsize = PyUnicode_GET_SIZE(repunicode);
3648    requiredsize = *outpos + repsize + insize-newpos;
3649    if (requiredsize > outsize) {
3650        if (requiredsize<2*outsize)
3651            requiredsize = 2*outsize;
3652        if (PyUnicode_Resize(output, requiredsize) < 0)
3653            goto onError;
3654        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
3655    }
3656    *endinpos = newpos;
3657    *inptr = *input + newpos;
3658    Py_UNICODE_COPY(*outptr, repptr, repsize);
3659    *outptr += repsize;
3660    *outpos += repsize;
3661
3662    /* we made it! */
3663    res = 0;
3664
3665  onError:
3666    Py_XDECREF(restuple);
3667    return res;
3668}
3669
3670/* --- UTF-7 Codec -------------------------------------------------------- */
3671
3672/* See RFC2152 for details.  We encode conservatively and decode liberally. */
3673
3674/* Three simple macros defining base-64. */
3675
/* IS_BASE64(c): non-zero when c is one of the 64 characters of the
 * base-64 alphabet (A-Z, a-z, 0-9, '+' and '/').  The argument is
 * expanded several times, so it must be free of side effects. */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('A' <= (c) && (c) <= 'Z') ||              \
     ('a' <= (c) && (c) <= 'z'))
3683
/* FROM_BASE64(c): the 6-bit value (0..63) represented by the base-64
 * character c.  The result is only meaningful for characters of the
 * base-64 alphabet; anything that is not A-Z, a-z, 0-9 or '+' yields 63.
 * The argument is expanded several times. */

#define FROM_BASE64(c)                                          \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                   \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :              \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :              \
     ((c) != '+') ? 63 : 62)
3691
/* TO_BASE64(n): the base-64 alphabet character that represents the low
 * six bits of n; all higher bits of n are ignored. */

#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 0x3f])
3696
/* DECODE_DIRECT(c): true when byte c, met in a UTF-7 string, is decoded
 * as itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
3704
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written
 * to, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         character code; only 1..127 can ever be direct (the range
 *             test also keeps the table index in bounds)
 *   directO   non-zero: Set O characters are written as themselves
 *   directWS  non-zero: whitespace characters are written as themselves
 *
 * Set D characters are always direct.  c is expanded several times, so
 * it must be free of side effects.  The flag arguments are parenthesized
 * so that compound expressions (e.g. "a || b") bind as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
3750
3751PyObject *
3752PyUnicode_DecodeUTF7(const char *s,
3753                     Py_ssize_t size,
3754                     const char *errors)
3755{
3756    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3757}
3758
3759/* The decoder.  The only state we preserve is our read position,
3760 * i.e. how many characters we have consumed.  So if we end in the
3761 * middle of a shift sequence we have to back off the read position
3762 * and the output to the beginning of the sequence, otherwise we lose
3763 * all the shift state (seen bits, number of bits seen, high
3764 * surrogate). */
3765
3766PyObject *
3767PyUnicode_DecodeUTF7Stateful(const char *s,
3768                             Py_ssize_t size,
3769                             const char *errors,
3770                             Py_ssize_t *consumed)
3771{
3772    const char *starts = s;
3773    Py_ssize_t startinpos;
3774    Py_ssize_t endinpos;
3775    Py_ssize_t outpos;
3776    const char *e;
3777    PyObject *unicode;
3778    Py_UNICODE *p;
3779    const char *errmsg = "";
3780    int inShift = 0;
3781    Py_UNICODE *shiftOutStart;
3782    unsigned int base64bits = 0;
3783    unsigned long base64buffer = 0;
3784    Py_UNICODE surrogate = 0;
3785    PyObject *errorHandler = NULL;
3786    PyObject *exc = NULL;
3787
3788    unicode = (PyObject*)_PyUnicode_New(size);
3789    if (!unicode)
3790        return NULL;
3791    if (size == 0) {
3792        if (consumed)
3793            *consumed = 0;
3794        return unicode;
3795    }
3796
3797    p = PyUnicode_AS_UNICODE(unicode);
3798    shiftOutStart = p;
3799    e = s + size;
3800
3801    while (s < e) {
3802        Py_UNICODE ch;
3803      restart:
3804        ch = (unsigned char) *s;
3805
3806        if (inShift) { /* in a base-64 section */
3807            if (IS_BASE64(ch)) { /* consume a base-64 character */
3808                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3809                base64bits += 6;
3810                s++;
3811                if (base64bits >= 16) {
3812                    /* we have enough bits for a UTF-16 value */
3813                    Py_UNICODE outCh = (Py_UNICODE)
3814                                       (base64buffer >> (base64bits-16));
3815                    base64bits -= 16;
3816                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3817                    if (surrogate) {
3818                        /* expecting a second surrogate */
3819                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3820#ifdef Py_UNICODE_WIDE
3821                            *p++ = (((surrogate & 0x3FF)<<10)
3822                                    | (outCh & 0x3FF)) + 0x10000;
3823#else
3824                            *p++ = surrogate;
3825                            *p++ = outCh;
3826#endif
3827                            surrogate = 0;
3828                        }
3829                        else {
3830                            surrogate = 0;
3831                            errmsg = "second surrogate missing";
3832                            goto utf7Error;
3833                        }
3834                    }
3835                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3836                        /* first surrogate */
3837                        surrogate = outCh;
3838                    }
3839                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3840                        errmsg = "unexpected second surrogate";
3841                        goto utf7Error;
3842                    }
3843                    else {
3844                        *p++ = outCh;
3845                    }
3846                }
3847            }
3848            else { /* now leaving a base-64 section */
3849                inShift = 0;
3850                s++;
3851                if (surrogate) {
3852                    errmsg = "second surrogate missing at end of shift sequence";
3853                    goto utf7Error;
3854                }
3855                if (base64bits > 0) { /* left-over bits */
3856                    if (base64bits >= 6) {
3857                        /* We've seen at least one base-64 character */
3858                        errmsg = "partial character in shift sequence";
3859                        goto utf7Error;
3860                    }
3861                    else {
3862                        /* Some bits remain; they should be zero */
3863                        if (base64buffer != 0) {
3864                            errmsg = "non-zero padding bits in shift sequence";
3865                            goto utf7Error;
3866                        }
3867                    }
3868                }
3869                if (ch != '-') {
3870                    /* '-' is absorbed; other terminating
3871                       characters are preserved */
3872                    *p++ = ch;
3873                }
3874            }
3875        }
3876        else if ( ch == '+' ) {
3877            startinpos = s-starts;
3878            s++; /* consume '+' */
3879            if (s < e && *s == '-') { /* '+-' encodes '+' */
3880                s++;
3881                *p++ = '+';
3882            }
3883            else { /* begin base64-encoded section */
3884                inShift = 1;
3885                shiftOutStart = p;
3886                base64bits = 0;
3887            }
3888        }
3889        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
3890            *p++ = ch;
3891            s++;
3892        }
3893        else {
3894            startinpos = s-starts;
3895            s++;
3896            errmsg = "unexpected special character";
3897            goto utf7Error;
3898        }
3899        continue;
3900utf7Error:
3901        outpos = p-PyUnicode_AS_UNICODE(unicode);
3902        endinpos = s-starts;
3903        if (unicode_decode_call_errorhandler(
3904                errors, &errorHandler,
3905                "utf7", errmsg,
3906                &starts, &e, &startinpos, &endinpos, &exc, &s,
3907                &unicode, &outpos, &p))
3908            goto onError;
3909    }
3910
3911    /* end of string */
3912
3913    if (inShift && !consumed) { /* in shift sequence, no more to follow */
3914        /* if we're in an inconsistent state, that's an error */
3915        if (surrogate ||
3916                (base64bits >= 6) ||
3917                (base64bits > 0 && base64buffer != 0)) {
3918            outpos = p-PyUnicode_AS_UNICODE(unicode);
3919            endinpos = size;
3920            if (unicode_decode_call_errorhandler(
3921                    errors, &errorHandler,
3922                    "utf7", "unterminated shift sequence",
3923                    &starts, &e, &startinpos, &endinpos, &exc, &s,
3924                    &unicode, &outpos, &p))
3925                goto onError;
3926            if (s < e)
3927                goto restart;
3928        }
3929    }
3930
3931    /* return state */
3932    if (consumed) {
3933        if (inShift) {
3934            p = shiftOutStart; /* back off output */
3935            *consumed = startinpos;
3936        }
3937        else {
3938            *consumed = s-starts;
3939        }
3940    }
3941
3942    if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
3943        goto onError;
3944
3945    Py_XDECREF(errorHandler);
3946    Py_XDECREF(exc);
3947#ifndef DONT_MAKE_RESULT_READY
3948    if (_PyUnicode_READY_REPLACE(&unicode)) {
3949        Py_DECREF(unicode);
3950        return NULL;
3951    }
3952#endif
3953    assert(_PyUnicode_CheckConsistency(unicode, 1));
3954    return unicode;
3955
3956  onError:
3957    Py_XDECREF(errorHandler);
3958    Py_XDECREF(exc);
3959    Py_DECREF(unicode);
3960    return NULL;
3961}
3962
3963
3964PyObject *
3965PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3966                     Py_ssize_t size,
3967                     int base64SetO,
3968                     int base64WhiteSpace,
3969                     const char *errors)
3970{
3971    PyObject *v;
3972    /* It might be possible to tighten this worst case */
3973    Py_ssize_t allocated = 8 * size;
3974    int inShift = 0;
3975    Py_ssize_t i = 0;
3976    unsigned int base64bits = 0;
3977    unsigned long base64buffer = 0;
3978    char * out;
3979    char * start;
3980
3981    if (size == 0)
3982        return PyBytes_FromStringAndSize(NULL, 0);
3983
3984    if (allocated / 8 != size)
3985        return PyErr_NoMemory();
3986
3987    v = PyBytes_FromStringAndSize(NULL, allocated);
3988    if (v == NULL)
3989        return NULL;
3990
3991    start = out = PyBytes_AS_STRING(v);
3992    for (;i < size; ++i) {
3993        Py_UNICODE ch = s[i];
3994
3995        if (inShift) {
3996            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3997                /* shifting out */
3998                if (base64bits) { /* output remaining bits */
3999                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4000                    base64buffer = 0;
4001                    base64bits = 0;
4002                }
4003                inShift = 0;
4004                /* Characters not in the BASE64 set implicitly unshift the sequence
4005                   so no '-' is required, except if the character is itself a '-' */
4006                if (IS_BASE64(ch) || ch == '-') {
4007                    *out++ = '-';
4008                }
4009                *out++ = (char) ch;
4010            }
4011            else {
4012                goto encode_char;
4013            }
4014        }
4015        else { /* not in a shift sequence */
4016            if (ch == '+') {
4017                *out++ = '+';
4018                        *out++ = '-';
4019            }
4020            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4021                *out++ = (char) ch;
4022            }
4023            else {
4024                *out++ = '+';
4025                inShift = 1;
4026                goto encode_char;
4027            }
4028        }
4029        continue;
4030encode_char:
4031#ifdef Py_UNICODE_WIDE
4032        if (ch >= 0x10000) {
4033            /* code first surrogate */
4034            base64bits += 16;
4035            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4036            while (base64bits >= 6) {
4037                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4038                base64bits -= 6;
4039            }
4040            /* prepare second surrogate */
4041            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
4042        }
4043#endif
4044        base64bits += 16;
4045        base64buffer = (base64buffer << 16) | ch;
4046        while (base64bits >= 6) {
4047            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4048            base64bits -= 6;
4049        }
4050    }
4051    if (base64bits)
4052        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4053    if (inShift)
4054        *out++ = '-';
4055    if (_PyBytes_Resize(&v, out - start) < 0)
4056        return NULL;
4057    return v;
4058}
4059
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
4065
4066/* --- UTF-8 Codec -------------------------------------------------------- */
4067
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.

   Indexed by the first byte of a sequence (0..255).  Continuation bytes
   (80-BF), C0-C1 and F5-FF are illegal as a first byte.  The table is
   only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4089
4090PyObject *
4091PyUnicode_DecodeUTF8(const char *s,
4092                     Py_ssize_t size,
4093                     const char *errors)
4094{
4095    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4096}
4097
/* Mask to check or force alignment of a pointer to C 'long' boundaries:
   the low bits that are zero in an address aligned to SIZEOF_LONG.
   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4100
4101/* Mask to quickly check whether a C 'long' contains a
4102   non-ASCII, UTF8-encoded char. */
4103#if (SIZEOF_LONG == 8)
4104# define ASCII_CHAR_MASK 0x8080808080808080L
4105#elif (SIZEOF_LONG == 4)
4106# define ASCII_CHAR_MASK 0x80808080L
4107#else
4108# error C 'long' size should be either 4 or 8!
4109#endif
4110
4111/* Scans a UTF-8 string and returns the maximum character to be expected,
4112   the size of the decoded unicode string and if any major errors were
4113   encountered.
4114
4115   This function does check basic UTF-8 sanity, it does however NOT CHECK
4116   if the string contains surrogates, and if all continuation bytes are
4117   within the correct ranges, these checks are performed in
4118   PyUnicode_DecodeUTF8Stateful.
4119
4120   If it sets has_errors to 1, it means the value of unicode_size and max_char
4121   will be bogus and you should not rely on useful information in them.
4122   */
4123static Py_UCS4
4124utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4125                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4126                                  int *has_errors)
4127{
4128    Py_ssize_t n;
4129    Py_ssize_t char_count = 0;
4130    Py_UCS4 max_char = 127, new_max;
4131    Py_UCS4 upper_bound;
4132    const unsigned char *p = (const unsigned char *)s;
4133    const unsigned char *end = p + string_size;
4134    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4135    int err = 0;
4136
4137    for (; p < end && !err; ++p, ++char_count) {
4138        /* Only check value if it's not a ASCII char... */
4139        if (*p < 0x80) {
4140            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4141               an explanation. */
4142            if (!((size_t) p & LONG_PTR_MASK)) {
4143                /* Help register allocation */
4144                register const unsigned char *_p = p;
4145                while (_p < aligned_end) {
4146                    unsigned long value = *(unsigned long *) _p;
4147                    if (value & ASCII_CHAR_MASK)
4148                        break;
4149                    _p += SIZEOF_LONG;
4150                    char_count += SIZEOF_LONG;
4151                }
4152                p = _p;
4153                if (p == end)
4154                    break;
4155            }
4156        }
4157        if (*p >= 0x80) {
4158            n = utf8_code_length[*p];
4159            new_max = max_char;
4160            switch (n) {
4161            /* invalid start byte */
4162            case 0:
4163                err = 1;
4164                break;
4165            case 2:
4166                /* Code points between 0x00FF and 0x07FF inclusive.
4167                   Approximate the upper bound of the code point,
4168                   if this flips over 255 we can be sure it will be more
4169                   than 255 and the string will need 2 bytes per code coint,
4170                   if it stays under or equal to 255, we can be sure 1 byte
4171                   is enough.
4172                   ((*p & 0b00011111) << 6) | 0b00111111 */
4173                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4174                if (max_char < upper_bound)
4175                    new_max = upper_bound;
4176                /* Ensure we track at least that we left ASCII space. */
4177                if (new_max < 128)
4178                    new_max = 128;
4179                break;
4180            case 3:
4181                /* Between 0x0FFF and 0xFFFF inclusive, so values are
4182                   always > 255 and <= 65535 and will always need 2 bytes. */
4183                if (max_char < 65535)
4184                    new_max = 65535;
4185                break;
4186            case 4:
4187                /* Code point will be above 0xFFFF for sure in this case. */
4188                new_max = 65537;
4189                break;
4190            /* Internal error, this should be caught by the first if */
4191            case 1:
4192            default:
4193                assert(0 && "Impossible case in utf8_max_char_and_size");
4194                err = 1;
4195            }
4196            /* Instead of number of overall bytes for this code point,
4197               n contains the number of following bytes: */
4198            --n;
4199            /* Check if the follow up chars are all valid continuation bytes */
4200            if (n >= 1) {
4201                const unsigned char *cont;
4202                if ((p + n) >= end) {
4203                    if (consumed == 0)
4204                        /* incomplete data, non-incremental decoding */
4205                        err = 1;
4206                    break;
4207                }
4208                for (cont = p + 1; cont < (p + n); ++cont) {
4209                    if ((*cont & 0xc0) != 0x80) {
4210                        err = 1;
4211                        break;
4212                    }
4213                }
4214                p += n;
4215            }
4216            else
4217                err = 1;
4218            max_char = new_max;
4219        }
4220    }
4221
4222    if (unicode_size)
4223        *unicode_size = char_count;
4224    if (has_errors)
4225        *has_errors = err;
4226    return max_char;
4227}
4228
/* Store `value` at position `index` of `buf`, whose element type is
   selected by `kind`.  Similar to PyUnicode_WRITE, but also accepts
   PyUnicode_WCHAR_KIND to write into the wstr field of the legacy
   unicode representation.  Any kind other than the WCHAR, 1BYTE and
   2BYTE ones is written as Py_UCS4.  Only one branch runs, so `index`
   and `value` are each evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int kind_ = (kind);                                           \
        if (kind_ == PyUnicode_1BYTE_KIND) {                                \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (kind_ == PyUnicode_2BYTE_KIND) {                           \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else if (kind_ == PyUnicode_WCHAR_KIND) {                           \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
4243
4244PyObject *
4245PyUnicode_DecodeUTF8Stateful(const char *s,
4246                             Py_ssize_t size,
4247                             const char *errors,
4248                             Py_ssize_t *consumed)
4249{
4250    const char *starts = s;
4251    int n;
4252    int k;
4253    Py_ssize_t startinpos;
4254    Py_ssize_t endinpos;
4255    const char *e, *aligned_end;
4256    PyObject *unicode;
4257    const char *errmsg = "";
4258    PyObject *errorHandler = NULL;
4259    PyObject *exc = NULL;
4260    Py_UCS4 maxchar = 0;
4261    Py_ssize_t unicode_size;
4262    Py_ssize_t i;
4263    int kind;
4264    void *data;
4265    int has_errors;
4266    Py_UNICODE *error_outptr;
4267#if SIZEOF_WCHAR_T == 2
4268    Py_ssize_t wchar_offset = 0;
4269#endif
4270
4271    if (size == 0) {
4272        if (consumed)
4273            *consumed = 0;
4274        return (PyObject *)PyUnicode_New(0, 0);
4275    }
4276    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4277                                                consumed, &has_errors);
4278    if (has_errors) {
4279        unicode = (PyObject*)_PyUnicode_New(size);
4280        if (!unicode)
4281            return NULL;
4282        kind = PyUnicode_WCHAR_KIND;
4283        data = PyUnicode_AS_UNICODE(unicode);
4284        assert(data != NULL);
4285    }
4286    else {
4287        unicode = PyUnicode_New(unicode_size, maxchar);
4288        if (!unicode)
4289            return NULL;
4290        /* When the string is ASCII only, just use memcpy and return.
4291           unicode_size may be != size if there is an incomplete UTF-8
4292           sequence at the end of the ASCII block.  */
4293        if (maxchar < 128 && size == unicode_size) {
4294            Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4295            return unicode;
4296        }
4297        kind = PyUnicode_KIND(unicode);
4298        data = PyUnicode_DATA(unicode);
4299    }
4300    /* Unpack UTF-8 encoded data */
4301    i = 0;
4302    e = s + size;
4303    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4304
4305    while (s < e) {
4306        Py_UCS4 ch = (unsigned char)*s;
4307
4308        if (ch < 0x80) {
4309            /* Fast path for runs of ASCII characters. Given that common UTF-8
4310               input will consist of an overwhelming majority of ASCII
4311               characters, we try to optimize for this case by checking
4312               as many characters as a C 'long' can contain.
4313               First, check if we can do an aligned read, as most CPUs have
4314               a penalty for unaligned reads.
4315            */
4316            if (!((size_t) s & LONG_PTR_MASK)) {
4317                /* Help register allocation */
4318                register const char *_s = s;
4319                register Py_ssize_t _i = i;
4320                while (_s < aligned_end) {
4321                    /* Read a whole long at a time (either 4 or 8 bytes),
4322                       and do a fast unrolled copy if it only contains ASCII
4323                       characters. */
4324                    unsigned long value = *(unsigned long *) _s;
4325                    if (value & ASCII_CHAR_MASK)
4326                        break;
4327                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4328                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4329                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4330                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
4331#if (SIZEOF_LONG == 8)
4332                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4333                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4334                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4335                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
4336#endif
4337                    _s += SIZEOF_LONG;
4338                    _i += SIZEOF_LONG;
4339                }
4340                s = _s;
4341                i = _i;
4342                if (s == e)
4343                    break;
4344                ch = (unsigned char)*s;
4345            }
4346        }
4347
4348        if (ch < 0x80) {
4349            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4350            s++;
4351            continue;
4352        }
4353
4354        n = utf8_code_length[ch];
4355
4356        if (s + n > e) {
4357            if (consumed)
4358                break;
4359            else {
4360                errmsg = "unexpected end of data";
4361                startinpos = s-starts;
4362                endinpos = startinpos+1;
4363                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4364                    endinpos++;
4365                goto utf8Error;
4366            }
4367        }
4368
4369        switch (n) {
4370
4371        case 0:
4372            errmsg = "invalid start byte";
4373            startinpos = s-starts;
4374            endinpos = startinpos+1;
4375            goto utf8Error;
4376
4377        case 1:
4378            errmsg = "internal error";
4379            startinpos = s-starts;
4380            endinpos = startinpos+1;
4381            goto utf8Error;
4382
4383        case 2:
4384            if ((s[1] & 0xc0) != 0x80) {
4385                errmsg = "invalid continuation byte";
4386                startinpos = s-starts;
4387                endinpos = startinpos + 1;
4388                goto utf8Error;
4389            }
4390            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4391            assert ((ch > 0x007F) && (ch <= 0x07FF));
4392            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4393            break;
4394
4395        case 3:
4396            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4397               will result in surrogates in range d800-dfff. Surrogates are
4398               not valid UTF-8 so they are rejected.
4399               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4400               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4401            if ((s[1] & 0xc0) != 0x80 ||
4402                (s[2] & 0xc0) != 0x80 ||
4403                ((unsigned char)s[0] == 0xE0 &&
4404                 (unsigned char)s[1] < 0xA0) ||
4405                ((unsigned char)s[0] == 0xED &&
4406                 (unsigned char)s[1] > 0x9F)) {
4407                errmsg = "invalid continuation byte";
4408                startinpos = s-starts;
4409                endinpos = startinpos + 1;
4410
4411                /* if s[1] first two bits are 1 and 0, then the invalid
4412                   continuation byte is s[2], so increment endinpos by 1,
4413                   if not, s[1] is invalid and endinpos doesn't need to
4414                   be incremented. */
4415                if ((s[1] & 0xC0) == 0x80)
4416                    endinpos++;
4417                goto utf8Error;
4418            }
4419            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4420            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4421            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4422            break;
4423
4424        case 4:
4425            if ((s[1] & 0xc0) != 0x80 ||
4426                (s[2] & 0xc0) != 0x80 ||
4427                (s[3] & 0xc0) != 0x80 ||
4428                ((unsigned char)s[0] == 0xF0 &&
4429                 (unsigned char)s[1] < 0x90) ||
4430                ((unsigned char)s[0] == 0xF4 &&
4431                 (unsigned char)s[1] > 0x8F)) {
4432                errmsg = "invalid continuation byte";
4433                startinpos = s-starts;
4434                endinpos = startinpos + 1;
4435                if ((s[1] & 0xC0) == 0x80) {
4436                    endinpos++;
4437                    if ((s[2] & 0xC0) == 0x80)
4438                        endinpos++;
4439                }
4440                goto utf8Error;
4441            }
4442            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4443                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4444            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4445
4446            /* If the string is flexible or we have native UCS-4, write
4447               directly.. */
4448            if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4449                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4450
4451            else {
4452                /* compute and append the two surrogates: */
4453
4454                /* translate from 10000..10FFFF to 0..FFFF */
4455                ch -= 0x10000;
4456
4457                /* high surrogate = top 10 bits added to D800 */
4458                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4459                                       (Py_UNICODE)(0xD800 + (ch >> 10)));
4460
4461                /* low surrogate = bottom 10 bits added to DC00 */
4462                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4463                                       (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4464            }
4465#if SIZEOF_WCHAR_T == 2
4466            wchar_offset++;
4467#endif
4468            break;
4469        }
4470        s += n;
4471        continue;
4472
4473      utf8Error:
4474        /* If this is not yet a resizable string, make it one.. */
4475        if (kind != PyUnicode_WCHAR_KIND) {
4476            const Py_UNICODE *u;
4477            PyObject *new_unicode = (PyObject*)_PyUnicode_New(size);
4478            if (!new_unicode)
4479                goto onError;
4480            u = PyUnicode_AsUnicode(unicode);
4481            if (!u)
4482                goto onError;
4483#if SIZEOF_WCHAR_T == 2
4484            i += wchar_offset;
4485#endif
4486            Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4487            Py_DECREF(unicode);
4488            unicode = new_unicode;
4489            kind = 0;
4490            data = PyUnicode_AS_UNICODE(new_unicode);
4491            assert(data != NULL);
4492        }
4493        error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
4494        if (unicode_decode_call_errorhandler(
4495                errors, &errorHandler,
4496                "utf8", errmsg,
4497                &starts, &e, &startinpos, &endinpos, &exc, &s,
4498                &unicode, &i, &error_outptr))
4499            goto onError;
4500        /* Update data because unicode_decode_call_errorhandler might have
4501           re-created or resized the unicode object. */
4502        data = PyUnicode_AS_UNICODE(unicode);
4503        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4504    }
4505    /* Ensure the unicode_size calculation above was correct: */
4506    assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4507
4508    if (consumed)
4509        *consumed = s-starts;
4510
4511    /* Adjust length and ready string when it contained errors and
4512       is of the old resizable kind. */
4513    if (kind == PyUnicode_WCHAR_KIND) {
4514        if (PyUnicode_Resize(&unicode, i) < 0)
4515            goto onError;
4516    }
4517
4518    Py_XDECREF(errorHandler);
4519    Py_XDECREF(exc);
4520#ifndef DONT_MAKE_RESULT_READY
4521    if (_PyUnicode_READY_REPLACE(&unicode)) {
4522        Py_DECREF(unicode);
4523        return NULL;
4524    }
4525#endif
4526    assert(_PyUnicode_CheckConsistency(unicode, 1));
4527    return unicode;
4528
4529  onError:
4530    Py_XDECREF(errorHandler);
4531    Py_XDECREF(exc);
4532    Py_DECREF(unicode);
4533    return NULL;
4534}
4535
4536#undef WRITE_FLEXIBLE_OR_WSTR
4537
4538#ifdef __APPLE__
4539
4540/* Simplified UTF-8 decoder using surrogateescape error handler,
4541   used to decode the command line arguments on Mac OS X. */
4542
4543wchar_t*
4544_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4545{
4546    int n;
4547    const char *e;
4548    wchar_t *unicode, *p;
4549
4550    /* Note: size will always be longer than the resulting Unicode
4551       character count */
4552    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4553        PyErr_NoMemory();
4554        return NULL;
4555    }
4556    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4557    if (!unicode)
4558        return NULL;
4559
4560    /* Unpack UTF-8 encoded data */
4561    p = unicode;
4562    e = s + size;
4563    while (s < e) {
4564        Py_UCS4 ch = (unsigned char)*s;
4565
4566        if (ch < 0x80) {
4567            *p++ = (wchar_t)ch;
4568            s++;
4569            continue;
4570        }
4571
4572        n = utf8_code_length[ch];
4573        if (s + n > e) {
4574            goto surrogateescape;
4575        }
4576
4577        switch (n) {
4578        case 0:
4579        case 1:
4580            goto surrogateescape;
4581
4582        case 2:
4583            if ((s[1] & 0xc0) != 0x80)
4584                goto surrogateescape;
4585            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4586            assert ((ch > 0x007F) && (ch <= 0x07FF));
4587            *p++ = (wchar_t)ch;
4588            break;
4589
4590        case 3:
4591            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4592               will result in surrogates in range d800-dfff. Surrogates are
4593               not valid UTF-8 so they are rejected.
4594               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4595               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4596            if ((s[1] & 0xc0) != 0x80 ||
4597                (s[2] & 0xc0) != 0x80 ||
4598                ((unsigned char)s[0] == 0xE0 &&
4599                 (unsigned char)s[1] < 0xA0) ||
4600                ((unsigned char)s[0] == 0xED &&
4601                 (unsigned char)s[1] > 0x9F)) {
4602
4603                goto surrogateescape;
4604            }
4605            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4606            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4607            *p++ = (wchar_t)ch;
4608            break;
4609
4610        case 4:
4611            if ((s[1] & 0xc0) != 0x80 ||
4612                (s[2] & 0xc0) != 0x80 ||
4613                (s[3] & 0xc0) != 0x80 ||
4614                ((unsigned char)s[0] == 0xF0 &&
4615                 (unsigned char)s[1] < 0x90) ||
4616                ((unsigned char)s[0] == 0xF4 &&
4617                 (unsigned char)s[1] > 0x8F)) {
4618                goto surrogateescape;
4619            }
4620            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4621                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4622            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4623
4624#if SIZEOF_WCHAR_T == 4
4625            *p++ = (wchar_t)ch;
4626#else
4627            /*  compute and append the two surrogates: */
4628
4629            /*  translate from 10000..10FFFF to 0..FFFF */
4630            ch -= 0x10000;
4631
4632            /*  high surrogate = top 10 bits added to D800 */
4633            *p++ = (wchar_t)(0xD800 + (ch >> 10));
4634
4635            /*  low surrogate = bottom 10 bits added to DC00 */
4636            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4637#endif
4638            break;
4639        }
4640        s += n;
4641        continue;
4642
4643      surrogateescape:
4644        *p++ = 0xDC00 + ch;
4645        s++;
4646    }
4647    *p = L'\0';
4648    return unicode;
4649}
4650
4651#endif /* __APPLE__ */
4652
4653/* Primary internal function which creates utf8 encoded bytes objects.
4654
4655   Allocation strategy:  if the string is short, convert into a stack buffer
4656   and allocate exactly as much space needed at the end.  Else allocate the
4657   maximum possible needed (4 result bytes per Unicode character), and return
4658   the excess memory at the end.
4659*/
4660PyObject *
4661_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4662{
4663#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
4664
4665    Py_ssize_t i;                /* index into s of next input byte */
4666    PyObject *result;            /* result string object */
4667    char *p;                     /* next free byte in output buffer */
4668    Py_ssize_t nallocated;      /* number of result bytes allocated */
4669    Py_ssize_t nneeded;            /* number of result bytes needed */
4670    char stackbuf[MAX_SHORT_UNICHARS * 4];
4671    PyObject *errorHandler = NULL;
4672    PyObject *exc = NULL;
4673    int kind;
4674    void *data;
4675    Py_ssize_t size;
4676
4677    if (!PyUnicode_Check(unicode)) {
4678        PyErr_BadArgument();
4679        return NULL;
4680    }
4681
4682    if (PyUnicode_READY(unicode) == -1)
4683        return NULL;
4684
4685    if (PyUnicode_UTF8(unicode))
4686        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4687                                         PyUnicode_UTF8_LENGTH(unicode));
4688
4689    kind = PyUnicode_KIND(unicode);
4690    data = PyUnicode_DATA(unicode);
4691    size = PyUnicode_GET_LENGTH(unicode);
4692
4693    assert(size >= 0);
4694
4695    if (size <= MAX_SHORT_UNICHARS) {
4696        /* Write into the stack buffer; nallocated can't overflow.
4697         * At the end, we'll allocate exactly as much heap space as it
4698         * turns out we need.
4699         */
4700        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4701        result = NULL;   /* will allocate after we're done */
4702        p = stackbuf;
4703    }
4704    else {
4705        /* Overallocate on the heap, and give the excess back at the end. */
4706        nallocated = size * 4;
4707        if (nallocated / 4 != size)  /* overflow! */
4708            return PyErr_NoMemory();
4709        result = PyBytes_FromStringAndSize(NULL, nallocated);
4710        if (result == NULL)
4711            return NULL;
4712        p = PyBytes_AS_STRING(result);
4713    }
4714
4715    for (i = 0; i < size;) {
4716        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4717
4718        if (ch < 0x80)
4719            /* Encode ASCII */
4720            *p++ = (char) ch;
4721
4722        else if (ch < 0x0800) {
4723            /* Encode Latin-1 */
4724            *p++ = (char)(0xc0 | (ch >> 6));
4725            *p++ = (char)(0x80 | (ch & 0x3f));
4726        } else if (0xD800 <= ch && ch <= 0xDFFF) {
4727            Py_ssize_t newpos;
4728            PyObject *rep;
4729            Py_ssize_t repsize, k, startpos;
4730            startpos = i-1;
4731            rep = unicode_encode_call_errorhandler(
4732                  errors, &errorHandler, "utf-8", "surrogates not allowed",
4733                  unicode, &exc, startpos, startpos+1, &newpos);
4734            if (!rep)
4735                goto error;
4736
4737            if (PyBytes_Check(rep))
4738                repsize = PyBytes_GET_SIZE(rep);
4739            else
4740                repsize = PyUnicode_GET_SIZE(rep);
4741
4742            if (repsize > 4) {
4743                Py_ssize_t offset;
4744
4745                if (result == NULL)
4746                    offset = p - stackbuf;
4747                else
4748                    offset = p - PyBytes_AS_STRING(result);
4749
4750                if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4751                    /* integer overflow */
4752                    PyErr_NoMemory();
4753                    goto error;
4754                }
4755                nallocated += repsize - 4;
4756                if (result != NULL) {
4757                    if (_PyBytes_Resize(&result, nallocated) < 0)
4758                        goto error;
4759                } else {
4760                    result = PyBytes_FromStringAndSize(NULL, nallocated);
4761                    if (result == NULL)
4762                        goto error;
4763                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4764                }
4765                p = PyBytes_AS_STRING(result) + offset;
4766            }
4767
4768            if (PyBytes_Check(rep)) {
4769                char *prep = PyBytes_AS_STRING(rep);
4770                for(k = repsize; k > 0; k--)
4771                    *p++ = *prep++;
4772            } else /* rep is unicode */ {
4773                const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4774                Py_UNICODE c;
4775
4776                for(k=0; k<repsize; k++) {
4777                    c = prep[k];
4778                    if (0x80 <= c) {
4779                        raise_encode_exception(&exc, "utf-8",
4780                                               unicode,
4781                                               i-1, i,
4782                                               "surrogates not allowed");
4783                        goto error;
4784                    }
4785                    *p++ = (char)prep[k];
4786                }
4787            }
4788            Py_DECREF(rep);
4789        } else if (ch < 0x10000) {
4790            *p++ = (char)(0xe0 | (ch >> 12));
4791            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4792            *p++ = (char)(0x80 | (ch & 0x3f));
4793        } else /* ch >= 0x10000 */ {
4794            /* Encode UCS4 Unicode ordinals */
4795            *p++ = (char)(0xf0 | (ch >> 18));
4796            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4797            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4798            *p++ = (char)(0x80 | (ch & 0x3f));
4799        }
4800    }
4801
4802    if (result == NULL) {
4803        /* This was stack allocated. */
4804        nneeded = p - stackbuf;
4805        assert(nneeded <= nallocated);
4806        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4807    }
4808    else {
4809        /* Cut back to size actually needed. */
4810        nneeded = p - PyBytes_AS_STRING(result);
4811        assert(nneeded <= nallocated);
4812        _PyBytes_Resize(&result, nneeded);
4813    }
4814
4815    Py_XDECREF(errorHandler);
4816    Py_XDECREF(exc);
4817    return result;
4818 error:
4819    Py_XDECREF(errorHandler);
4820    Py_XDECREF(exc);
4821    Py_XDECREF(result);
4822    return NULL;
4823
4824#undef MAX_SHORT_UNICHARS
4825}
4826
4827PyObject *
4828PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4829                     Py_ssize_t size,
4830                     const char *errors)
4831{
4832    PyObject *v, *unicode;
4833
4834    unicode = PyUnicode_FromUnicode(s, size);
4835    if (unicode == NULL)
4836        return NULL;
4837    v = _PyUnicode_AsUTF8String(unicode, errors);
4838    Py_DECREF(unicode);
4839    return v;
4840}
4841
4842PyObject *
4843PyUnicode_AsUTF8String(PyObject *unicode)
4844{
4845    return _PyUnicode_AsUTF8String(unicode, NULL);
4846}
4847
4848/* --- UTF-32 Codec ------------------------------------------------------- */
4849
4850PyObject *
4851PyUnicode_DecodeUTF32(const char *s,
4852                      Py_ssize_t size,
4853                      const char *errors,
4854                      int *byteorder)
4855{
4856    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4857}
4858
4859PyObject *
4860PyUnicode_DecodeUTF32Stateful(const char *s,
4861                              Py_ssize_t size,
4862                              const char *errors,
4863                              int *byteorder,
4864                              Py_ssize_t *consumed)
4865{
4866    const char *starts = s;
4867    Py_ssize_t startinpos;
4868    Py_ssize_t endinpos;
4869    Py_ssize_t outpos;
4870    PyObject *unicode;
4871    Py_UNICODE *p;
4872#ifndef Py_UNICODE_WIDE
4873    int pairs = 0;
4874    const unsigned char *qq;
4875#else
4876    const int pairs = 0;
4877#endif
4878    const unsigned char *q, *e;
4879    int bo = 0;       /* assume native ordering by default */
4880    const char *errmsg = "";
4881    /* Offsets from q for retrieving bytes in the right order. */
4882#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4883    int iorder[] = {0, 1, 2, 3};
4884#else
4885    int iorder[] = {3, 2, 1, 0};
4886#endif
4887    PyObject *errorHandler = NULL;
4888    PyObject *exc = NULL;
4889
4890    q = (unsigned char *)s;
4891    e = q + size;
4892
4893    if (byteorder)
4894        bo = *byteorder;
4895
4896    /* Check for BOM marks (U+FEFF) in the input and adjust current
4897       byte order setting accordingly. In native mode, the leading BOM
4898       mark is skipped, in all other modes, it is copied to the output
4899       stream as-is (giving a ZWNBSP character). */
4900    if (bo == 0) {
4901        if (size >= 4) {
4902            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4903                (q[iorder[1]] << 8) | q[iorder[0]];
4904#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4905            if (bom == 0x0000FEFF) {
4906                q += 4;
4907                bo = -1;
4908            }
4909            else if (bom == 0xFFFE0000) {
4910                q += 4;
4911                bo = 1;
4912            }
4913#else
4914            if (bom == 0x0000FEFF) {
4915                q += 4;
4916                bo = 1;
4917            }
4918            else if (bom == 0xFFFE0000) {
4919                q += 4;
4920                bo = -1;
4921            }
4922#endif
4923        }
4924    }
4925
4926    if (bo == -1) {
4927        /* force LE */
4928        iorder[0] = 0;
4929        iorder[1] = 1;
4930        iorder[2] = 2;
4931        iorder[3] = 3;
4932    }
4933    else if (bo == 1) {
4934        /* force BE */
4935        iorder[0] = 3;
4936        iorder[1] = 2;
4937        iorder[2] = 1;
4938        iorder[3] = 0;
4939    }
4940
4941    /* On narrow builds we split characters outside the BMP into two
4942       codepoints => count how much extra space we need. */
4943#ifndef Py_UNICODE_WIDE
4944    for (qq = q; qq < e; qq += 4)
4945        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4946            pairs++;
4947#endif
4948
4949    /* This might be one to much, because of a BOM */
4950    unicode = (PyObject*)_PyUnicode_New((size+3)/4+pairs);
4951    if (!unicode)
4952        return NULL;
4953    if (size == 0)
4954        return unicode;
4955
4956    /* Unpack UTF-32 encoded data */
4957    p = PyUnicode_AS_UNICODE(unicode);
4958
4959    while (q < e) {
4960        Py_UCS4 ch;
4961        /* remaining bytes at the end? (size should be divisible by 4) */
4962        if (e-q<4) {
4963            if (consumed)
4964                break;
4965            errmsg = "truncated data";
4966            startinpos = ((const char *)q)-starts;
4967            endinpos = ((const char *)e)-starts;
4968            goto utf32Error;
4969            /* The remaining input chars are ignored if the callback
4970               chooses to skip the input */
4971        }
4972        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4973            (q[iorder[1]] << 8) | q[iorder[0]];
4974
4975        if (ch >= 0x110000)
4976        {
4977            errmsg = "codepoint not in range(0x110000)";
4978            startinpos = ((const char *)q)-starts;
4979            endinpos = startinpos+4;
4980            goto utf32Error;
4981        }
4982#ifndef Py_UNICODE_WIDE
4983        if (ch >= 0x10000)
4984        {
4985            *p++ = 0xD800 | ((ch-0x10000) >> 10);
4986            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4987        }
4988        else
4989#endif
4990            *p++ = ch;
4991        q += 4;
4992        continue;
4993      utf32Error:
4994        outpos = p-PyUnicode_AS_UNICODE(unicode);
4995        if (unicode_decode_call_errorhandler(
4996                errors, &errorHandler,
4997                "utf32", errmsg,
4998                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4999                &unicode, &outpos, &p))
5000            goto onError;
5001    }
5002
5003    if (byteorder)
5004        *byteorder = bo;
5005
5006    if (consumed)
5007        *consumed = (const char *)q-starts;
5008
5009    /* Adjust length */
5010    if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
5011        goto onError;
5012
5013    Py_XDECREF(errorHandler);
5014    Py_XDECREF(exc);
5015#ifndef DONT_MAKE_RESULT_READY
5016    if (_PyUnicode_READY_REPLACE(&unicode)) {
5017        Py_DECREF(unicode);
5018        return NULL;
5019    }
5020#endif
5021    assert(_PyUnicode_CheckConsistency(unicode, 1));
5022    return unicode;
5023
5024  onError:
5025    Py_DECREF(unicode);
5026    Py_XDECREF(errorHandler);
5027    Py_XDECREF(exc);
5028    return NULL;
5029}
5030
5031PyObject *
5032PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5033                      Py_ssize_t size,
5034                      const char *errors,
5035                      int byteorder)
5036{
5037    PyObject *v;
5038    unsigned char *p;
5039    Py_ssize_t nsize, bytesize;
5040#ifndef Py_UNICODE_WIDE
5041    Py_ssize_t i, pairs;
5042#else
5043    const int pairs = 0;
5044#endif
5045    /* Offsets from p for storing byte pairs in the right order. */
5046#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5047    int iorder[] = {0, 1, 2, 3};
5048#else
5049    int iorder[] = {3, 2, 1, 0};
5050#endif
5051
5052#define STORECHAR(CH)                           \
5053    do {                                        \
5054        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5055        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5056        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5057        p[iorder[0]] = (CH) & 0xff;             \
5058        p += 4;                                 \
5059    } while(0)
5060
5061    /* In narrow builds we can output surrogate pairs as one codepoint,
5062       so we need less space. */
5063#ifndef Py_UNICODE_WIDE
5064    for (i = pairs = 0; i < size-1; i++)
5065        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5066            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5067            pairs++;
5068#endif
5069    nsize = (size - pairs + (byteorder == 0));
5070    bytesize = nsize * 4;
5071    if (bytesize / 4 != nsize)
5072        return PyErr_NoMemory();
5073    v = PyBytes_FromStringAndSize(NULL, bytesize);
5074    if (v == NULL)
5075        return NULL;
5076
5077    p = (unsigned char *)PyBytes_AS_STRING(v);
5078    if (byteorder == 0)
5079        STORECHAR(0xFEFF);
5080    if (size == 0)
5081        goto done;
5082
5083    if (byteorder == -1) {
5084        /* force LE */
5085        iorder[0] = 0;
5086        iorder[1] = 1;
5087        iorder[2] = 2;
5088        iorder[3] = 3;
5089    }
5090    else if (byteorder == 1) {
5091        /* force BE */
5092        iorder[0] = 3;
5093        iorder[1] = 2;
5094        iorder[2] = 1;
5095        iorder[3] = 0;
5096    }
5097
5098    while (size-- > 0) {
5099        Py_UCS4 ch = *s++;
5100#ifndef Py_UNICODE_WIDE
5101        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5102            Py_UCS4 ch2 = *s;
5103            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5104                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5105                s++;
5106                size--;
5107            }
5108        }
5109#endif
5110        STORECHAR(ch);
5111    }
5112
5113  done:
5114    return v;
5115#undef STORECHAR
5116}
5117
5118PyObject *
5119PyUnicode_AsUTF32String(PyObject *unicode)
5120{
5121    if (!PyUnicode_Check(unicode)) {
5122        PyErr_BadArgument();
5123        return NULL;
5124    }
5125    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
5126                                 PyUnicode_GET_SIZE(unicode),
5127                                 NULL,
5128                                 0);
5129}
5130
5131/* --- UTF-16 Codec ------------------------------------------------------- */
5132
5133PyObject *
5134PyUnicode_DecodeUTF16(const char *s,
5135                      Py_ssize_t size,
5136                      const char *errors,
5137                      int *byteorder)
5138{
5139    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5140}
5141
5142/* Two masks for fast checking of whether a C 'long' may contain
5143   UTF16-encoded surrogate characters. This is an efficient heuristic,
5144   assuming that non-surrogate characters with a code point >= 0x8000 are
5145   rare in most input.
5146   FAST_CHAR_MASK is used when the input is in native byte ordering,
5147   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
5148*/
5149#if (SIZEOF_LONG == 8)
5150# define FAST_CHAR_MASK         0x8000800080008000L
5151# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5152#elif (SIZEOF_LONG == 4)
5153# define FAST_CHAR_MASK         0x80008000L
5154# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5155#else
5156# error C 'long' size should be either 4 or 8!
5157#endif
5158
5159PyObject *
5160PyUnicode_DecodeUTF16Stateful(const char *s,
5161                              Py_ssize_t size,
5162                              const char *errors,
5163                              int *byteorder,
5164                              Py_ssize_t *consumed)
5165{
5166    const char *starts = s;
5167    Py_ssize_t startinpos;
5168    Py_ssize_t endinpos;
5169    Py_ssize_t outpos;
5170    PyObject *unicode;
5171    Py_UNICODE *p;
5172    const unsigned char *q, *e, *aligned_end;
5173    int bo = 0;       /* assume native ordering by default */
5174    int native_ordering = 0;
5175    const char *errmsg = "";
5176    /* Offsets from q for retrieving byte pairs in the right order. */
5177#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5178    int ihi = 1, ilo = 0;
5179#else
5180    int ihi = 0, ilo = 1;
5181#endif
5182    PyObject *errorHandler = NULL;
5183    PyObject *exc = NULL;
5184
5185    /* Note: size will always be longer than the resulting Unicode
5186       character count */
5187    unicode = (PyObject*)_PyUnicode_New(size);
5188    if (!unicode)
5189        return NULL;
5190    if (size == 0)
5191        return unicode;
5192
5193    /* Unpack UTF-16 encoded data */
5194    p = PyUnicode_AS_UNICODE(unicode);
5195    q = (unsigned char *)s;
5196    e = q + size - 1;
5197
5198    if (byteorder)
5199        bo = *byteorder;
5200
5201    /* Check for BOM marks (U+FEFF) in the input and adjust current
5202       byte order setting accordingly. In native mode, the leading BOM
5203       mark is skipped, in all other modes, it is copied to the output
5204       stream as-is (giving a ZWNBSP character). */
5205    if (bo == 0) {
5206        if (size >= 2) {
5207            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
5208#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5209            if (bom == 0xFEFF) {
5210                q += 2;
5211                bo = -1;
5212            }
5213            else if (bom == 0xFFFE) {
5214                q += 2;
5215                bo = 1;
5216            }
5217#else
5218            if (bom == 0xFEFF) {
5219                q += 2;
5220                bo = 1;
5221            }
5222            else if (bom == 0xFFFE) {
5223                q += 2;
5224                bo = -1;
5225            }
5226#endif
5227        }
5228    }
5229
5230    if (bo == -1) {
5231        /* force LE */
5232        ihi = 1;
5233        ilo = 0;
5234    }
5235    else if (bo == 1) {
5236        /* force BE */
5237        ihi = 0;
5238        ilo = 1;
5239    }
5240#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5241    native_ordering = ilo < ihi;
5242#else
5243    native_ordering = ilo > ihi;
5244#endif
5245
5246    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
5247    while (q < e) {
5248        Py_UNICODE ch;
5249        /* First check for possible aligned read of a C 'long'. Unaligned
5250           reads are more expensive, better to defer to another iteration. */
5251        if (!((size_t) q & LONG_PTR_MASK)) {
5252            /* Fast path for runs of non-surrogate chars. */
5253            register const unsigned char *_q = q;
5254            Py_UNICODE *_p = p;
5255            if (native_ordering) {
5256                /* Native ordering is simple: as long as the input cannot
5257                   possibly contain a surrogate char, do an unrolled copy
5258                   of several 16-bit code points to the target object.
5259                   The non-surrogate check is done on several input bytes
5260                   at a time (as many as a C 'long' can contain). */
5261                while (_q < aligned_end) {
5262                    unsigned long data = * (unsigned long *) _q;
5263                    if (data & FAST_CHAR_MASK)
5264                        break;
5265                    _p[0] = ((unsigned short *) _q)[0];
5266                    _p[1] = ((unsigned short *) _q)[1];
5267#if (SIZEOF_LONG == 8)
5268                    _p[2] = ((unsigned short *) _q)[2];
5269                    _p[3] = ((unsigned short *) _q)[3];
5270#endif
5271                    _q += SIZEOF_LONG;
5272                    _p += SIZEOF_LONG / 2;
5273                }
5274            }
5275            else {
5276                /* Byteswapped ordering is similar, but we must decompose
5277                   the copy bytewise, and take care of zero'ing out the
5278                   upper bytes if the target object is in 32-bit units
5279                   (that is, in UCS-4 builds). */
5280                while (_q < aligned_end) {
5281                    unsigned long data = * (unsigned long *) _q;
5282                    if (data & SWAPPED_FAST_CHAR_MASK)
5283                        break;
5284                    /* Zero upper bytes in UCS-4 builds */
5285#if (Py_UNICODE_SIZE > 2)
5286                    _p[0] = 0;
5287                    _p[1] = 0;
5288#if (SIZEOF_LONG == 8)
5289                    _p[2] = 0;
5290                    _p[3] = 0;
5291#endif
5292#endif
5293                    /* Issue #4916; UCS-4 builds on big endian machines must
5294                       fill the two last bytes of each 4-byte unit. */
5295#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5296# define OFF 2
5297#else
5298# define OFF 0
5299#endif
5300                    ((unsigned char *) _p)[OFF + 1] = _q[0];
5301                    ((unsigned char *) _p)[OFF + 0] = _q[1];
5302                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5303                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5304#if (SIZEOF_LONG == 8)
5305                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5306                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5307                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5308                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5309#endif
5310#undef OFF
5311                    _q += SIZEOF_LONG;
5312                    _p += SIZEOF_LONG / 2;
5313                }
5314            }
5315            p = _p;
5316            q = _q;
5317            if (q >= e)
5318                break;
5319        }
5320        ch = (q[ihi] << 8) | q[ilo];
5321
5322        q += 2;
5323
5324        if (ch < 0xD800 || ch > 0xDFFF) {
5325            *p++ = ch;
5326            continue;
5327        }
5328
5329        /* UTF-16 code pair: */
5330        if (q > e) {
5331            errmsg = "unexpected end of data";
5332            startinpos = (((const char *)q) - 2) - starts;
5333            endinpos = ((const char *)e) + 1 - starts;
5334            goto utf16Error;
5335        }
5336        if (0xD800 <= ch && ch <= 0xDBFF) {
5337            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5338            q += 2;
5339            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5340#ifndef Py_UNICODE_WIDE
5341                *p++ = ch;
5342                *p++ = ch2;
5343#else
5344                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5345#endif
5346                continue;
5347            }
5348            else {
5349                errmsg = "illegal UTF-16 surrogate";
5350                startinpos = (((const char *)q)-4)-starts;
5351                endinpos = startinpos+2;
5352                goto utf16Error;
5353            }
5354
5355        }
5356        errmsg = "illegal encoding";
5357        startinpos = (((const char *)q)-2)-starts;
5358        endinpos = startinpos+2;
5359        /* Fall through to report the error */
5360
5361      utf16Error:
5362        outpos = p - PyUnicode_AS_UNICODE(unicode);
5363        if (unicode_decode_call_errorhandler(
5364                errors,
5365                &errorHandler,
5366                "utf16", errmsg,
5367                &starts,
5368                (const char **)&e,
5369                &startinpos,
5370                &endinpos,
5371                &exc,
5372                (const char **)&q,
5373                &unicode,
5374                &outpos,
5375                &p))
5376            goto onError;
5377    }
5378    /* remaining byte at the end? (size should be even) */
5379    if (e == q) {
5380        if (!consumed) {
5381            errmsg = "truncated data";
5382            startinpos = ((const char *)q) - starts;
5383            endinpos = ((const char *)e) + 1 - starts;
5384            outpos = p - PyUnicode_AS_UNICODE(unicode);
5385            if (unicode_decode_call_errorhandler(
5386                    errors,
5387                    &errorHandler,
5388                    "utf16", errmsg,
5389                    &starts,
5390                    (const char **)&e,
5391                    &startinpos,
5392                    &endinpos,
5393                    &exc,
5394                    (const char **)&q,
5395                    &unicode,
5396                    &outpos,
5397                    &p))
5398                goto onError;
5399            /* The remaining input chars are ignored if the callback
5400               chooses to skip the input */
5401        }
5402    }
5403
5404    if (byteorder)
5405        *byteorder = bo;
5406
5407    if (consumed)
5408        *consumed = (const char *)q-starts;
5409
5410    /* Adjust length */
5411    if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
5412        goto onError;
5413
5414    Py_XDECREF(errorHandler);
5415    Py_XDECREF(exc);
5416#ifndef DONT_MAKE_RESULT_READY
5417    if (_PyUnicode_READY_REPLACE(&unicode)) {
5418        Py_DECREF(unicode);
5419        return NULL;
5420    }
5421#endif
5422    assert(_PyUnicode_CheckConsistency(unicode, 1));
5423    return unicode;
5424
5425  onError:
5426    Py_DECREF(unicode);
5427    Py_XDECREF(errorHandler);
5428    Py_XDECREF(exc);
5429    return NULL;
5430}
5431
5432#undef FAST_CHAR_MASK
5433#undef SWAPPED_FAST_CHAR_MASK
5434
5435PyObject *
5436PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5437                      Py_ssize_t size,
5438                      const char *errors,
5439                      int byteorder)
5440{
5441    PyObject *v;
5442    unsigned char *p;
5443    Py_ssize_t nsize, bytesize;
5444#ifdef Py_UNICODE_WIDE
5445    Py_ssize_t i, pairs;
5446#else
5447    const int pairs = 0;
5448#endif
5449    /* Offsets from p for storing byte pairs in the right order. */
5450#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5451    int ihi = 1, ilo = 0;
5452#else
5453    int ihi = 0, ilo = 1;
5454#endif
5455
5456#define STORECHAR(CH)                           \
5457    do {                                        \
5458        p[ihi] = ((CH) >> 8) & 0xff;            \
5459        p[ilo] = (CH) & 0xff;                   \
5460        p += 2;                                 \
5461    } while(0)
5462
5463#ifdef Py_UNICODE_WIDE
5464    for (i = pairs = 0; i < size; i++)
5465        if (s[i] >= 0x10000)
5466            pairs++;
5467#endif
5468    /* 2 * (size + pairs + (byteorder == 0)) */
5469    if (size > PY_SSIZE_T_MAX ||
5470        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5471        return PyErr_NoMemory();
5472    nsize = size + pairs + (byteorder == 0);
5473    bytesize = nsize * 2;
5474    if (bytesize / 2 != nsize)
5475        return PyErr_NoMemory();
5476    v = PyBytes_FromStringAndSize(NULL, bytesize);
5477    if (v == NULL)
5478        return NULL;
5479
5480    p = (unsigned char *)PyBytes_AS_STRING(v);
5481    if (byteorder == 0)
5482        STORECHAR(0xFEFF);
5483    if (size == 0)
5484        goto done;
5485
5486    if (byteorder == -1) {
5487        /* force LE */
5488        ihi = 1;
5489        ilo = 0;
5490    }
5491    else if (byteorder == 1) {
5492        /* force BE */
5493        ihi = 0;
5494        ilo = 1;
5495    }
5496
5497    while (size-- > 0) {
5498        Py_UNICODE ch = *s++;
5499        Py_UNICODE ch2 = 0;
5500#ifdef Py_UNICODE_WIDE
5501        if (ch >= 0x10000) {
5502            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5503            ch  = 0xD800 | ((ch-0x10000) >> 10);
5504        }
5505#endif
5506        STORECHAR(ch);
5507        if (ch2)
5508            STORECHAR(ch2);
5509    }
5510
5511  done:
5512    return v;
5513#undef STORECHAR
5514}
5515
5516PyObject *
5517PyUnicode_AsUTF16String(PyObject *unicode)
5518{
5519    if (!PyUnicode_Check(unicode)) {
5520        PyErr_BadArgument();
5521        return NULL;
5522    }
5523    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
5524                                 PyUnicode_GET_SIZE(unicode),
5525                                 NULL,
5526                                 0);
5527}
5528
5529/* --- Unicode Escape Codec ----------------------------------------------- */
5530
5531/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5532   if all the escapes in the string make it still a valid ASCII string.
5533   Returns -1 if any escapes were found which cause the string to
5534   pop out of ASCII range.  Otherwise returns the length of the
5535   required buffer to hold the string.
5536   */
5537static Py_ssize_t
5538length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5539{
5540    const unsigned char *p = (const unsigned char *)s;
5541    const unsigned char *end = p + size;
5542    Py_ssize_t length = 0;
5543
5544    if (size < 0)
5545        return -1;
5546
5547    for (; p < end; ++p) {
5548        if (*p > 127) {
5549            /* Non-ASCII */
5550            return -1;
5551        }
5552        else if (*p != '\\') {
5553            /* Normal character */
5554            ++length;
5555        }
5556        else {
5557            /* Backslash-escape, check next char */
5558            ++p;
5559            /* Escape sequence reaches till end of string or
5560               non-ASCII follow-up. */
5561            if (p >= end || *p > 127)
5562                return -1;
5563            switch (*p) {
5564            case '\n':
5565                /* backslash + \n result in zero characters */
5566                break;
5567            case '\\': case '\'': case '\"':
5568            case 'b': case 'f': case 't':
5569            case 'n': case 'r': case 'v': case 'a':
5570                ++length;
5571                break;
5572            case '0': case '1': case '2': case '3':
5573            case '4': case '5': case '6': case '7':
5574            case 'x': case 'u': case 'U': case 'N':
5575                /* these do not guarantee ASCII characters */
5576                return -1;
5577            default:
5578                /* count the backslash + the other character */
5579                length += 2;
5580            }
5581        }
5582    }
5583    return length;
5584}
5585
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: when kind is PyUnicode_WCHAR_KIND the value
   is stored as a Py_UNICODE element of buf, otherwise as a single
   unsigned char.  index is evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store value as a Py_UNICODE element of buf.  Relies on a variable named
   `kind` in the calling scope and asserts that it is PyUnicode_WCHAR_KIND.
   The expansion is fully parenthesized so that the macro behaves as a
   single expression wherever it is used. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5599
5600
5601static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5602
5603PyObject *
5604PyUnicode_DecodeUnicodeEscape(const char *s,
5605                              Py_ssize_t size,
5606                              const char *errors)
5607{
5608    const char *starts = s;
5609    Py_ssize_t startinpos;
5610    Py_ssize_t endinpos;
5611    int j;
5612    PyObject *v;
5613    Py_UNICODE *p;
5614    const char *end;
5615    char* message;
5616    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5617    PyObject *errorHandler = NULL;
5618    PyObject *exc = NULL;
5619    Py_ssize_t ascii_length;
5620    Py_ssize_t i;
5621    int kind;
5622    void *data;
5623
5624    ascii_length = length_of_escaped_ascii_string(s, size);
5625
5626    /* After length_of_escaped_ascii_string() there are two alternatives,
5627       either the string is pure ASCII with named escapes like \n, etc.
5628       and we determined it's exact size (common case)
5629       or it contains \x, \u, ... escape sequences.  then we create a
5630       legacy wchar string and resize it at the end of this function. */
5631    if (ascii_length >= 0) {
5632        v = PyUnicode_New(ascii_length, 127);
5633        if (!v)
5634            goto onError;
5635        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5636        kind = PyUnicode_1BYTE_KIND;
5637        data = PyUnicode_DATA(v);
5638    }
5639    else {
5640        /* Escaped strings will always be longer than the resulting
5641           Unicode string, so we start with size here and then reduce the
5642           length after conversion to the true value.
5643           (but if the error callback returns a long replacement string
5644           we'll have to allocate more space) */
5645        v = (PyObject*)_PyUnicode_New(size);
5646        if (!v)
5647            goto onError;
5648        kind = PyUnicode_WCHAR_KIND;
5649        data = PyUnicode_AS_UNICODE(v);
5650    }
5651
5652    if (size == 0)
5653        return v;
5654    i = 0;
5655    end = s + size;
5656
5657    while (s < end) {
5658        unsigned char c;
5659        Py_UNICODE x;
5660        int digits;
5661
5662        if (kind == PyUnicode_WCHAR_KIND) {
5663            assert(i < _PyUnicode_WSTR_LENGTH(v));
5664        }
5665        else {
5666            /* The only case in which i == ascii_length is a backslash
5667               followed by a newline. */
5668            assert(i <= ascii_length);
5669        }
5670
5671        /* Non-escape characters are interpreted as Unicode ordinals */
5672        if (*s != '\\') {
5673            WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
5674            continue;
5675        }
5676
5677        startinpos = s-starts;
5678        /* \ - Escapes */
5679        s++;
5680        c = *s++;
5681        if (s > end)
5682            c = '\0'; /* Invalid after \ */
5683
5684        if (kind == PyUnicode_WCHAR_KIND) {
5685            assert(i < _PyUnicode_WSTR_LENGTH(v));
5686        }
5687        else {
5688            /* The only case in which i == ascii_length is a backslash
5689               followed by a newline. */
5690            assert(i < ascii_length || (i == ascii_length && c == '\n'));
5691        }
5692
5693        switch (c) {
5694
5695            /* \x escapes */
5696        case '\n': break;
5697        case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5698        case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5699        case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5700        case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5701        /* FF */
5702        case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5703        case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5704        case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5705        case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5706        /* VT */
5707        case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5708        /* BEL, not classic C */
5709        case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
5710
5711            /* \OOO (octal) escapes */
5712        case '0': case '1': case '2': case '3':
5713        case '4': case '5': case '6': case '7':
5714            x = s[-1] - '0';
5715            if (s < end && '0' <= *s && *s <= '7') {
5716                x = (x<<3) + *s++ - '0';
5717                if (s < end && '0' <= *s && *s <= '7')
5718                    x = (x<<3) + *s++ - '0';
5719            }
5720            WRITE_WSTR(data, i++, x);
5721            break;
5722
5723            /* hex escapes */
5724            /* \xXX */
5725        case 'x':
5726            digits = 2;
5727            message = "truncated \\xXX escape";
5728            goto hexescape;
5729
5730            /* \uXXXX */
5731        case 'u':
5732            digits = 4;
5733            message = "truncated \\uXXXX escape";
5734            goto hexescape;
5735
5736            /* \UXXXXXXXX */
5737        case 'U':
5738            digits = 8;
5739            message = "truncated \\UXXXXXXXX escape";
5740        hexescape:
5741            chr = 0;
5742            p = PyUnicode_AS_UNICODE(v) + i;
5743            if (s+digits>end) {
5744                endinpos = size;
5745                if (unicode_decode_call_errorhandler(
5746                        errors, &errorHandler,
5747                        "unicodeescape", "end of string in escape sequence",
5748                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5749                        &v, &i, &p))
5750                    goto onError;
5751                data = PyUnicode_AS_UNICODE(v);
5752                goto nextByte;
5753            }
5754            for (j = 0; j < digits; ++j) {
5755                c = (unsigned char) s[j];
5756                if (!Py_ISXDIGIT(c)) {
5757                    endinpos = (s+j+1)-starts;
5758                    p = PyUnicode_AS_UNICODE(v) + i;
5759                    if (unicode_decode_call_errorhandler(
5760                            errors, &errorHandler,
5761                            "unicodeescape", message,
5762                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5763                            &v, &i, &p))
5764                        goto onError;
5765                    data = PyUnicode_AS_UNICODE(v);
5766                    goto nextByte;
5767                }
5768                chr = (chr<<4) & ~0xF;
5769                if (c >= '0' && c <= '9')
5770                    chr += c - '0';
5771                else if (c >= 'a' && c <= 'f')
5772                    chr += 10 + c - 'a';
5773                else
5774                    chr += 10 + c - 'A';
5775            }
5776            s += j;
5777            if (chr == 0xffffffff && PyErr_Occurred())
5778                /* _decoding_error will have already written into the
5779                   target buffer. */
5780                break;
5781        store:
5782            /* when we get here, chr is a 32-bit unicode character */
5783            if (chr <= 0xffff)
5784                /* UCS-2 character */
5785                WRITE_WSTR(data, i++, chr);
5786            else if (chr <= 0x10ffff) {
5787                /* UCS-4 character. Either store directly, or as
5788                   surrogate pair. */
5789#ifdef Py_UNICODE_WIDE
5790                WRITE_WSTR(data, i++, chr);
5791#else
5792                chr -= 0x10000L;
5793                WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5794                WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
5795#endif
5796            } else {
5797                endinpos = s-starts;
5798                p = PyUnicode_AS_UNICODE(v) + i;
5799                if (unicode_decode_call_errorhandler(
5800                        errors, &errorHandler,
5801                        "unicodeescape", "illegal Unicode character",
5802                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5803                        &v, &i, &p))
5804                    goto onError;
5805                data = PyUnicode_AS_UNICODE(v);
5806            }
5807            break;
5808
5809            /* \N{name} */
5810        case 'N':
5811            message = "malformed \\N character escape";
5812            if (ucnhash_CAPI == NULL) {
5813                /* load the unicode data module */
5814                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5815                                                PyUnicodeData_CAPSULE_NAME, 1);
5816                if (ucnhash_CAPI == NULL)
5817                    goto ucnhashError;
5818            }
5819            if (*s == '{') {
5820                const char *start = s+1;
5821                /* look for the closing brace */
5822                while (*s != '}' && s < end)
5823                    s++;
5824                if (s > start && s < end && *s == '}') {
5825                    /* found a name.  look it up in the unicode database */
5826                    message = "unknown Unicode character name";
5827                    s++;
5828                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5829                                              &chr, 0))
5830                        goto store;
5831                }
5832            }
5833            endinpos = s-starts;
5834            p = PyUnicode_AS_UNICODE(v) + i;
5835            if (unicode_decode_call_errorhandler(
5836                    errors, &errorHandler,
5837                    "unicodeescape", message,
5838                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5839                    &v, &i, &p))
5840                goto onError;
5841            data = PyUnicode_AS_UNICODE(v);
5842            break;
5843
5844        default:
5845            if (s > end) {
5846                assert(kind == PyUnicode_WCHAR_KIND);
5847                message = "\\ at end of string";
5848                s--;
5849                endinpos = s-starts;
5850                p = PyUnicode_AS_UNICODE(v) + i;
5851                if (unicode_decode_call_errorhandler(
5852                        errors, &errorHandler,
5853                        "unicodeescape", message,
5854                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5855                        &v, &i, &p))
5856                    goto onError;
5857                data = PyUnicode_AS_UNICODE(v);
5858            }
5859            else {
5860                WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5861                WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
5862            }
5863            break;
5864        }
5865      nextByte:
5866        ;
5867    }
5868    /* Ensure the length prediction worked in case of ASCII strings */
5869    assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5870
5871    if (kind == PyUnicode_WCHAR_KIND)
5872    {
5873        if (PyUnicode_Resize(&v, i) < 0)
5874            goto onError;
5875    }
5876    Py_XDECREF(errorHandler);
5877    Py_XDECREF(exc);
5878#ifndef DONT_MAKE_RESULT_READY
5879    if (_PyUnicode_READY_REPLACE(&v)) {
5880        Py_DECREF(v);
5881        return NULL;
5882    }
5883#endif
5884    assert(_PyUnicode_CheckConsistency(v, 1));
5885    return v;
5886
5887  ucnhashError:
5888    PyErr_SetString(
5889        PyExc_UnicodeError,
5890        "\\N escapes not supported (can't load unicodedata module)"
5891        );
5892    Py_XDECREF(v);
5893    Py_XDECREF(errorHandler);
5894    Py_XDECREF(exc);
5895    return NULL;
5896
5897  onError:
5898    Py_XDECREF(v);
5899    Py_XDECREF(errorHandler);
5900    Py_XDECREF(exc);
5901    return NULL;
5902}
5903
5904#undef WRITE_ASCII_OR_WSTR
5905#undef WRITE_WSTR
5906
5907/* Return a Unicode-Escape string version of the Unicode object.
5908
5909   If quotes is true, the string is enclosed in u"" or u'' quotes as
5910   appropriate.
5911
5912*/
5913
5914PyObject *
5915PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5916                              Py_ssize_t size)
5917{
5918    PyObject *repr;
5919    char *p;
5920
5921#ifdef Py_UNICODE_WIDE
5922    const Py_ssize_t expandsize = 10;
5923#else
5924    const Py_ssize_t expandsize = 6;
5925#endif
5926
5927    /* XXX(nnorwitz): rather than over-allocating, it would be
5928       better to choose a different scheme.  Perhaps scan the
5929       first N-chars of the string and allocate based on that size.
5930    */
5931    /* Initial allocation is based on the longest-possible unichr
5932       escape.
5933
5934       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5935       unichr, so in this case it's the longest unichr escape. In
5936       narrow (UTF-16) builds this is five chars per source unichr
5937       since there are two unichrs in the surrogate pair, so in narrow
5938       (UTF-16) builds it's not the longest unichr escape.
5939
5940       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5941       so in the narrow (UTF-16) build case it's the longest unichr
5942       escape.
5943    */
5944
5945    if (size == 0)
5946        return PyBytes_FromStringAndSize(NULL, 0);
5947
5948    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5949        return PyErr_NoMemory();
5950
5951    repr = PyBytes_FromStringAndSize(NULL,
5952                                     2
5953                                     + expandsize*size
5954                                     + 1);
5955    if (repr == NULL)
5956        return NULL;
5957
5958    p = PyBytes_AS_STRING(repr);
5959
5960    while (size-- > 0) {
5961        Py_UNICODE ch = *s++;
5962
5963        /* Escape backslashes */
5964        if (ch == '\\') {
5965            *p++ = '\\';
5966            *p++ = (char) ch;
5967            continue;
5968        }
5969
5970#ifdef Py_UNICODE_WIDE
5971        /* Map 21-bit characters to '\U00xxxxxx' */
5972        else if (ch >= 0x10000) {
5973            *p++ = '\\';
5974            *p++ = 'U';
5975            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5976            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5977            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5978            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5979            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5980            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5981            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5982            *p++ = Py_hexdigits[ch & 0x0000000F];
5983            continue;
5984        }
5985#else
5986        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5987        else if (ch >= 0xD800 && ch < 0xDC00) {
5988            Py_UNICODE ch2;
5989            Py_UCS4 ucs;
5990
5991            ch2 = *s++;
5992            size--;
5993            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5994                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5995                *p++ = '\\';
5996                *p++ = 'U';
5997                *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
5998                *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
5999                *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6000                *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6001                *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6002                *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6003                *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6004                *p++ = Py_hexdigits[ucs & 0x0000000F];
6005                continue;
6006            }
6007            /* Fall through: isolated surrogates are copied as-is */
6008            s--;
6009            size++;
6010        }
6011#endif
6012
6013        /* Map 16-bit characters to '\uxxxx' */
6014        if (ch >= 256) {
6015            *p++ = '\\';
6016            *p++ = 'u';
6017            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6018            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6019            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6020            *p++ = Py_hexdigits[ch & 0x000F];
6021        }
6022
6023        /* Map special whitespace to '\t', \n', '\r' */
6024        else if (ch == '\t') {
6025            *p++ = '\\';
6026            *p++ = 't';
6027        }
6028        else if (ch == '\n') {
6029            *p++ = '\\';
6030            *p++ = 'n';
6031        }
6032        else if (ch == '\r') {
6033            *p++ = '\\';
6034            *p++ = 'r';
6035        }
6036
6037        /* Map non-printable US ASCII to '\xhh' */
6038        else if (ch < ' ' || ch >= 0x7F) {
6039            *p++ = '\\';
6040            *p++ = 'x';
6041            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6042            *p++ = Py_hexdigits[ch & 0x000F];
6043        }
6044
6045        /* Copy everything else as-is */
6046        else
6047            *p++ = (char) ch;
6048    }
6049
6050    assert(p - PyBytes_AS_STRING(repr) > 0);
6051    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6052        return NULL;
6053    return repr;
6054}
6055
6056PyObject *
6057PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6058{
6059    PyObject *s;
6060    if (!PyUnicode_Check(unicode)) {
6061        PyErr_BadArgument();
6062        return NULL;
6063    }
6064    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6065                                      PyUnicode_GET_SIZE(unicode));
6066    return s;
6067}
6068
6069/* --- Raw Unicode Escape Codec ------------------------------------------- */
6070
6071PyObject *
6072PyUnicode_DecodeRawUnicodeEscape(const char *s,
6073                                 Py_ssize_t size,
6074                                 const char *errors)
6075{
6076    const char *starts = s;
6077    Py_ssize_t startinpos;
6078    Py_ssize_t endinpos;
6079    Py_ssize_t outpos;
6080    PyObject *v;
6081    Py_UNICODE *p;
6082    const char *end;
6083    const char *bs;
6084    PyObject *errorHandler = NULL;
6085    PyObject *exc = NULL;
6086
6087    /* Escaped strings will always be longer than the resulting
6088       Unicode string, so we start with size here and then reduce the
6089       length after conversion to the true value. (But decoding error
6090       handler might have to resize the string) */
6091    v = (PyObject*)_PyUnicode_New(size);
6092    if (v == NULL)
6093        goto onError;
6094    if (size == 0)
6095        return v;
6096    p = PyUnicode_AS_UNICODE(v);
6097    end = s + size;
6098    while (s < end) {
6099        unsigned char c;
6100        Py_UCS4 x;
6101        int i;
6102        int count;
6103
6104        /* Non-escape characters are interpreted as Unicode ordinals */
6105        if (*s != '\\') {
6106            *p++ = (unsigned char)*s++;
6107            continue;
6108        }
6109        startinpos = s-starts;
6110
6111        /* \u-escapes are only interpreted iff the number of leading
6112           backslashes if odd */
6113        bs = s;
6114        for (;s < end;) {
6115            if (*s != '\\')
6116                break;
6117            *p++ = (unsigned char)*s++;
6118        }
6119        if (((s - bs) & 1) == 0 ||
6120            s >= end ||
6121            (*s != 'u' && *s != 'U')) {
6122            continue;
6123        }
6124        p--;
6125        count = *s=='u' ? 4 : 8;
6126        s++;
6127
6128        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6129        outpos = p-PyUnicode_AS_UNICODE(v);
6130        for (x = 0, i = 0; i < count; ++i, ++s) {
6131            c = (unsigned char)*s;
6132            if (!Py_ISXDIGIT(c)) {
6133                endinpos = s-starts;
6134                if (unicode_decode_call_errorhandler(
6135                        errors, &errorHandler,
6136                        "rawunicodeescape", "truncated \\uXXXX",
6137                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6138                        &v, &outpos, &p))
6139                    goto onError;
6140                goto nextByte;
6141            }
6142            x = (x<<4) & ~0xF;
6143            if (c >= '0' && c <= '9')
6144                x += c - '0';
6145            else if (c >= 'a' && c <= 'f')
6146                x += 10 + c - 'a';
6147            else
6148                x += 10 + c - 'A';
6149        }
6150        if (x <= 0xffff)
6151            /* UCS-2 character */
6152            *p++ = (Py_UNICODE) x;
6153        else if (x <= 0x10ffff) {
6154            /* UCS-4 character. Either store directly, or as
6155               surrogate pair. */
6156#ifdef Py_UNICODE_WIDE
6157            *p++ = (Py_UNICODE) x;
6158#else
6159            x -= 0x10000L;
6160            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6161            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
6162#endif
6163        } else {
6164            endinpos = s-starts;
6165            outpos = p-PyUnicode_AS_UNICODE(v);
6166            if (unicode_decode_call_errorhandler(
6167                    errors, &errorHandler,
6168                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6169                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6170                    &v, &outpos, &p))
6171                goto onError;
6172        }
6173      nextByte:
6174        ;
6175    }
6176    if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6177        goto onError;
6178    Py_XDECREF(errorHandler);
6179    Py_XDECREF(exc);
6180#ifndef DONT_MAKE_RESULT_READY
6181    if (_PyUnicode_READY_REPLACE(&v)) {
6182        Py_DECREF(v);
6183        return NULL;
6184    }
6185#endif
6186    assert(_PyUnicode_CheckConsistency(v, 1));
6187    return v;
6188
6189  onError:
6190    Py_XDECREF(v);
6191    Py_XDECREF(errorHandler);
6192    Py_XDECREF(exc);
6193    return NULL;
6194}
6195
6196PyObject *
6197PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6198                                 Py_ssize_t size)
6199{
6200    PyObject *repr;
6201    char *p;
6202    char *q;
6203
6204#ifdef Py_UNICODE_WIDE
6205    const Py_ssize_t expandsize = 10;
6206#else
6207    const Py_ssize_t expandsize = 6;
6208#endif
6209
6210    if (size > PY_SSIZE_T_MAX / expandsize)
6211        return PyErr_NoMemory();
6212
6213    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
6214    if (repr == NULL)
6215        return NULL;
6216    if (size == 0)
6217        return repr;
6218
6219    p = q = PyBytes_AS_STRING(repr);
6220    while (size-- > 0) {
6221        Py_UNICODE ch = *s++;
6222#ifdef Py_UNICODE_WIDE
6223        /* Map 32-bit characters to '\Uxxxxxxxx' */
6224        if (ch >= 0x10000) {
6225            *p++ = '\\';
6226            *p++ = 'U';
6227            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6228            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6229            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6230            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6231            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6232            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6233            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6234            *p++ = Py_hexdigits[ch & 15];
6235        }
6236        else
6237#else
6238            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6239            if (ch >= 0xD800 && ch < 0xDC00) {
6240                Py_UNICODE ch2;
6241                Py_UCS4 ucs;
6242
6243                ch2 = *s++;
6244                size--;
6245                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6246                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6247                    *p++ = '\\';
6248                    *p++ = 'U';
6249                    *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6250                    *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6251                    *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6252                    *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6253                    *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6254                    *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6255                    *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6256                    *p++ = Py_hexdigits[ucs & 0xf];
6257                    continue;
6258                }
6259                /* Fall through: isolated surrogates are copied as-is */
6260                s--;
6261                size++;
6262            }
6263#endif
6264        /* Map 16-bit characters to '\uxxxx' */
6265        if (ch >= 256) {
6266            *p++ = '\\';
6267            *p++ = 'u';
6268            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6269            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6270            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6271            *p++ = Py_hexdigits[ch & 15];
6272        }
6273        /* Copy everything else as-is */
6274        else
6275            *p++ = (char) ch;
6276    }
6277    size = p - q;
6278
6279    assert(size > 0);
6280    if (_PyBytes_Resize(&repr, size) < 0)
6281        return NULL;
6282    return repr;
6283}
6284
6285PyObject *
6286PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6287{
6288    PyObject *s;
6289    if (!PyUnicode_Check(unicode)) {
6290        PyErr_BadArgument();
6291        return NULL;
6292    }
6293    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6294                                         PyUnicode_GET_SIZE(unicode));
6295
6296    return s;
6297}
6298
6299/* --- Unicode Internal Codec ------------------------------------------- */
6300
6301PyObject *
6302_PyUnicode_DecodeUnicodeInternal(const char *s,
6303                                 Py_ssize_t size,
6304                                 const char *errors)
6305{
6306    const char *starts = s;
6307    Py_ssize_t startinpos;
6308    Py_ssize_t endinpos;
6309    Py_ssize_t outpos;
6310    PyObject *v;
6311    Py_UNICODE *p;
6312    const char *end;
6313    const char *reason;
6314    PyObject *errorHandler = NULL;
6315    PyObject *exc = NULL;
6316
6317#ifdef Py_UNICODE_WIDE
6318    Py_UNICODE unimax = PyUnicode_GetMax();
6319#endif
6320
6321    /* XXX overflow detection missing */
6322    v = (PyObject*)_PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6323    if (v == NULL)
6324        goto onError;
6325    /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6326       as string was created with the old API. */
6327    if (PyUnicode_GET_SIZE(v) == 0)
6328        return v;
6329    p = PyUnicode_AS_UNICODE(v);
6330    end = s + size;
6331
6332    while (s < end) {
6333        memcpy(p, s, sizeof(Py_UNICODE));
6334        /* We have to sanity check the raw data, otherwise doom looms for
6335           some malformed UCS-4 data. */
6336        if (
6337#ifdef Py_UNICODE_WIDE
6338            *p > unimax || *p < 0 ||
6339#endif
6340            end-s < Py_UNICODE_SIZE
6341            )
6342        {
6343            startinpos = s - starts;
6344            if (end-s < Py_UNICODE_SIZE) {
6345                endinpos = end-starts;
6346                reason = "truncated input";
6347            }
6348            else {
6349                endinpos = s - starts + Py_UNICODE_SIZE;
6350                reason = "illegal code point (> 0x10FFFF)";
6351            }
6352            outpos = p - PyUnicode_AS_UNICODE(v);
6353            if (unicode_decode_call_errorhandler(
6354                    errors, &errorHandler,
6355                    "unicode_internal", reason,
6356                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6357                    &v, &outpos, &p)) {
6358                goto onError;
6359            }
6360        }
6361        else {
6362            p++;
6363            s += Py_UNICODE_SIZE;
6364        }
6365    }
6366
6367    if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6368        goto onError;
6369    Py_XDECREF(errorHandler);
6370    Py_XDECREF(exc);
6371#ifndef DONT_MAKE_RESULT_READY
6372    if (_PyUnicode_READY_REPLACE(&v)) {
6373        Py_DECREF(v);
6374        return NULL;
6375    }
6376#endif
6377    assert(_PyUnicode_CheckConsistency(v, 1));
6378    return v;
6379
6380  onError:
6381    Py_XDECREF(v);
6382    Py_XDECREF(errorHandler);
6383    Py_XDECREF(exc);
6384    return NULL;
6385}
6386
6387/* --- Latin-1 Codec ------------------------------------------------------ */
6388
6389PyObject *
6390PyUnicode_DecodeLatin1(const char *s,
6391                       Py_ssize_t size,
6392                       const char *errors)
6393{
6394    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6395    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6396}
6397
6398/* create or adjust a UnicodeEncodeError */
6399static void
6400make_encode_exception(PyObject **exceptionObject,
6401                      const char *encoding,
6402                      PyObject *unicode,
6403                      Py_ssize_t startpos, Py_ssize_t endpos,
6404                      const char *reason)
6405{
6406    if (*exceptionObject == NULL) {
6407        *exceptionObject = PyObject_CallFunction(
6408            PyExc_UnicodeEncodeError, "sOnns",
6409            encoding, unicode, startpos, endpos, reason);
6410    }
6411    else {
6412        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6413            goto onError;
6414        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6415            goto onError;
6416        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6417            goto onError;
6418        return;
6419      onError:
6420        Py_DECREF(*exceptionObject);
6421        *exceptionObject = NULL;
6422    }
6423}
6424
6425/* raises a UnicodeEncodeError */
6426static void
6427raise_encode_exception(PyObject **exceptionObject,
6428                       const char *encoding,
6429                       PyObject *unicode,
6430                       Py_ssize_t startpos, Py_ssize_t endpos,
6431                       const char *reason)
6432{
6433    make_encode_exception(exceptionObject,
6434                          encoding, unicode, startpos, endpos, reason);
6435    if (*exceptionObject != NULL)
6436        PyCodec_StrictErrors(*exceptionObject);
6437}
6438
6439/* error handling callback helper:
6440   build arguments, call the callback and check the arguments,
6441   put the result into newpos and return the replacement string, which
6442   has to be freed by the caller */
6443static PyObject *
6444unicode_encode_call_errorhandler(const char *errors,
6445                                 PyObject **errorHandler,
6446                                 const char *encoding, const char *reason,
6447                                 PyObject *unicode, PyObject **exceptionObject,
6448                                 Py_ssize_t startpos, Py_ssize_t endpos,
6449                                 Py_ssize_t *newpos)
6450{
6451    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6452    Py_ssize_t len;
6453    PyObject *restuple;
6454    PyObject *resunicode;
6455
6456    if (*errorHandler == NULL) {
6457        *errorHandler = PyCodec_LookupError(errors);
6458        if (*errorHandler == NULL)
6459            return NULL;
6460    }
6461
6462    if (PyUnicode_READY(unicode) < 0)
6463        return NULL;
6464    len = PyUnicode_GET_LENGTH(unicode);
6465
6466    make_encode_exception(exceptionObject,
6467                          encoding, unicode, startpos, endpos, reason);
6468    if (*exceptionObject == NULL)
6469        return NULL;
6470
6471    restuple = PyObject_CallFunctionObjArgs(
6472        *errorHandler, *exceptionObject, NULL);
6473    if (restuple == NULL)
6474        return NULL;
6475    if (!PyTuple_Check(restuple)) {
6476        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6477        Py_DECREF(restuple);
6478        return NULL;
6479    }
6480    if (!PyArg_ParseTuple(restuple, argparse,
6481                          &resunicode, newpos)) {
6482        Py_DECREF(restuple);
6483        return NULL;
6484    }
6485    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6486        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6487        Py_DECREF(restuple);
6488        return NULL;
6489    }
6490    if (*newpos<0)
6491        *newpos = len + *newpos;
6492    if (*newpos<0 || *newpos>len) {
6493        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6494        Py_DECREF(restuple);
6495        return NULL;
6496    }
6497    Py_INCREF(resunicode);
6498    Py_DECREF(restuple);
6499    return resunicode;
6500}
6501
6502static PyObject *
6503unicode_encode_ucs1(PyObject *unicode,
6504                    const char *errors,
6505                    unsigned int limit)
6506{
6507    /* input state */
6508    Py_ssize_t pos=0, size;
6509    int kind;
6510    void *data;
6511    /* output object */
6512    PyObject *res;
6513    /* pointer into the output */
6514    char *str;
6515    /* current output position */
6516    Py_ssize_t ressize;
6517    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6518    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6519    PyObject *errorHandler = NULL;
6520    PyObject *exc = NULL;
6521    /* the following variable is used for caching string comparisons
6522     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6523    int known_errorHandler = -1;
6524
6525    if (PyUnicode_READY(unicode) < 0)
6526        return NULL;
6527    size = PyUnicode_GET_LENGTH(unicode);
6528    kind = PyUnicode_KIND(unicode);
6529    data = PyUnicode_DATA(unicode);
6530    /* allocate enough for a simple encoding without
6531       replacements, if we need more, we'll resize */
6532    if (size == 0)
6533        return PyBytes_FromStringAndSize(NULL, 0);
6534    res = PyBytes_FromStringAndSize(NULL, size);
6535    if (res == NULL)
6536        return NULL;
6537    str = PyBytes_AS_STRING(res);
6538    ressize = size;
6539
6540    while (pos < size) {
6541        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6542
6543        /* can we encode this? */
6544        if (c<limit) {
6545            /* no overflow check, because we know that the space is enough */
6546            *str++ = (char)c;
6547            ++pos;
6548        }
6549        else {
6550            Py_ssize_t requiredsize;
6551            PyObject *repunicode;
6552            Py_ssize_t repsize, newpos, respos, i;
6553            /* startpos for collecting unencodable chars */
6554            Py_ssize_t collstart = pos;
6555            Py_ssize_t collend = pos;
6556            /* find all unecodable characters */
6557            while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6558                ++collend;
6559            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6560            if (known_errorHandler==-1) {
6561                if ((errors==NULL) || (!strcmp(errors, "strict")))
6562                    known_errorHandler = 1;
6563                else if (!strcmp(errors, "replace"))
6564                    known_errorHandler = 2;
6565                else if (!strcmp(errors, "ignore"))
6566                    known_errorHandler = 3;
6567                else if (!strcmp(errors, "xmlcharrefreplace"))
6568                    known_errorHandler = 4;
6569                else
6570                    known_errorHandler = 0;
6571            }
6572            switch (known_errorHandler) {
6573            case 1: /* strict */
6574                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6575                goto onError;
6576            case 2: /* replace */
6577                while (collstart++<collend)
6578                    *str++ = '?'; /* fall through */
6579            case 3: /* ignore */
6580                pos = collend;
6581                break;
6582            case 4: /* xmlcharrefreplace */
6583                respos = str - PyBytes_AS_STRING(res);
6584                /* determine replacement size */
6585                for (i = collstart, repsize = 0; i < collend; ++i) {
6586                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6587                    if (ch < 10)
6588                        repsize += 2+1+1;
6589                    else if (ch < 100)
6590                        repsize += 2+2+1;
6591                    else if (ch < 1000)
6592                        repsize += 2+3+1;
6593                    else if (ch < 10000)
6594                        repsize += 2+4+1;
6595#ifndef Py_UNICODE_WIDE
6596                    else
6597                        repsize += 2+5+1;
6598#else
6599                    else if (ch < 100000)
6600                        repsize += 2+5+1;
6601                    else if (ch < 1000000)
6602                        repsize += 2+6+1;
6603                    else
6604                        repsize += 2+7+1;
6605#endif
6606                }
6607                requiredsize = respos+repsize+(size-collend);
6608                if (requiredsize > ressize) {
6609                    if (requiredsize<2*ressize)
6610                        requiredsize = 2*ressize;
6611                    if (_PyBytes_Resize(&res, requiredsize))
6612                        goto onError;
6613                    str = PyBytes_AS_STRING(res) + respos;
6614                    ressize = requiredsize;
6615                }
6616                /* generate replacement */
6617                for (i = collstart; i < collend; ++i) {
6618                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6619                }
6620                pos = collend;
6621                break;
6622            default:
6623                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6624                                                              encoding, reason, unicode, &exc,
6625                                                              collstart, collend, &newpos);
6626                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6627                                           PyUnicode_READY(repunicode) < 0))
6628                    goto onError;
6629                if (PyBytes_Check(repunicode)) {
6630                    /* Directly copy bytes result to output. */
6631                    repsize = PyBytes_Size(repunicode);
6632                    if (repsize > 1) {
6633                        /* Make room for all additional bytes. */
6634                        respos = str - PyBytes_AS_STRING(res);
6635                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6636                            Py_DECREF(repunicode);
6637                            goto onError;
6638                        }
6639                        str = PyBytes_AS_STRING(res) + respos;
6640                        ressize += repsize-1;
6641                    }
6642                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6643                    str += repsize;
6644                    pos = newpos;
6645                    Py_DECREF(repunicode);
6646                    break;
6647                }
6648                /* need more space? (at least enough for what we
6649                   have+the replacement+the rest of the string, so
6650                   we won't have to check space for encodable characters) */
6651                respos = str - PyBytes_AS_STRING(res);
6652                repsize = PyUnicode_GET_LENGTH(repunicode);
6653                requiredsize = respos+repsize+(size-collend);
6654                if (requiredsize > ressize) {
6655                    if (requiredsize<2*ressize)
6656                        requiredsize = 2*ressize;
6657                    if (_PyBytes_Resize(&res, requiredsize)) {
6658                        Py_DECREF(repunicode);
6659                        goto onError;
6660                    }
6661                    str = PyBytes_AS_STRING(res) + respos;
6662                    ressize = requiredsize;
6663                }
6664                /* check if there is anything unencodable in the replacement
6665                   and copy it to the output */
6666                for (i = 0; repsize-->0; ++i, ++str) {
6667                    c = PyUnicode_READ_CHAR(repunicode, i);
6668                    if (c >= limit) {
6669                        raise_encode_exception(&exc, encoding, unicode,
6670                                               pos, pos+1, reason);
6671                        Py_DECREF(repunicode);
6672                        goto onError;
6673                    }
6674                    *str = (char)c;
6675                }
6676                pos = newpos;
6677                Py_DECREF(repunicode);
6678            }
6679        }
6680    }
6681    /* Resize if we allocated to much */
6682    size = str - PyBytes_AS_STRING(res);
6683    if (size < ressize) { /* If this falls res will be NULL */
6684        assert(size >= 0);
6685        if (_PyBytes_Resize(&res, size) < 0)
6686            goto onError;
6687    }
6688
6689    Py_XDECREF(errorHandler);
6690    Py_XDECREF(exc);
6691    return res;
6692
6693  onError:
6694    Py_XDECREF(res);
6695    Py_XDECREF(errorHandler);
6696    Py_XDECREF(exc);
6697    return NULL;
6698}
6699
6700/* Deprecated */
6701PyObject *
6702PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6703                       Py_ssize_t size,
6704                       const char *errors)
6705{
6706    PyObject *result;
6707    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6708    if (unicode == NULL)
6709        return NULL;
6710    result = unicode_encode_ucs1(unicode, errors, 256);
6711    Py_DECREF(unicode);
6712    return result;
6713}
6714
6715PyObject *
6716_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6717{
6718    if (!PyUnicode_Check(unicode)) {
6719        PyErr_BadArgument();
6720        return NULL;
6721    }
6722    if (PyUnicode_READY(unicode) == -1)
6723        return NULL;
6724    /* Fast path: if it is a one-byte string, construct
6725       bytes object directly. */
6726    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6727        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6728                                         PyUnicode_GET_LENGTH(unicode));
6729    /* Non-Latin-1 characters present. Defer to above function to
6730       raise the exception. */
6731    return unicode_encode_ucs1(unicode, errors, 256);
6732}
6733
6734PyObject*
6735PyUnicode_AsLatin1String(PyObject *unicode)
6736{
6737    return _PyUnicode_AsLatin1String(unicode, NULL);
6738}
6739
6740/* --- 7-bit ASCII Codec -------------------------------------------------- */
6741
6742PyObject *
6743PyUnicode_DecodeASCII(const char *s,
6744                      Py_ssize_t size,
6745                      const char *errors)
6746{
6747    const char *starts = s;
6748    PyObject *v;
6749    Py_UNICODE *u;
6750    Py_ssize_t startinpos;
6751    Py_ssize_t endinpos;
6752    Py_ssize_t outpos;
6753    const char *e;
6754    int has_error;
6755    const unsigned char *p = (const unsigned char *)s;
6756    const unsigned char *end = p + size;
6757    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
6758    PyObject *errorHandler = NULL;
6759    PyObject *exc = NULL;
6760
6761    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6762    if (size == 1 && (unsigned char)s[0] < 128)
6763        return get_latin1_char((unsigned char)s[0]);
6764
6765    has_error = 0;
6766    while (p < end && !has_error) {
6767        /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6768           an explanation. */
6769        if (!((size_t) p & LONG_PTR_MASK)) {
6770            /* Help register allocation */
6771            register const unsigned char *_p = p;
6772            while (_p < aligned_end) {
6773                unsigned long value = *(unsigned long *) _p;
6774                if (value & ASCII_CHAR_MASK) {
6775                    has_error = 1;
6776                    break;
6777                }
6778                _p += SIZEOF_LONG;
6779            }
6780            if (_p == end)
6781                break;
6782            if (has_error)
6783                break;
6784            p = _p;
6785        }
6786        if (*p & 0x80) {
6787            has_error = 1;
6788            break;
6789        }
6790        else {
6791            ++p;
6792        }
6793    }
6794    if (!has_error)
6795        return unicode_fromascii((const unsigned char *)s, size);
6796
6797    v = (PyObject*)_PyUnicode_New(size);
6798    if (v == NULL)
6799        goto onError;
6800    if (size == 0)
6801        return v;
6802    u = PyUnicode_AS_UNICODE(v);
6803    e = s + size;
6804    while (s < e) {
6805        register unsigned char c = (unsigned char)*s;
6806        if (c < 128) {
6807            *u++ = c;
6808            ++s;
6809        }
6810        else {
6811            startinpos = s-starts;
6812            endinpos = startinpos + 1;
6813            outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6814            if (unicode_decode_call_errorhandler(
6815                    errors, &errorHandler,
6816                    "ascii", "ordinal not in range(128)",
6817                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6818                    &v, &outpos, &u))
6819                goto onError;
6820        }
6821    }
6822    if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6823        if (PyUnicode_Resize(&v, u - PyUnicode_AS_UNICODE(v)) < 0)
6824            goto onError;
6825    Py_XDECREF(errorHandler);
6826    Py_XDECREF(exc);
6827#ifndef DONT_MAKE_RESULT_READY
6828    if (_PyUnicode_READY_REPLACE(&v)) {
6829        Py_DECREF(v);
6830        return NULL;
6831    }
6832#endif
6833    assert(_PyUnicode_CheckConsistency(v, 1));
6834    return v;
6835
6836  onError:
6837    Py_XDECREF(v);
6838    Py_XDECREF(errorHandler);
6839    Py_XDECREF(exc);
6840    return NULL;
6841}
6842
6843/* Deprecated */
6844PyObject *
6845PyUnicode_EncodeASCII(const Py_UNICODE *p,
6846                      Py_ssize_t size,
6847                      const char *errors)
6848{
6849    PyObject *result;
6850    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6851    if (unicode == NULL)
6852        return NULL;
6853    result = unicode_encode_ucs1(unicode, errors, 128);
6854    Py_DECREF(unicode);
6855    return result;
6856}
6857
6858PyObject *
6859_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6860{
6861    if (!PyUnicode_Check(unicode)) {
6862        PyErr_BadArgument();
6863        return NULL;
6864    }
6865    if (PyUnicode_READY(unicode) == -1)
6866        return NULL;
6867    /* Fast path: if it is an ASCII-only string, construct bytes object
6868       directly. Else defer to above function to raise the exception. */
6869    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6870        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6871                                         PyUnicode_GET_LENGTH(unicode));
6872    return unicode_encode_ucs1(unicode, errors, 128);
6873}
6874
6875PyObject *
6876PyUnicode_AsASCIIString(PyObject *unicode)
6877{
6878    return _PyUnicode_AsASCIIString(unicode, NULL);
6879}
6880
6881#ifdef HAVE_MBCS
6882
6883/* --- MBCS codecs for Windows -------------------------------------------- */
6884
6885#if SIZEOF_INT < SIZEOF_SIZE_T
6886#define NEED_RETRY
6887#endif
6888
6889#ifndef WC_ERR_INVALID_CHARS
6890#  define WC_ERR_INVALID_CHARS 0x0080
6891#endif
6892
6893static char*
6894code_page_name(UINT code_page, PyObject **obj)
6895{
6896    *obj = NULL;
6897    if (code_page == CP_ACP)
6898        return "mbcs";
6899    if (code_page == CP_UTF7)
6900        return "CP_UTF7";
6901    if (code_page == CP_UTF8)
6902        return "CP_UTF8";
6903
6904    *obj = PyBytes_FromFormat("cp%u", code_page);
6905    if (*obj == NULL)
6906        return NULL;
6907    return PyBytes_AS_STRING(*obj);
6908}
6909
6910static int
6911is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6912{
6913    const char *curr = s + offset;
6914    const char *prev;
6915
6916    if (!IsDBCSLeadByteEx(code_page, *curr))
6917        return 0;
6918
6919    prev = CharPrevExA(code_page, s, curr, 0);
6920    if (prev == curr)
6921        return 1;
6922    /* FIXME: This code is limited to "true" double-byte encodings,
6923       as it assumes an incomplete character consists of a single
6924       byte. */
6925    if (curr - prev == 2)
6926        return 1;
6927    if (!IsDBCSLeadByteEx(code_page, *prev))
6928        return 1;
6929    return 0;
6930}
6931
6932static DWORD
6933decode_code_page_flags(UINT code_page)
6934{
6935    if (code_page == CP_UTF7) {
6936        /* The CP_UTF7 decoder only supports flags=0 */
6937        return 0;
6938    }
6939    else
6940        return MB_ERR_INVALID_CHARS;
6941}
6942
6943/*
6944 * Decode a byte string from a Windows code page into unicode object in strict
6945 * mode.
6946 *
6947 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6948 * WindowsError and returns -1 on other error.
6949 */
6950static int
6951decode_code_page_strict(UINT code_page,
6952                        PyObject **v,
6953                        const char *in,
6954                        int insize)
6955{
6956    const DWORD flags = decode_code_page_flags(code_page);
6957    Py_UNICODE *out;
6958    DWORD outsize;
6959
6960    /* First get the size of the result */
6961    assert(insize > 0);
6962    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6963    if (outsize <= 0)
6964        goto error;
6965
6966    if (*v == NULL) {
6967        /* Create unicode object */
6968        *v = (PyObject*)_PyUnicode_New(outsize);
6969        if (*v == NULL)
6970            return -1;
6971        out = PyUnicode_AS_UNICODE(*v);
6972    }
6973    else {
6974        /* Extend unicode object */
6975        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6976        if (PyUnicode_Resize(v, n + outsize) < 0)
6977            return -1;
6978        out = PyUnicode_AS_UNICODE(*v) + n;
6979    }
6980
6981    /* Do the conversion */
6982    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6983    if (outsize <= 0)
6984        goto error;
6985    return insize;
6986
6987error:
6988    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6989        return -2;
6990    PyErr_SetFromWindowsErr(0);
6991    return -1;
6992}
6993
6994/*
6995 * Decode a byte string from a code page into unicode object with an error
6996 * handler.
6997 *
6998 * Returns consumed size if succeed, or raise a WindowsError or
6999 * UnicodeDecodeError exception and returns -1 on error.
7000 */
7001static int
7002decode_code_page_errors(UINT code_page,
7003                        PyObject **v,
7004                        const char *in, const int size,
7005                        const char *errors)
7006{
7007    const char *startin = in;
7008    const char *endin = in + size;
7009    const DWORD flags = decode_code_page_flags(code_page);
7010    /* Ideally, we should get reason from FormatMessage. This is the Windows
7011       2000 English version of the message. */
7012    const char *reason = "No mapping for the Unicode character exists "
7013                         "in the target code page.";
7014    /* each step cannot decode more than 1 character, but a character can be
7015       represented as a surrogate pair */
7016    wchar_t buffer[2], *startout, *out;
7017    int insize, outsize;
7018    PyObject *errorHandler = NULL;
7019    PyObject *exc = NULL;
7020    PyObject *encoding_obj = NULL;
7021    char *encoding;
7022    DWORD err;
7023    int ret = -1;
7024
7025    assert(size > 0);
7026
7027    encoding = code_page_name(code_page, &encoding_obj);
7028    if (encoding == NULL)
7029        return -1;
7030
7031    if (errors == NULL || strcmp(errors, "strict") == 0) {
7032        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7033           UnicodeDecodeError. */
7034        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7035        if (exc != NULL) {
7036            PyCodec_StrictErrors(exc);
7037            Py_CLEAR(exc);
7038        }
7039        goto error;
7040    }
7041
7042    if (*v == NULL) {
7043        /* Create unicode object */
7044        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7045            PyErr_NoMemory();
7046            goto error;
7047        }
7048        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7049        if (*v == NULL)
7050            goto error;
7051        startout = PyUnicode_AS_UNICODE(*v);
7052    }
7053    else {
7054        /* Extend unicode object */
7055        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7056        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7057            PyErr_NoMemory();
7058            goto error;
7059        }
7060        if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7061            goto error;
7062        startout = PyUnicode_AS_UNICODE(*v) + n;
7063    }
7064
7065    /* Decode the byte string character per character */
7066    out = startout;
7067    while (in < endin)
7068    {
7069        /* Decode a character */
7070        insize = 1;
7071        do
7072        {
7073            outsize = MultiByteToWideChar(code_page, flags,
7074                                          in, insize,
7075                                          buffer, Py_ARRAY_LENGTH(buffer));
7076            if (outsize > 0)
7077                break;
7078            err = GetLastError();
7079            if (err != ERROR_NO_UNICODE_TRANSLATION
7080                && err != ERROR_INSUFFICIENT_BUFFER)
7081            {
7082                PyErr_SetFromWindowsErr(0);
7083                goto error;
7084            }
7085            insize++;
7086        }
7087        /* 4=maximum length of a UTF-8 sequence */
7088        while (insize <= 4 && (in + insize) <= endin);
7089
7090        if (outsize <= 0) {
7091            Py_ssize_t startinpos, endinpos, outpos;
7092
7093            startinpos = in - startin;
7094            endinpos = startinpos + 1;
7095            outpos = out - PyUnicode_AS_UNICODE(*v);
7096            if (unicode_decode_call_errorhandler(
7097                    errors, &errorHandler,
7098                    encoding, reason,
7099                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7100                    v, &outpos, &out))
7101            {
7102                goto error;
7103            }
7104        }
7105        else {
7106            in += insize;
7107            memcpy(out, buffer, outsize * sizeof(wchar_t));
7108            out += outsize;
7109        }
7110    }
7111
7112    /* write a NUL character at the end */
7113    *out = 0;
7114
7115    /* Extend unicode object */
7116    outsize = out - startout;
7117    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7118    if (PyUnicode_Resize(v, outsize) < 0)
7119        goto error;
7120    ret = size;
7121
7122error:
7123    Py_XDECREF(encoding_obj);
7124    Py_XDECREF(errorHandler);
7125    Py_XDECREF(exc);
7126    return ret;
7127}
7128
7129static PyObject *
7130decode_code_page_stateful(int code_page,
7131                          const char *s, Py_ssize_t size,
7132                          const char *errors, Py_ssize_t *consumed)
7133{
7134    PyObject *v = NULL;
7135    int chunk_size, final, converted, done;
7136
7137    if (code_page < 0) {
7138        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7139        return NULL;
7140    }
7141
7142    if (consumed)
7143        *consumed = 0;
7144
7145    do
7146    {
7147#ifdef NEED_RETRY
7148        if (size > INT_MAX) {
7149            chunk_size = INT_MAX;
7150            final = 0;
7151            done = 0;
7152        }
7153        else
7154#endif
7155        {
7156            chunk_size = (int)size;
7157            final = (consumed == NULL);
7158            done = 1;
7159        }
7160
7161        /* Skip trailing lead-byte unless 'final' is set */
7162        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7163            --chunk_size;
7164
7165        if (chunk_size == 0 && done) {
7166            if (v != NULL)
7167                break;
7168            Py_INCREF(unicode_empty);
7169            return unicode_empty;
7170        }
7171
7172
7173        converted = decode_code_page_strict(code_page, &v,
7174                                            s, chunk_size);
7175        if (converted == -2)
7176            converted = decode_code_page_errors(code_page, &v,
7177                                                s, chunk_size,
7178                                                errors);
7179        assert(converted != 0);
7180
7181        if (converted < 0) {
7182            Py_XDECREF(v);
7183            return NULL;
7184        }
7185
7186        if (consumed)
7187            *consumed += converted;
7188
7189        s += converted;
7190        size -= converted;
7191    } while (!done);
7192
7193#ifndef DONT_MAKE_RESULT_READY
7194    if (_PyUnicode_READY_REPLACE(&v)) {
7195        Py_DECREF(v);
7196        return NULL;
7197    }
7198#endif
7199    assert(_PyUnicode_CheckConsistency(v, 1));
7200    return v;
7201}
7202
7203PyObject *
7204PyUnicode_DecodeCodePageStateful(int code_page,
7205                                 const char *s,
7206                                 Py_ssize_t size,
7207                                 const char *errors,
7208                                 Py_ssize_t *consumed)
7209{
7210    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7211}
7212
/* Same as PyUnicode_DecodeCodePageStateful(), but always decodes with the
   CP_ACP code page.  All other arguments are forwarded unchanged to
   decode_code_page_stateful(). */
PyObject *
PyUnicode_DecodeMBCSStateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
}
7221
/* Non-stateful MBCS decoding: calls PyUnicode_DecodeMBCSStateful() with a
   NULL `consumed` pointer, i.e. the caller does not ask how many bytes
   were consumed. */
PyObject *
PyUnicode_DecodeMBCS(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}
7229
7230static DWORD
7231encode_code_page_flags(UINT code_page, const char *errors)
7232{
7233    if (code_page == CP_UTF8) {
7234        if (winver.dwMajorVersion >= 6)
7235            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7236               and later */
7237            return WC_ERR_INVALID_CHARS;
7238        else
7239            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7240            return 0;
7241    }
7242    else if (code_page == CP_UTF7) {
7243        /* CP_UTF7 only supports flags=0 */
7244        return 0;
7245    }
7246    else {
7247        if (errors != NULL && strcmp(errors, "replace") == 0)
7248            return 0;
7249        else
7250            return WC_NO_BEST_FIT_CHARS;
7251    }
7252}
7253
7254/*
7255 * Encode a Unicode string to a Windows code page into a byte string in strict
7256 * mode.
7257 *
7258 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7259 * a WindowsError and returns -1 on other error.
7260 */
7261static int
7262encode_code_page_strict(UINT code_page, PyObject **outbytes,
7263                        PyObject *unicode, Py_ssize_t offset, int len,
7264                        const char* errors)
7265{
7266    BOOL usedDefaultChar = FALSE;
7267    BOOL *pusedDefaultChar = &usedDefaultChar;
7268    int outsize;
7269    PyObject *exc = NULL;
7270	Py_UNICODE *p;
7271	Py_ssize_t size;
7272    const DWORD flags = encode_code_page_flags(code_page, NULL);
7273    char *out;
7274	/* Create a substring so that we can get the UTF-16 representation
7275	   of just the slice under consideration. */
7276	PyObject *substring;
7277
7278    assert(len > 0);
7279
7280    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7281        pusedDefaultChar = &usedDefaultChar;
7282    else
7283        pusedDefaultChar = NULL;
7284
7285	substring = PyUnicode_Substring(unicode, offset, offset+len);
7286	if (substring == NULL)
7287		return -1;
7288	p = PyUnicode_AsUnicodeAndSize(substring, &size);
7289	if (p == NULL) {
7290		Py_DECREF(substring);
7291		return -1;
7292	}
7293
7294    /* First get the size of the result */
7295    outsize = WideCharToMultiByte(code_page, flags,
7296                                  p, size,
7297                                  NULL, 0,
7298                                  NULL, pusedDefaultChar);
7299    if (outsize <= 0)
7300        goto error;
7301    /* If we used a default char, then we failed! */
7302	if (pusedDefaultChar && *pusedDefaultChar) {
7303		Py_DECREF(substring);
7304        return -2;
7305	}
7306
7307    if (*outbytes == NULL) {
7308        /* Create string object */
7309        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7310		if (*outbytes == NULL) {
7311			Py_DECREF(substring);
7312            return -1;
7313		}
7314        out = PyBytes_AS_STRING(*outbytes);
7315    }
7316    else {
7317        /* Extend string object */
7318        const Py_ssize_t n = PyBytes_Size(*outbytes);
7319        if (outsize > PY_SSIZE_T_MAX - n) {
7320            PyErr_NoMemory();
7321			Py_DECREF(substring);
7322            return -1;
7323        }
7324		if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7325			Py_DECREF(substring);
7326            return -1;
7327		}
7328        out = PyBytes_AS_STRING(*outbytes) + n;
7329    }
7330
7331    /* Do the conversion */
7332    outsize = WideCharToMultiByte(code_page, flags,
7333                                  p, size,
7334                                  out, outsize,
7335                                  NULL, pusedDefaultChar);
7336	Py_CLEAR(substring);
7337    if (outsize <= 0)
7338        goto error;
7339    if (pusedDefaultChar && *pusedDefaultChar)
7340        return -2;
7341    return 0;
7342
7343error:
7344	Py_XDECREF(substring);
7345    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7346        return -2;
7347    PyErr_SetFromWindowsErr(0);
7348    return -1;
7349}
7350
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The `insize` code points of `unicode` starting at `unicode_offset` are
 * encoded one at a time and appended to *outbytes (created when NULL).
 * Characters the code page cannot represent are handed to the `errors`
 * handler, whose replacement (bytes, or an ASCII-only str) is copied to the
 * output.
 *
 * Returns 0 on success, or -1 with an exception set on error.  With
 * errors == NULL or "strict" a UnicodeEncodeError is raised and -1 is
 * returned without encoding anything.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
	Py_ssize_t pos = unicode_offset;
	Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* lpUsedDefaultChar is only passed for code pages other than
       CP_UTF8 and CP_UTF7 */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) output bytes per input character, checking
       that the multiplication cannot overflow */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
		Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
		wchar_t chars[2];
		int charsize;
		/* Build the UTF-16 form of ch: one unit for the BMP, otherwise
		   a surrogate pair */
		if (ch < 0x10000) {
			chars[0] = (wchar_t)ch;
			charsize = 1;
		}
		else {
			ch -= 0x10000;
			chars[0] = 0xd800 + (ch >> 10);
			chars[1] = 0xdc00 + (ch & 0x3ff);
			charsize = 2;
		}

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Success only if no default character was substituted;
               otherwise fall through to the error handler below */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* Unencodable character: ask the error handler for a replacement
           and for the position at which to resume */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied verbatim.  The buffer is grown (or
               shrunk) by the difference between the replacement length and
               one byte.
               NOTE(review): this appears to assume the handler resumes right
               after the failing character -- confirm behaviour when newpos
               moves backwards. */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                /* the resize may have moved the object: recompute out */
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* str replacement: every character must be ASCII (<= 127) and
               is written as a single byte */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) < 0) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* NOTE(review): pos was already set to newpos above, so
                       the reported range is the resume position, not the
                       character that failed -- confirm this is intended. */
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the over-allocated buffer to the bytes actually written */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Reached on success (ret == 0) and on failure (ret == -1) alike */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7539
7540static PyObject *
7541encode_code_page(int code_page,
7542                 PyObject *unicode,
7543                 const char *errors)
7544{
7545    Py_ssize_t len;
7546    PyObject *outbytes = NULL;
7547    Py_ssize_t offset;
7548    int chunk_len, ret, done;
7549
7550	if (PyUnicode_READY(unicode) < 0)
7551		return NULL;
7552	len = PyUnicode_GET_LENGTH(unicode);
7553
7554    if (code_page < 0) {
7555        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7556        return NULL;
7557    }
7558
7559    if (len == 0)
7560        return PyBytes_FromStringAndSize(NULL, 0);
7561
7562    offset = 0;
7563    do
7564    {
7565#ifdef NEED_RETRY
7566		/* UTF-16 encoding may double the size, so use only INT_MAX/2
7567           chunks. */
7568        if (len > INT_MAX/2) {
7569            chunk_len = INT_MAX/2;
7570            done = 0;
7571        }
7572        else
7573#endif
7574        {
7575            chunk_len = (int)len;
7576            done = 1;
7577        }
7578
7579        ret = encode_code_page_strict(code_page, &outbytes,
7580                                      unicode, offset, chunk_len,
7581                                      errors);
7582        if (ret == -2)
7583            ret = encode_code_page_errors(code_page, &outbytes,
7584                                          unicode, offset,
7585                                          chunk_len, errors);
7586        if (ret < 0) {
7587            Py_XDECREF(outbytes);
7588            return NULL;
7589        }
7590
7591        offset += chunk_len;
7592        len -= chunk_len;
7593    } while (!done);
7594
7595    return outbytes;
7596}
7597
7598PyObject *
7599PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7600                     Py_ssize_t size,
7601                     const char *errors)
7602{
7603    PyObject *unicode, *res;
7604    unicode = PyUnicode_FromUnicode(p, size);
7605    if (unicode == NULL)
7606        return NULL;
7607    res = encode_code_page(CP_ACP, unicode, errors);
7608    Py_DECREF(unicode);
7609    return res;
7610}
7611
/* Public entry point for code page encoding: forwards all three arguments
   unchanged to encode_code_page() and returns its result. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    return encode_code_page(code_page, unicode, errors);
}
7619
7620PyObject *
7621PyUnicode_AsMBCSString(PyObject *unicode)
7622{
7623    if (!PyUnicode_Check(unicode)) {
7624        PyErr_BadArgument();
7625        return NULL;
7626    }
7627    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7628}
7629
7630#undef NEED_RETRY
7631
7632#endif /* HAVE_MBCS */
7633
7634/* --- Character Mapping Codec -------------------------------------------- */
7635
7636PyObject *
7637PyUnicode_DecodeCharmap(const char *s,
7638                        Py_ssize_t size,
7639                        PyObject *mapping,
7640                        const char *errors)
7641{
7642    const char *starts = s;
7643    Py_ssize_t startinpos;
7644    Py_ssize_t endinpos;
7645    Py_ssize_t outpos;
7646    const char *e;
7647    PyObject *v;
7648    Py_UNICODE *p;
7649    Py_ssize_t extrachars = 0;
7650    PyObject *errorHandler = NULL;
7651    PyObject *exc = NULL;
7652    Py_UNICODE *mapstring = NULL;
7653    Py_ssize_t maplen = 0;
7654
7655    /* Default to Latin-1 */
7656    if (mapping == NULL)
7657        return PyUnicode_DecodeLatin1(s, size, errors);
7658
7659    v = (PyObject*)_PyUnicode_New(size);
7660    if (v == NULL)
7661        goto onError;
7662    if (size == 0)
7663        return v;
7664    p = PyUnicode_AS_UNICODE(v);
7665    e = s + size;
7666    if (PyUnicode_CheckExact(mapping)) {
7667        mapstring = PyUnicode_AS_UNICODE(mapping);
7668        maplen = PyUnicode_GET_SIZE(mapping);
7669        while (s < e) {
7670            unsigned char ch = *s;
7671            Py_UNICODE x = 0xfffe; /* illegal value */
7672
7673            if (ch < maplen)
7674                x = mapstring[ch];
7675
7676            if (x == 0xfffe) {
7677                /* undefined mapping */
7678                outpos = p-PyUnicode_AS_UNICODE(v);
7679                startinpos = s-starts;
7680                endinpos = startinpos+1;
7681                if (unicode_decode_call_errorhandler(
7682                        errors, &errorHandler,
7683                        "charmap", "character maps to <undefined>",
7684                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7685                        &v, &outpos, &p)) {
7686                    goto onError;
7687                }
7688                continue;
7689            }
7690            *p++ = x;
7691            ++s;
7692        }
7693    }
7694    else {
7695        while (s < e) {
7696            unsigned char ch = *s;
7697            PyObject *w, *x;
7698
7699            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7700            w = PyLong_FromLong((long)ch);
7701            if (w == NULL)
7702                goto onError;
7703            x = PyObject_GetItem(mapping, w);
7704            Py_DECREF(w);
7705            if (x == NULL) {
7706                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7707                    /* No mapping found means: mapping is undefined. */
7708                    PyErr_Clear();
7709                    x = Py_None;
7710                    Py_INCREF(x);
7711                } else
7712                    goto onError;
7713            }
7714
7715            /* Apply mapping */
7716            if (PyLong_Check(x)) {
7717                long value = PyLong_AS_LONG(x);
7718                if (value < 0 || value > 65535) {
7719                    PyErr_SetString(PyExc_TypeError,
7720                                    "character mapping must be in range(65536)");
7721                    Py_DECREF(x);
7722                    goto onError;
7723                }
7724                *p++ = (Py_UNICODE)value;
7725            }
7726            else if (x == Py_None) {
7727                /* undefined mapping */
7728                outpos = p-PyUnicode_AS_UNICODE(v);
7729                startinpos = s-starts;
7730                endinpos = startinpos+1;
7731                if (unicode_decode_call_errorhandler(
7732                        errors, &errorHandler,
7733                        "charmap", "character maps to <undefined>",
7734                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7735                        &v, &outpos, &p)) {
7736                    Py_DECREF(x);
7737                    goto onError;
7738                }
7739                Py_DECREF(x);
7740                continue;
7741            }
7742            else if (PyUnicode_Check(x)) {
7743                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
7744
7745                if (targetsize == 1)
7746                    /* 1-1 mapping */
7747                    *p++ = *PyUnicode_AS_UNICODE(x);
7748
7749                else if (targetsize > 1) {
7750                    /* 1-n mapping */
7751                    if (targetsize > extrachars) {
7752                        /* resize first */
7753                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7754                        Py_ssize_t needed = (targetsize - extrachars) + \
7755                            (targetsize << 2);
7756                        extrachars += needed;
7757                        /* XXX overflow detection missing */
7758                        if (PyUnicode_Resize(&v,
7759                                             PyUnicode_GET_SIZE(v) + needed) < 0) {
7760                            Py_DECREF(x);
7761                            goto onError;
7762                        }
7763                        p = PyUnicode_AS_UNICODE(v) + oldpos;
7764                    }
7765                    Py_UNICODE_COPY(p,
7766                                    PyUnicode_AS_UNICODE(x),
7767                                    targetsize);
7768                    p += targetsize;
7769                    extrachars -= targetsize;
7770                }
7771                /* 1-0 mapping: skip the character */
7772            }
7773            else {
7774                /* wrong return value */
7775                PyErr_SetString(PyExc_TypeError,
7776                                "character mapping must return integer, None or str");
7777                Py_DECREF(x);
7778                goto onError;
7779            }
7780            Py_DECREF(x);
7781            ++s;
7782        }
7783    }
7784    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
7785        if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
7786            goto onError;
7787    Py_XDECREF(errorHandler);
7788    Py_XDECREF(exc);
7789#ifndef DONT_MAKE_RESULT_READY
7790    if (_PyUnicode_READY_REPLACE(&v)) {
7791        Py_DECREF(v);
7792        return NULL;
7793    }
7794#endif
7795    assert(_PyUnicode_CheckConsistency(v, 1));
7796    return v;
7797
7798  onError:
7799    Py_XDECREF(errorHandler);
7800    Py_XDECREF(exc);
7801    Py_XDECREF(v);
7802    return NULL;
7803}
7804
/* Charmap encoding: the lookup table */

/* Three-level trie mapping a code point c <= 0xFFFF to a byte value.
   It is built by PyUnicode_BuildEncodingMap() and read by
   encoding_map_lookup().  The object is allocated with extra room after
   the struct so that level23 can hold both variable-sized tables. */
struct encoding_map {
    PyObject_HEAD
    /* indexed by c >> 11; value is a level-2 block number, 0xFF = none */
    unsigned char level1[32];
    /* number of 16-byte level-2 blocks and of 128-byte level-3 blocks */
    int count2, count3;
    /* 16*count2 bytes of level-2 blocks (indexed by (c >> 7) & 0xF, 0xFF =
       none) followed by 128*count3 bytes of level-3 blocks (indexed by
       c & 0x7F, value is the encoded byte, 0 = unmapped) */
    unsigned char level23[1];
};
7813
7814static PyObject*
7815encoding_map_size(PyObject *obj, PyObject* args)
7816{
7817    struct encoding_map *map = (struct encoding_map*)obj;
7818    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7819                           128*map->count3);
7820}
7821
/* Method table of the EncodingMap type: a single no-argument method,
   size(), terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
7827
/* tp_dealloc of the EncodingMap type: the object owns no references, so
   releasing its memory block is all that is needed. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7833
/* Type object for struct encoding_map.  Only tp_dealloc and tp_methods are
   provided; tp_new is 0 and instances are created directly by
   PyUnicode_BuildEncodingMap() with PyObject_MALLOC() + PyObject_Init(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7877
7878PyObject*
7879PyUnicode_BuildEncodingMap(PyObject* string)
7880{
7881    PyObject *result;
7882    struct encoding_map *mresult;
7883    int i;
7884    int need_dict = 0;
7885    unsigned char level1[32];
7886    unsigned char level2[512];
7887    unsigned char *mlevel1, *mlevel2, *mlevel3;
7888    int count2 = 0, count3 = 0;
7889    int kind;
7890    void *data;
7891    Py_UCS4 ch;
7892
7893    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7894        PyErr_BadArgument();
7895        return NULL;
7896    }
7897    kind = PyUnicode_KIND(string);
7898    data = PyUnicode_DATA(string);
7899    memset(level1, 0xFF, sizeof level1);
7900    memset(level2, 0xFF, sizeof level2);
7901
7902    /* If there isn't a one-to-one mapping of NULL to \0,
7903       or if there are non-BMP characters, we need to use
7904       a mapping dictionary. */
7905    if (PyUnicode_READ(kind, data, 0) != 0)
7906        need_dict = 1;
7907    for (i = 1; i < 256; i++) {
7908        int l1, l2;
7909        ch = PyUnicode_READ(kind, data, i);
7910        if (ch == 0 || ch > 0xFFFF) {
7911            need_dict = 1;
7912            break;
7913        }
7914        if (ch == 0xFFFE)
7915            /* unmapped character */
7916            continue;
7917        l1 = ch >> 11;
7918        l2 = ch >> 7;
7919        if (level1[l1] == 0xFF)
7920            level1[l1] = count2++;
7921        if (level2[l2] == 0xFF)
7922            level2[l2] = count3++;
7923    }
7924
7925    if (count2 >= 0xFF || count3 >= 0xFF)
7926        need_dict = 1;
7927
7928    if (need_dict) {
7929        PyObject *result = PyDict_New();
7930        PyObject *key, *value;
7931        if (!result)
7932            return NULL;
7933        for (i = 0; i < 256; i++) {
7934            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7935            value = PyLong_FromLong(i);
7936            if (!key || !value)
7937                goto failed1;
7938            if (PyDict_SetItem(result, key, value) == -1)
7939                goto failed1;
7940            Py_DECREF(key);
7941            Py_DECREF(value);
7942        }
7943        return result;
7944      failed1:
7945        Py_XDECREF(key);
7946        Py_XDECREF(value);
7947        Py_DECREF(result);
7948        return NULL;
7949    }
7950
7951    /* Create a three-level trie */
7952    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7953                             16*count2 + 128*count3 - 1);
7954    if (!result)
7955        return PyErr_NoMemory();
7956    PyObject_Init(result, &EncodingMapType);
7957    mresult = (struct encoding_map*)result;
7958    mresult->count2 = count2;
7959    mresult->count3 = count3;
7960    mlevel1 = mresult->level1;
7961    mlevel2 = mresult->level23;
7962    mlevel3 = mresult->level23 + 16*count2;
7963    memcpy(mlevel1, level1, 32);
7964    memset(mlevel2, 0xFF, 16*count2);
7965    memset(mlevel3, 0, 128*count3);
7966    count3 = 0;
7967    for (i = 1; i < 256; i++) {
7968        int o1, o2, o3, i2, i3;
7969        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7970            /* unmapped character */
7971            continue;
7972        o1 = PyUnicode_READ(kind, data, i)>>11;
7973        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7974        i2 = 16*mlevel1[o1] + o2;
7975        if (mlevel2[i2] == 0xFF)
7976            mlevel2[i2] = count3++;
7977        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7978        i3 = 128*mlevel2[i2] + o3;
7979        mlevel3[i3] = i;
7980    }
7981    return result;
7982}
7983
7984static int
7985encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7986{
7987    struct encoding_map *map = (struct encoding_map*)mapping;
7988    int l1 = c>>11;
7989    int l2 = (c>>7) & 0xF;
7990    int l3 = c & 0x7F;
7991    int i;
7992
7993#ifdef Py_UNICODE_WIDE
7994    if (c > 0xFFFF) {
7995        return -1;
7996    }
7997#endif
7998    if (c == 0)
7999        return 0;
8000    /* level 1*/
8001    i = map->level1[l1];
8002    if (i == 0xFF) {
8003        return -1;
8004    }
8005    /* level 2*/
8006    i = map->level23[16*i+l2];
8007    if (i == 0xFF) {
8008        return -1;
8009    }
8010    /* level 3 */
8011    i = map->level23[16*map->count2 + 128*i + l3];
8012    if (i == 0) {
8013        return -1;
8014    }
8015    return i;
8016}
8017
8018/* Lookup the character ch in the mapping. If the character
8019   can't be found, Py_None is returned (or NULL, if another
8020   error occurred). */
8021static PyObject *
8022charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
8023{
8024    PyObject *w = PyLong_FromLong((long)c);
8025    PyObject *x;
8026
8027    if (w == NULL)
8028        return NULL;
8029    x = PyObject_GetItem(mapping, w);
8030    Py_DECREF(w);
8031    if (x == NULL) {
8032        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8033            /* No mapping found means: mapping is undefined. */
8034            PyErr_Clear();
8035            x = Py_None;
8036            Py_INCREF(x);
8037            return x;
8038        } else
8039            return NULL;
8040    }
8041    else if (x == Py_None)
8042        return x;
8043    else if (PyLong_Check(x)) {
8044        long value = PyLong_AS_LONG(x);
8045        if (value < 0 || value > 255) {
8046            PyErr_SetString(PyExc_TypeError,
8047                            "character mapping must be in range(256)");
8048            Py_DECREF(x);
8049            return NULL;
8050        }
8051        return x;
8052    }
8053    else if (PyBytes_Check(x))
8054        return x;
8055    else {
8056        /* wrong return value */
8057        PyErr_Format(PyExc_TypeError,
8058                     "character mapping must return integer, bytes or None, not %.400s",
8059                     x->ob_type->tp_name);
8060        Py_DECREF(x);
8061        return NULL;
8062    }
8063}
8064
8065static int
8066charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8067{
8068    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8069    /* exponentially overallocate to minimize reallocations */
8070    if (requiredsize < 2*outsize)
8071        requiredsize = 2*outsize;
8072    if (_PyBytes_Resize(outobj, requiredsize))
8073        return -1;
8074    return 0;
8075}
8076
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the character was mapped and written to the output
     enc_FAILED    - the mapping is undefined for the character; nothing
                     was written and no exception is set
     enc_EXCEPTION - the lookup or the output resize failed */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
8080/* lookup the character, put the result in the output string and adjust
8081   various state variables. Resize the output bytes object if not enough
8082   space is available. Return a new reference to the object that
8083   was put in the output buffer, or Py_None, if the mapping was undefined
8084   (in which case no character was written) or NULL, if a
8085   reallocation error occurred. The caller must decref the result */
8086static charmapencode_result
8087charmapencode_output(Py_UNICODE c, PyObject *mapping,
8088                     PyObject **outobj, Py_ssize_t *outpos)
8089{
8090    PyObject *rep;
8091    char *outstart;
8092    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8093
8094    if (Py_TYPE(mapping) == &EncodingMapType) {
8095        int res = encoding_map_lookup(c, mapping);
8096        Py_ssize_t requiredsize = *outpos+1;
8097        if (res == -1)
8098            return enc_FAILED;
8099        if (outsize<requiredsize)
8100            if (charmapencode_resize(outobj, outpos, requiredsize))
8101                return enc_EXCEPTION;
8102        outstart = PyBytes_AS_STRING(*outobj);
8103        outstart[(*outpos)++] = (char)res;
8104        return enc_SUCCESS;
8105    }
8106
8107    rep = charmapencode_lookup(c, mapping);
8108    if (rep==NULL)
8109        return enc_EXCEPTION;
8110    else if (rep==Py_None) {
8111        Py_DECREF(rep);
8112        return enc_FAILED;
8113    } else {
8114        if (PyLong_Check(rep)) {
8115            Py_ssize_t requiredsize = *outpos+1;
8116            if (outsize<requiredsize)
8117                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8118                    Py_DECREF(rep);
8119                    return enc_EXCEPTION;
8120                }
8121            outstart = PyBytes_AS_STRING(*outobj);
8122            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8123        }
8124        else {
8125            const char *repchars = PyBytes_AS_STRING(rep);
8126            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8127            Py_ssize_t requiredsize = *outpos+repsize;
8128            if (outsize<requiredsize)
8129                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8130                    Py_DECREF(rep);
8131                    return enc_EXCEPTION;
8132                }
8133            outstart = PyBytes_AS_STRING(*outobj);
8134            memcpy(outstart + *outpos, repchars, repsize);
8135            *outpos += repsize;
8136        }
8137    }
8138    Py_DECREF(rep);
8139    return enc_SUCCESS;
8140}
8141
8142/* handle an error in PyUnicode_EncodeCharmap
8143   Return 0 on success, -1 on error */
8144static int
8145charmap_encoding_error(
8146    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8147    PyObject **exceptionObject,
8148    int *known_errorHandler, PyObject **errorHandler, const char *errors,
8149    PyObject **res, Py_ssize_t *respos)
8150{
8151    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8152    Py_ssize_t size, repsize;
8153    Py_ssize_t newpos;
8154    Py_UNICODE *uni2;
8155    /* startpos for collecting unencodable chars */
8156    Py_ssize_t collstartpos = *inpos;
8157    Py_ssize_t collendpos = *inpos+1;
8158    Py_ssize_t collpos;
8159    char *encoding = "charmap";
8160    char *reason = "character maps to <undefined>";
8161    charmapencode_result x;
8162    Py_UCS4 ch;
8163    int val;
8164
8165    if (PyUnicode_READY(unicode) < 0)
8166        return -1;
8167    size = PyUnicode_GET_LENGTH(unicode);
8168    /* find all unencodable characters */
8169    while (collendpos < size) {
8170        PyObject *rep;
8171        if (Py_TYPE(mapping) == &EncodingMapType) {
8172            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8173            val = encoding_map_lookup(ch, mapping);
8174            if (val != -1)
8175                break;
8176            ++collendpos;
8177            continue;
8178        }
8179
8180        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8181        rep = charmapencode_lookup(ch, mapping);
8182        if (rep==NULL)
8183            return -1;
8184        else if (rep!=Py_None) {
8185            Py_DECREF(rep);
8186            break;
8187        }
8188        Py_DECREF(rep);
8189        ++collendpos;
8190    }
8191    /* cache callback name lookup
8192     * (if not done yet, i.e. it's the first error) */
8193    if (*known_errorHandler==-1) {
8194        if ((errors==NULL) || (!strcmp(errors, "strict")))
8195            *known_errorHandler = 1;
8196        else if (!strcmp(errors, "replace"))
8197            *known_errorHandler = 2;
8198        else if (!strcmp(errors, "ignore"))
8199            *known_errorHandler = 3;
8200        else if (!strcmp(errors, "xmlcharrefreplace"))
8201            *known_errorHandler = 4;
8202        else
8203            *known_errorHandler = 0;
8204    }
8205    switch (*known_errorHandler) {
8206    case 1: /* strict */
8207        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8208        return -1;
8209    case 2: /* replace */
8210        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8211            x = charmapencode_output('?', mapping, res, respos);
8212            if (x==enc_EXCEPTION) {
8213                return -1;
8214            }
8215            else if (x==enc_FAILED) {
8216                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8217                return -1;
8218            }
8219        }
8220        /* fall through */
8221    case 3: /* ignore */
8222        *inpos = collendpos;
8223        break;
8224    case 4: /* xmlcharrefreplace */
8225        /* generate replacement (temporarily (mis)uses p) */
8226        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8227            char buffer[2+29+1+1];
8228            char *cp;
8229            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8230            for (cp = buffer; *cp; ++cp) {
8231                x = charmapencode_output(*cp, mapping, res, respos);
8232                if (x==enc_EXCEPTION)
8233                    return -1;
8234                else if (x==enc_FAILED) {
8235                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8236                    return -1;
8237                }
8238            }
8239        }
8240        *inpos = collendpos;
8241        break;
8242    default:
8243        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
8244                                                      encoding, reason, unicode, exceptionObject,
8245                                                      collstartpos, collendpos, &newpos);
8246        if (repunicode == NULL)
8247            return -1;
8248        if (PyBytes_Check(repunicode)) {
8249            /* Directly copy bytes result to output. */
8250            Py_ssize_t outsize = PyBytes_Size(*res);
8251            Py_ssize_t requiredsize;
8252            repsize = PyBytes_Size(repunicode);
8253            requiredsize = *respos + repsize;
8254            if (requiredsize > outsize)
8255                /* Make room for all additional bytes. */
8256                if (charmapencode_resize(res, respos, requiredsize)) {
8257                    Py_DECREF(repunicode);
8258                    return -1;
8259                }
8260            memcpy(PyBytes_AsString(*res) + *respos,
8261                   PyBytes_AsString(repunicode),  repsize);
8262            *respos += repsize;
8263            *inpos = newpos;
8264            Py_DECREF(repunicode);
8265            break;
8266        }
8267        /* generate replacement  */
8268        repsize = PyUnicode_GET_SIZE(repunicode);
8269        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8270            x = charmapencode_output(*uni2, mapping, res, respos);
8271            if (x==enc_EXCEPTION) {
8272                return -1;
8273            }
8274            else if (x==enc_FAILED) {
8275                Py_DECREF(repunicode);
8276                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8277                return -1;
8278            }
8279        }
8280        *inpos = newpos;
8281        Py_DECREF(repunicode);
8282    }
8283    return 0;
8284}
8285
8286PyObject *
8287_PyUnicode_EncodeCharmap(PyObject *unicode,
8288                         PyObject *mapping,
8289                         const char *errors)
8290{
8291    /* output object */
8292    PyObject *res = NULL;
8293    /* current input position */
8294    Py_ssize_t inpos = 0;
8295    Py_ssize_t size;
8296    /* current output position */
8297    Py_ssize_t respos = 0;
8298    PyObject *errorHandler = NULL;
8299    PyObject *exc = NULL;
8300    /* the following variable is used for caching string comparisons
8301     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8302     * 3=ignore, 4=xmlcharrefreplace */
8303    int known_errorHandler = -1;
8304
8305    if (PyUnicode_READY(unicode) < 0)
8306        return NULL;
8307    size = PyUnicode_GET_LENGTH(unicode);
8308
8309    /* Default to Latin-1 */
8310    if (mapping == NULL)
8311        return unicode_encode_ucs1(unicode, errors, 256);
8312
8313    /* allocate enough for a simple encoding without
8314       replacements, if we need more, we'll resize */
8315    res = PyBytes_FromStringAndSize(NULL, size);
8316    if (res == NULL)
8317        goto onError;
8318    if (size == 0)
8319        return res;
8320
8321    while (inpos<size) {
8322        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
8323        /* try to encode it */
8324        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8325        if (x==enc_EXCEPTION) /* error */
8326            goto onError;
8327        if (x==enc_FAILED) { /* unencodable character */
8328            if (charmap_encoding_error(unicode, &inpos, mapping,
8329                                       &exc,
8330                                       &known_errorHandler, &errorHandler, errors,
8331                                       &res, &respos)) {
8332                goto onError;
8333            }
8334        }
8335        else
8336            /* done with this character => adjust input position */
8337            ++inpos;
8338    }
8339
8340    /* Resize if we allocated to much */
8341    if (respos<PyBytes_GET_SIZE(res))
8342        if (_PyBytes_Resize(&res, respos) < 0)
8343            goto onError;
8344
8345    Py_XDECREF(exc);
8346    Py_XDECREF(errorHandler);
8347    return res;
8348
8349  onError:
8350    Py_XDECREF(res);
8351    Py_XDECREF(exc);
8352    Py_XDECREF(errorHandler);
8353    return NULL;
8354}
8355
8356/* Deprecated */
8357PyObject *
8358PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8359                        Py_ssize_t size,
8360                        PyObject *mapping,
8361                        const char *errors)
8362{
8363    PyObject *result;
8364    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8365    if (unicode == NULL)
8366        return NULL;
8367    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8368    Py_DECREF(unicode);
8369    return result;
8370}
8371
8372PyObject *
8373PyUnicode_AsCharmapString(PyObject *unicode,
8374                          PyObject *mapping)
8375{
8376    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8377        PyErr_BadArgument();
8378        return NULL;
8379    }
8380    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8381}
8382
8383/* create or adjust a UnicodeTranslateError */
8384static void
8385make_translate_exception(PyObject **exceptionObject,
8386                         PyObject *unicode,
8387                         Py_ssize_t startpos, Py_ssize_t endpos,
8388                         const char *reason)
8389{
8390    if (*exceptionObject == NULL) {
8391        *exceptionObject = _PyUnicodeTranslateError_Create(
8392            unicode, startpos, endpos, reason);
8393    }
8394    else {
8395        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8396            goto onError;
8397        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8398            goto onError;
8399        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8400            goto onError;
8401        return;
8402      onError:
8403        Py_DECREF(*exceptionObject);
8404        *exceptionObject = NULL;
8405    }
8406}
8407
8408/* raises a UnicodeTranslateError */
8409static void
8410raise_translate_exception(PyObject **exceptionObject,
8411                          PyObject *unicode,
8412                          Py_ssize_t startpos, Py_ssize_t endpos,
8413                          const char *reason)
8414{
8415    make_translate_exception(exceptionObject,
8416                             unicode, startpos, endpos, reason);
8417    if (*exceptionObject != NULL)
8418        PyCodec_StrictErrors(*exceptionObject);
8419}
8420
8421/* error handling callback helper:
8422   build arguments, call the callback and check the arguments,
8423   put the result into newpos and return the replacement string, which
8424   has to be freed by the caller */
8425static PyObject *
8426unicode_translate_call_errorhandler(const char *errors,
8427                                    PyObject **errorHandler,
8428                                    const char *reason,
8429                                    PyObject *unicode, PyObject **exceptionObject,
8430                                    Py_ssize_t startpos, Py_ssize_t endpos,
8431                                    Py_ssize_t *newpos)
8432{
8433    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8434
8435    Py_ssize_t i_newpos;
8436    PyObject *restuple;
8437    PyObject *resunicode;
8438
8439    if (*errorHandler == NULL) {
8440        *errorHandler = PyCodec_LookupError(errors);
8441        if (*errorHandler == NULL)
8442            return NULL;
8443    }
8444
8445    make_translate_exception(exceptionObject,
8446                             unicode, startpos, endpos, reason);
8447    if (*exceptionObject == NULL)
8448        return NULL;
8449
8450    restuple = PyObject_CallFunctionObjArgs(
8451        *errorHandler, *exceptionObject, NULL);
8452    if (restuple == NULL)
8453        return NULL;
8454    if (!PyTuple_Check(restuple)) {
8455        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8456        Py_DECREF(restuple);
8457        return NULL;
8458    }
8459    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8460                          &resunicode, &i_newpos)) {
8461        Py_DECREF(restuple);
8462        return NULL;
8463    }
8464    if (i_newpos<0)
8465        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8466    else
8467        *newpos = i_newpos;
8468    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8469        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8470        Py_DECREF(restuple);
8471        return NULL;
8472    }
8473    Py_INCREF(resunicode);
8474    Py_DECREF(restuple);
8475    return resunicode;
8476}
8477
8478/* Lookup the character ch in the mapping and put the result in result,
8479   which must be decrefed by the caller.
8480   Return 0 on success, -1 on error */
8481static int
8482charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8483{
8484    PyObject *w = PyLong_FromLong((long)c);
8485    PyObject *x;
8486
8487    if (w == NULL)
8488        return -1;
8489    x = PyObject_GetItem(mapping, w);
8490    Py_DECREF(w);
8491    if (x == NULL) {
8492        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8493            /* No mapping found means: use 1:1 mapping. */
8494            PyErr_Clear();
8495            *result = NULL;
8496            return 0;
8497        } else
8498            return -1;
8499    }
8500    else if (x == Py_None) {
8501        *result = x;
8502        return 0;
8503    }
8504    else if (PyLong_Check(x)) {
8505        long value = PyLong_AS_LONG(x);
8506        long max = PyUnicode_GetMax();
8507        if (value < 0 || value > max) {
8508            PyErr_Format(PyExc_TypeError,
8509                         "character mapping must be in range(0x%x)", max+1);
8510            Py_DECREF(x);
8511            return -1;
8512        }
8513        *result = x;
8514        return 0;
8515    }
8516    else if (PyUnicode_Check(x)) {
8517        *result = x;
8518        return 0;
8519    }
8520    else {
8521        /* wrong return value */
8522        PyErr_SetString(PyExc_TypeError,
8523                        "character mapping must return integer, None or str");
8524        Py_DECREF(x);
8525        return -1;
8526    }
8527}
8528/* ensure that *outobj is at least requiredsize characters long,
8529   if not reallocate and adjust various state variables.
8530   Return 0 on success, -1 on error */
8531static int
8532charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8533                               Py_ssize_t requiredsize)
8534{
8535    Py_ssize_t oldsize = *psize;
8536    if (requiredsize > oldsize) {
8537        /* exponentially overallocate to minimize reallocations */
8538        if (requiredsize < 2 * oldsize)
8539            requiredsize = 2 * oldsize;
8540        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8541        if (*outobj == 0)
8542            return -1;
8543        *psize = requiredsize;
8544    }
8545    return 0;
8546}
8547/* lookup the character, put the result in the output string and adjust
8548   various state variables. Return a new reference to the object that
8549   was put in the output buffer in *result, or Py_None, if the mapping was
8550   undefined (in which case no character was written).
8551   The called must decref result.
8552   Return 0 on success, -1 on error. */
8553static int
8554charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8555                        PyObject *mapping, Py_UCS4 **output,
8556                        Py_ssize_t *osize, Py_ssize_t *opos,
8557                        PyObject **res)
8558{
8559    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8560    if (charmaptranslate_lookup(curinp, mapping, res))
8561        return -1;
8562    if (*res==NULL) {
8563        /* not found => default to 1:1 mapping */
8564        (*output)[(*opos)++] = curinp;
8565    }
8566    else if (*res==Py_None)
8567        ;
8568    else if (PyLong_Check(*res)) {
8569        /* no overflow check, because we know that the space is enough */
8570        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8571    }
8572    else if (PyUnicode_Check(*res)) {
8573        Py_ssize_t repsize;
8574        if (PyUnicode_READY(*res) == -1)
8575            return -1;
8576        repsize = PyUnicode_GET_LENGTH(*res);
8577        if (repsize==1) {
8578            /* no overflow check, because we know that the space is enough */
8579            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8580        }
8581        else if (repsize!=0) {
8582            /* more than one character */
8583            Py_ssize_t requiredsize = *opos +
8584                (PyUnicode_GET_LENGTH(input) - ipos) +
8585                repsize - 1;
8586            Py_ssize_t i;
8587            if (charmaptranslate_makespace(output, osize, requiredsize))
8588                return -1;
8589            for(i = 0; i < repsize; i++)
8590                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8591        }
8592    }
8593    else
8594        return -1;
8595    return 0;
8596}
8597
8598PyObject *
8599_PyUnicode_TranslateCharmap(PyObject *input,
8600                            PyObject *mapping,
8601                            const char *errors)
8602{
8603    /* input object */
8604    char *idata;
8605    Py_ssize_t size, i;
8606    int kind;
8607    /* output buffer */
8608    Py_UCS4 *output = NULL;
8609    Py_ssize_t osize;
8610    PyObject *res;
8611    /* current output position */
8612    Py_ssize_t opos;
8613    char *reason = "character maps to <undefined>";
8614    PyObject *errorHandler = NULL;
8615    PyObject *exc = NULL;
8616    /* the following variable is used for caching string comparisons
8617     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8618     * 3=ignore, 4=xmlcharrefreplace */
8619    int known_errorHandler = -1;
8620
8621    if (mapping == NULL) {
8622        PyErr_BadArgument();
8623        return NULL;
8624    }
8625
8626    if (PyUnicode_READY(input) == -1)
8627        return NULL;
8628    idata = (char*)PyUnicode_DATA(input);
8629    kind = PyUnicode_KIND(input);
8630    size = PyUnicode_GET_LENGTH(input);
8631    i = 0;
8632
8633    if (size == 0) {
8634        Py_INCREF(input);
8635        return input;
8636    }
8637
8638    /* allocate enough for a simple 1:1 translation without
8639       replacements, if we need more, we'll resize */
8640    osize = size;
8641    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8642    opos = 0;
8643    if (output == NULL) {
8644        PyErr_NoMemory();
8645        goto onError;
8646    }
8647
8648    while (i<size) {
8649        /* try to encode it */
8650        PyObject *x = NULL;
8651        if (charmaptranslate_output(input, i, mapping,
8652                                    &output, &osize, &opos, &x)) {
8653            Py_XDECREF(x);
8654            goto onError;
8655        }
8656        Py_XDECREF(x);
8657        if (x!=Py_None) /* it worked => adjust input pointer */
8658            ++i;
8659        else { /* untranslatable character */
8660            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8661            Py_ssize_t repsize;
8662            Py_ssize_t newpos;
8663            Py_ssize_t uni2;
8664            /* startpos for collecting untranslatable chars */
8665            Py_ssize_t collstart = i;
8666            Py_ssize_t collend = i+1;
8667            Py_ssize_t coll;
8668
8669            /* find all untranslatable characters */
8670            while (collend < size) {
8671                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8672                    goto onError;
8673                Py_XDECREF(x);
8674                if (x!=Py_None)
8675                    break;
8676                ++collend;
8677            }
8678            /* cache callback name lookup
8679             * (if not done yet, i.e. it's the first error) */
8680            if (known_errorHandler==-1) {
8681                if ((errors==NULL) || (!strcmp(errors, "strict")))
8682                    known_errorHandler = 1;
8683                else if (!strcmp(errors, "replace"))
8684                    known_errorHandler = 2;
8685                else if (!strcmp(errors, "ignore"))
8686                    known_errorHandler = 3;
8687                else if (!strcmp(errors, "xmlcharrefreplace"))
8688                    known_errorHandler = 4;
8689                else
8690                    known_errorHandler = 0;
8691            }
8692            switch (known_errorHandler) {
8693            case 1: /* strict */
8694                raise_translate_exception(&exc, input, collstart,
8695                                          collend, reason);
8696                goto onError;
8697            case 2: /* replace */
8698                /* No need to check for space, this is a 1:1 replacement */
8699                for (coll = collstart; coll<collend; coll++)
8700                    output[opos++] = '?';
8701                /* fall through */
8702            case 3: /* ignore */
8703                i = collend;
8704                break;
8705            case 4: /* xmlcharrefreplace */
8706                /* generate replacement (temporarily (mis)uses i) */
8707                for (i = collstart; i < collend; ++i) {
8708                    char buffer[2+29+1+1];
8709                    char *cp;
8710                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8711                    if (charmaptranslate_makespace(&output, &osize,
8712                                                   opos+strlen(buffer)+(size-collend)))
8713                        goto onError;
8714                    for (cp = buffer; *cp; ++cp)
8715                        output[opos++] = *cp;
8716                }
8717                i = collend;
8718                break;
8719            default:
8720                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8721                                                                 reason, input, &exc,
8722                                                                 collstart, collend, &newpos);
8723                if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
8724                    goto onError;
8725                /* generate replacement  */
8726                repsize = PyUnicode_GET_LENGTH(repunicode);
8727                if (charmaptranslate_makespace(&output, &osize,
8728                                               opos+repsize+(size-collend))) {
8729                    Py_DECREF(repunicode);
8730                    goto onError;
8731                }
8732                for (uni2 = 0; repsize-->0; ++uni2)
8733                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8734                i = newpos;
8735                Py_DECREF(repunicode);
8736            }
8737        }
8738    }
8739    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8740    if (!res)
8741        goto onError;
8742    PyMem_Free(output);
8743    Py_XDECREF(exc);
8744    Py_XDECREF(errorHandler);
8745    return res;
8746
8747  onError:
8748    PyMem_Free(output);
8749    Py_XDECREF(exc);
8750    Py_XDECREF(errorHandler);
8751    return NULL;
8752}
8753
8754/* Deprecated. Use PyUnicode_Translate instead. */
8755PyObject *
8756PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8757                           Py_ssize_t size,
8758                           PyObject *mapping,
8759                           const char *errors)
8760{
8761    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8762    if (!unicode)
8763        return NULL;
8764    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8765}
8766
8767PyObject *
8768PyUnicode_Translate(PyObject *str,
8769                    PyObject *mapping,
8770                    const char *errors)
8771{
8772    PyObject *result;
8773
8774    str = PyUnicode_FromObject(str);
8775    if (str == NULL)
8776        goto onError;
8777    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8778    Py_DECREF(str);
8779    return result;
8780
8781  onError:
8782    Py_XDECREF(str);
8783    return NULL;
8784}
8785
8786static Py_UCS4
8787fix_decimal_and_space_to_ascii(PyObject *self)
8788{
8789    /* No need to call PyUnicode_READY(self) because this function is only
8790       called as a callback from fixup() which does it already. */
8791    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8792    const int kind = PyUnicode_KIND(self);
8793    void *data = PyUnicode_DATA(self);
8794    Py_UCS4 maxchar = 0, ch, fixed;
8795    Py_ssize_t i;
8796
8797    for (i = 0; i < len; ++i) {
8798        ch = PyUnicode_READ(kind, data, i);
8799        fixed = 0;
8800        if (ch > 127) {
8801            if (Py_UNICODE_ISSPACE(ch))
8802                fixed = ' ';
8803            else {
8804                const int decimal = Py_UNICODE_TODECIMAL(ch);
8805                if (decimal >= 0)
8806                    fixed = '0' + decimal;
8807            }
8808            if (fixed != 0) {
8809                if (fixed > maxchar)
8810                    maxchar = fixed;
8811                PyUnicode_WRITE(kind, data, i, fixed);
8812            }
8813            else if (ch > maxchar)
8814                maxchar = ch;
8815        }
8816        else if (ch > maxchar)
8817            maxchar = ch;
8818    }
8819
8820    return maxchar;
8821}
8822
8823PyObject *
8824_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8825{
8826    if (!PyUnicode_Check(unicode)) {
8827        PyErr_BadInternalCall();
8828        return NULL;
8829    }
8830    if (PyUnicode_READY(unicode) == -1)
8831        return NULL;
8832    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8833        /* If the string is already ASCII, just return the same string */
8834        Py_INCREF(unicode);
8835        return unicode;
8836    }
8837    return fixup(unicode, fix_decimal_and_space_to_ascii);
8838}
8839
8840PyObject *
8841PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8842                                  Py_ssize_t length)
8843{
8844    PyObject *result;
8845    Py_UNICODE *p; /* write pointer into result */
8846    Py_ssize_t i;
8847    /* Copy to a new string */
8848    result = (PyObject *)_PyUnicode_New(length);
8849    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8850    if (result == NULL)
8851        return result;
8852    p = PyUnicode_AS_UNICODE(result);
8853    /* Iterate over code points */
8854    for (i = 0; i < length; i++) {
8855        Py_UNICODE ch =s[i];
8856        if (ch > 127) {
8857            int decimal = Py_UNICODE_TODECIMAL(ch);
8858            if (decimal >= 0)
8859                p[i] = '0' + decimal;
8860        }
8861    }
8862#ifndef DONT_MAKE_RESULT_READY
8863    if (_PyUnicode_READY_REPLACE(&result)) {
8864        Py_DECREF(result);
8865        return NULL;
8866    }
8867#endif
8868    assert(_PyUnicode_CheckConsistency(result, 1));
8869    return result;
8870}
8871/* --- Decimal Encoder ---------------------------------------------------- */
8872
8873int
8874PyUnicode_EncodeDecimal(Py_UNICODE *s,
8875                        Py_ssize_t length,
8876                        char *output,
8877                        const char *errors)
8878{
8879    Py_UNICODE *p, *end;
8880    PyObject *errorHandler = NULL;
8881    PyObject *exc = NULL;
8882    PyObject *unicode;
8883    const char *encoding = "decimal";
8884    const char *reason = "invalid decimal Unicode string";
8885    /* the following variable is used for caching string comparisons
8886     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8887    int known_errorHandler = -1;
8888
8889    if (output == NULL) {
8890        PyErr_BadArgument();
8891        return -1;
8892    }
8893
8894    p = s;
8895    end = s + length;
8896    while (p < end) {
8897        register Py_UNICODE ch = *p;
8898        int decimal;
8899        PyObject *repunicode;
8900        Py_ssize_t repsize;
8901        Py_ssize_t newpos;
8902        Py_UNICODE *uni2;
8903        Py_UNICODE *collstart;
8904        Py_UNICODE *collend;
8905
8906        if (Py_UNICODE_ISSPACE(ch)) {
8907            *output++ = ' ';
8908            ++p;
8909            continue;
8910        }
8911        decimal = Py_UNICODE_TODECIMAL(ch);
8912        if (decimal >= 0) {
8913            *output++ = '0' + decimal;
8914            ++p;
8915            continue;
8916        }
8917        if (0 < ch && ch < 256) {
8918            *output++ = (char)ch;
8919            ++p;
8920            continue;
8921        }
8922        /* All other characters are considered unencodable */
8923        collstart = p;
8924        collend = p+1;
8925        while (collend < end) {
8926            if ((0 < *collend && *collend < 256) ||
8927                !Py_UNICODE_ISSPACE(*collend) ||
8928                Py_UNICODE_TODECIMAL(*collend))
8929                break;
8930        }
8931        /* cache callback name lookup
8932         * (if not done yet, i.e. it's the first error) */
8933        if (known_errorHandler==-1) {
8934            if ((errors==NULL) || (!strcmp(errors, "strict")))
8935                known_errorHandler = 1;
8936            else if (!strcmp(errors, "replace"))
8937                known_errorHandler = 2;
8938            else if (!strcmp(errors, "ignore"))
8939                known_errorHandler = 3;
8940            else if (!strcmp(errors, "xmlcharrefreplace"))
8941                known_errorHandler = 4;
8942            else
8943                known_errorHandler = 0;
8944        }
8945        switch (known_errorHandler) {
8946        case 1: /* strict */
8947            unicode = PyUnicode_FromUnicode(s, length);
8948            if (unicode == NULL)
8949                goto onError;
8950            raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason);
8951            Py_DECREF(unicode);
8952            goto onError;
8953        case 2: /* replace */
8954            for (p = collstart; p < collend; ++p)
8955                *output++ = '?';
8956            /* fall through */
8957        case 3: /* ignore */
8958            p = collend;
8959            break;
8960        case 4: /* xmlcharrefreplace */
8961            /* generate replacement (temporarily (mis)uses p) */
8962            for (p = collstart; p < collend; ++p)
8963                output += sprintf(output, "&#%d;", (int)*p);
8964            p = collend;
8965            break;
8966        default:
8967            unicode = PyUnicode_FromUnicode(s, length);
8968            if (unicode == NULL)
8969                goto onError;
8970            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8971                                                          encoding, reason, unicode, &exc,
8972                                                          collstart-s, collend-s, &newpos);
8973            Py_DECREF(unicode);
8974            if (repunicode == NULL)
8975                goto onError;
8976            if (!PyUnicode_Check(repunicode)) {
8977                /* Byte results not supported, since they have no decimal property. */
8978                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8979                Py_DECREF(repunicode);
8980                goto onError;
8981            }
8982            /* generate replacement  */
8983            repsize = PyUnicode_GET_SIZE(repunicode);
8984            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8985                Py_UNICODE ch = *uni2;
8986                if (Py_UNICODE_ISSPACE(ch))
8987                    *output++ = ' ';
8988                else {
8989                    decimal = Py_UNICODE_TODECIMAL(ch);
8990                    if (decimal >= 0)
8991                        *output++ = '0' + decimal;
8992                    else if (0 < ch && ch < 256)
8993                        *output++ = (char)ch;
8994                    else {
8995                        Py_DECREF(repunicode);
8996                        unicode = PyUnicode_FromUnicode(s, length);
8997                        if (unicode == NULL)
8998                            goto onError;
8999                        raise_encode_exception(&exc, encoding,
9000                                               unicode, collstart-s, collend-s, reason);
9001                        Py_DECREF(unicode);
9002                        goto onError;
9003                    }
9004                }
9005            }
9006            p = s + newpos;
9007            Py_DECREF(repunicode);
9008        }
9009    }
9010    /* 0-terminate the output string */
9011    *output++ = '\0';
9012    Py_XDECREF(exc);
9013    Py_XDECREF(errorHandler);
9014    return 0;
9015
9016  onError:
9017    Py_XDECREF(exc);
9018    Py_XDECREF(errorHandler);
9019    return -1;
9020}
9021
9022/* --- Helpers ------------------------------------------------------------ */
9023
9024static Py_ssize_t
9025any_find_slice(int direction, PyObject* s1, PyObject* s2,
9026               Py_ssize_t start,
9027               Py_ssize_t end)
9028{
9029    int kind1, kind2, kind;
9030    void *buf1, *buf2;
9031    Py_ssize_t len1, len2, result;
9032
9033    kind1 = PyUnicode_KIND(s1);
9034    kind2 = PyUnicode_KIND(s2);
9035    kind = kind1 > kind2 ? kind1 : kind2;
9036    buf1 = PyUnicode_DATA(s1);
9037    buf2 = PyUnicode_DATA(s2);
9038    if (kind1 != kind)
9039        buf1 = _PyUnicode_AsKind(s1, kind);
9040    if (!buf1)
9041        return -2;
9042    if (kind2 != kind)
9043        buf2 = _PyUnicode_AsKind(s2, kind);
9044    if (!buf2) {
9045        if (kind1 != kind) PyMem_Free(buf1);
9046        return -2;
9047    }
9048    len1 = PyUnicode_GET_LENGTH(s1);
9049    len2 = PyUnicode_GET_LENGTH(s2);
9050
9051    if (direction > 0) {
9052        switch(kind) {
9053        case PyUnicode_1BYTE_KIND:
9054            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9055                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9056            else
9057                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9058            break;
9059        case PyUnicode_2BYTE_KIND:
9060            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9061            break;
9062        case PyUnicode_4BYTE_KIND:
9063            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9064            break;
9065        default:
9066            assert(0); result = -2;
9067        }
9068    }
9069    else {
9070        switch(kind) {
9071        case PyUnicode_1BYTE_KIND:
9072            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9073                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9074            else
9075                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9076            break;
9077        case PyUnicode_2BYTE_KIND:
9078            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9079            break;
9080        case PyUnicode_4BYTE_KIND:
9081            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9082            break;
9083        default:
9084            assert(0); result = -2;
9085        }
9086    }
9087
9088    if (kind1 != kind)
9089        PyMem_Free(buf1);
9090    if (kind2 != kind)
9091        PyMem_Free(buf2);
9092
9093    return result;
9094}
9095
9096Py_ssize_t
9097_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
9098                                   Py_ssize_t n_buffer,
9099                                   void *digits, Py_ssize_t n_digits,
9100                                   Py_ssize_t min_width,
9101                                   const char *grouping,
9102                                   const char *thousands_sep)
9103{
9104    switch(kind) {
9105    case PyUnicode_1BYTE_KIND:
9106        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9107            return _PyUnicode_ascii_InsertThousandsGrouping(
9108                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9109                min_width, grouping, thousands_sep);
9110        else
9111            return _PyUnicode_ucs1_InsertThousandsGrouping(
9112                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9113                min_width, grouping, thousands_sep);
9114    case PyUnicode_2BYTE_KIND:
9115        return _PyUnicode_ucs2_InsertThousandsGrouping(
9116            (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9117            min_width, grouping, thousands_sep);
9118    case PyUnicode_4BYTE_KIND:
9119        return _PyUnicode_ucs4_InsertThousandsGrouping(
9120            (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9121            min_width, grouping, thousands_sep);
9122    }
9123    assert(0);
9124    return -1;
9125}
9126
9127
/* Helper macro to fix up start/end slice values in place.

   - end greater than len is clamped to len;
   - a negative end or start has len added to it and, if still negative,
     is set to 0.
   Note that start is NOT clamped to len, so start may remain greater than
   end (or len) afterwards; callers must cope with an empty/negative range.

   start and end are assigned to and therefore must be modifiable lvalues.
   Every argument is expanded several times, so it must be free of side
   effects.  The expansion is a bare sequence of if statements (it is not
   wrapped in do { } while (0)), so only use it as a full statement inside
   a braced block. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
9142
9143Py_ssize_t
9144PyUnicode_Count(PyObject *str,
9145                PyObject *substr,
9146                Py_ssize_t start,
9147                Py_ssize_t end)
9148{
9149    Py_ssize_t result;
9150    PyObject* str_obj;
9151    PyObject* sub_obj;
9152    int kind1, kind2, kind;
9153    void *buf1 = NULL, *buf2 = NULL;
9154    Py_ssize_t len1, len2;
9155
9156    str_obj = PyUnicode_FromObject(str);
9157    if (!str_obj || PyUnicode_READY(str_obj) == -1)
9158        return -1;
9159    sub_obj = PyUnicode_FromObject(substr);
9160    if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
9161        Py_DECREF(str_obj);
9162        return -1;
9163    }
9164
9165    kind1 = PyUnicode_KIND(str_obj);
9166    kind2 = PyUnicode_KIND(sub_obj);
9167    kind = kind1 > kind2 ? kind1 : kind2;
9168    buf1 = PyUnicode_DATA(str_obj);
9169    if (kind1 != kind)
9170        buf1 = _PyUnicode_AsKind(str_obj, kind);
9171    if (!buf1)
9172        goto onError;
9173    buf2 = PyUnicode_DATA(sub_obj);
9174    if (kind2 != kind)
9175        buf2 = _PyUnicode_AsKind(sub_obj, kind);
9176    if (!buf2)
9177        goto onError;
9178    len1 = PyUnicode_GET_LENGTH(str_obj);
9179    len2 = PyUnicode_GET_LENGTH(sub_obj);
9180
9181    ADJUST_INDICES(start, end, len1);
9182    switch(kind) {
9183    case PyUnicode_1BYTE_KIND:
9184        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9185            result = asciilib_count(
9186                ((Py_UCS1*)buf1) + start, end - start,
9187                buf2, len2, PY_SSIZE_T_MAX
9188                );
9189        else
9190            result = ucs1lib_count(
9191                ((Py_UCS1*)buf1) + start, end - start,
9192                buf2, len2, PY_SSIZE_T_MAX
9193                );
9194        break;
9195    case PyUnicode_2BYTE_KIND:
9196        result = ucs2lib_count(
9197            ((Py_UCS2*)buf1) + start, end - start,
9198            buf2, len2, PY_SSIZE_T_MAX
9199            );
9200        break;
9201    case PyUnicode_4BYTE_KIND:
9202        result = ucs4lib_count(
9203            ((Py_UCS4*)buf1) + start, end - start,
9204            buf2, len2, PY_SSIZE_T_MAX
9205            );
9206        break;
9207    default:
9208        assert(0); result = 0;
9209    }
9210
9211    Py_DECREF(sub_obj);
9212    Py_DECREF(str_obj);
9213
9214    if (kind1 != kind)
9215        PyMem_Free(buf1);
9216    if (kind2 != kind)
9217        PyMem_Free(buf2);
9218
9219    return result;
9220  onError:
9221    Py_DECREF(sub_obj);
9222    Py_DECREF(str_obj);
9223    if (kind1 != kind && buf1)
9224        PyMem_Free(buf1);
9225    if (kind2 != kind && buf2)
9226        PyMem_Free(buf2);
9227    return -1;
9228}
9229
9230Py_ssize_t
9231PyUnicode_Find(PyObject *str,
9232               PyObject *sub,
9233               Py_ssize_t start,
9234               Py_ssize_t end,
9235               int direction)
9236{
9237    Py_ssize_t result;
9238
9239    str = PyUnicode_FromObject(str);
9240    if (!str || PyUnicode_READY(str) == -1)
9241        return -2;
9242    sub = PyUnicode_FromObject(sub);
9243    if (!sub || PyUnicode_READY(sub) == -1) {
9244        Py_DECREF(str);
9245        return -2;
9246    }
9247
9248    result = any_find_slice(direction,
9249        str, sub, start, end
9250        );
9251
9252    Py_DECREF(str);
9253    Py_DECREF(sub);
9254
9255    return result;
9256}
9257
9258Py_ssize_t
9259PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9260                   Py_ssize_t start, Py_ssize_t end,
9261                   int direction)
9262{
9263    int kind;
9264    Py_ssize_t result;
9265    if (PyUnicode_READY(str) == -1)
9266        return -2;
9267    if (start < 0 || end < 0) {
9268        PyErr_SetString(PyExc_IndexError, "string index out of range");
9269        return -2;
9270    }
9271    if (end > PyUnicode_GET_LENGTH(str))
9272        end = PyUnicode_GET_LENGTH(str);
9273    kind = PyUnicode_KIND(str);
9274    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9275                      kind, end-start, ch, direction);
9276    if (result == -1)
9277        return -1;
9278    else
9279        return start + result;
9280}
9281
9282static int
9283tailmatch(PyObject *self,
9284          PyObject *substring,
9285          Py_ssize_t start,
9286          Py_ssize_t end,
9287          int direction)
9288{
9289    int kind_self;
9290    int kind_sub;
9291    void *data_self;
9292    void *data_sub;
9293    Py_ssize_t offset;
9294    Py_ssize_t i;
9295    Py_ssize_t end_sub;
9296
9297    if (PyUnicode_READY(self) == -1 ||
9298        PyUnicode_READY(substring) == -1)
9299        return 0;
9300
9301    if (PyUnicode_GET_LENGTH(substring) == 0)
9302        return 1;
9303
9304    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9305    end -= PyUnicode_GET_LENGTH(substring);
9306    if (end < start)
9307        return 0;
9308
9309    kind_self = PyUnicode_KIND(self);
9310    data_self = PyUnicode_DATA(self);
9311    kind_sub = PyUnicode_KIND(substring);
9312    data_sub = PyUnicode_DATA(substring);
9313    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9314
9315    if (direction > 0)
9316        offset = end;
9317    else
9318        offset = start;
9319
9320    if (PyUnicode_READ(kind_self, data_self, offset) ==
9321        PyUnicode_READ(kind_sub, data_sub, 0) &&
9322        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9323        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9324        /* If both are of the same kind, memcmp is sufficient */
9325        if (kind_self == kind_sub) {
9326            return ! memcmp((char *)data_self +
9327                                (offset * PyUnicode_KIND(substring)),
9328                            data_sub,
9329                            PyUnicode_GET_LENGTH(substring) *
9330                                PyUnicode_KIND(substring));
9331        }
9332        /* otherwise we have to compare each character by first accesing it */
9333        else {
9334            /* We do not need to compare 0 and len(substring)-1 because
9335               the if statement above ensured already that they are equal
9336               when we end up here. */
9337            // TODO: honor direction and do a forward or backwards search
9338            for (i = 1; i < end_sub; ++i) {
9339                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9340                    PyUnicode_READ(kind_sub, data_sub, i))
9341                    return 0;
9342            }
9343            return 1;
9344        }
9345    }
9346
9347    return 0;
9348}
9349
9350Py_ssize_t
9351PyUnicode_Tailmatch(PyObject *str,
9352                    PyObject *substr,
9353                    Py_ssize_t start,
9354                    Py_ssize_t end,
9355                    int direction)
9356{
9357    Py_ssize_t result;
9358
9359    str = PyUnicode_FromObject(str);
9360    if (str == NULL)
9361        return -1;
9362    substr = PyUnicode_FromObject(substr);
9363    if (substr == NULL) {
9364        Py_DECREF(str);
9365        return -1;
9366    }
9367
9368    result = tailmatch(str, substr,
9369                       start, end, direction);
9370    Py_DECREF(str);
9371    Py_DECREF(substr);
9372    return result;
9373}
9374
9375/* Apply fixfct filter to the Unicode object self and return a
9376   reference to the modified object */
9377
9378static PyObject *
9379fixup(PyObject *self,
9380      Py_UCS4 (*fixfct)(PyObject *s))
9381{
9382    PyObject *u;
9383    Py_UCS4 maxchar_old, maxchar_new = 0;
9384
9385    if (PyUnicode_READY(self) == -1)
9386        return NULL;
9387    maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9388    u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9389                      maxchar_old);
9390    if (u == NULL)
9391        return NULL;
9392
9393    Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
9394              PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
9395
9396    /* fix functions return the new maximum character in a string,
9397       if the kind of the resulting unicode object does not change,
9398       everything is fine.  Otherwise we need to change the string kind
9399       and re-run the fix function. */
9400    maxchar_new = fixfct(u);
9401    if (maxchar_new == 0)
9402        /* do nothing, keep maxchar_new at 0 which means no changes. */;
9403    else if (maxchar_new <= 127)
9404        maxchar_new = 127;
9405    else if (maxchar_new <= 255)
9406        maxchar_new = 255;
9407    else if (maxchar_new <= 65535)
9408        maxchar_new = 65535;
9409    else
9410        maxchar_new = 1114111; /* 0x10ffff */
9411
9412    if (!maxchar_new && PyUnicode_CheckExact(self)) {
9413        /* fixfct should return TRUE if it modified the buffer. If
9414           FALSE, return a reference to the original buffer instead
9415           (to save space, not time) */
9416        Py_INCREF(self);
9417        Py_DECREF(u);
9418        return self;
9419    }
9420    else if (maxchar_new == maxchar_old) {
9421        return u;
9422    }
9423    else {
9424        /* In case the maximum character changed, we need to
9425           convert the string to the new category. */
9426        PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9427        if (v == NULL) {
9428            Py_DECREF(u);
9429            return NULL;
9430        }
9431        if (maxchar_new > maxchar_old) {
9432            /* If the maxchar increased so that the kind changed, not all
9433               characters are representable anymore and we need to fix the
9434               string again. This only happens in very few cases. */
9435            copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9436            maxchar_old = fixfct(v);
9437            assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9438        }
9439        else {
9440            copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
9441        }
9442
9443        Py_DECREF(u);
9444        assert(_PyUnicode_CheckConsistency(v, 1));
9445        return v;
9446    }
9447}
9448
9449static Py_UCS4
9450fixupper(PyObject *self)
9451{
9452    /* No need to call PyUnicode_READY(self) because this function is only
9453       called as a callback from fixup() which does it already. */
9454    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9455    const int kind = PyUnicode_KIND(self);
9456    void *data = PyUnicode_DATA(self);
9457    int touched = 0;
9458    Py_UCS4 maxchar = 0;
9459    Py_ssize_t i;
9460
9461    for (i = 0; i < len; ++i) {
9462        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9463        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9464        if (up != ch) {
9465            if (up > maxchar)
9466                maxchar = up;
9467            PyUnicode_WRITE(kind, data, i, up);
9468            touched = 1;
9469        }
9470        else if (ch > maxchar)
9471            maxchar = ch;
9472    }
9473
9474    if (touched)
9475        return maxchar;
9476    else
9477        return 0;
9478}
9479
9480static Py_UCS4
9481fixlower(PyObject *self)
9482{
9483    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9484    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9485    const int kind = PyUnicode_KIND(self);
9486    void *data = PyUnicode_DATA(self);
9487    int touched = 0;
9488    Py_UCS4 maxchar = 0;
9489    Py_ssize_t i;
9490
9491    for(i = 0; i < len; ++i) {
9492        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9493        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9494        if (lo != ch) {
9495            if (lo > maxchar)
9496                maxchar = lo;
9497            PyUnicode_WRITE(kind, data, i, lo);
9498            touched = 1;
9499        }
9500        else if (ch > maxchar)
9501            maxchar = ch;
9502    }
9503
9504    if (touched)
9505        return maxchar;
9506    else
9507        return 0;
9508}
9509
9510static Py_UCS4
9511fixswapcase(PyObject *self)
9512{
9513    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9514    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9515    const int kind = PyUnicode_KIND(self);
9516    void *data = PyUnicode_DATA(self);
9517    int touched = 0;
9518    Py_UCS4 maxchar = 0;
9519    Py_ssize_t i;
9520
9521    for(i = 0; i < len; ++i) {
9522        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9523        Py_UCS4 nu = 0;
9524
9525        if (Py_UNICODE_ISUPPER(ch))
9526            nu = Py_UNICODE_TOLOWER(ch);
9527        else if (Py_UNICODE_ISLOWER(ch))
9528            nu = Py_UNICODE_TOUPPER(ch);
9529
9530        if (nu != 0) {
9531            if (nu > maxchar)
9532                maxchar = nu;
9533            PyUnicode_WRITE(kind, data, i, nu);
9534            touched = 1;
9535        }
9536        else if (ch > maxchar)
9537            maxchar = ch;
9538    }
9539
9540    if (touched)
9541        return maxchar;
9542    else
9543        return 0;
9544}
9545
9546static Py_UCS4
9547fixcapitalize(PyObject *self)
9548{
9549    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9550    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9551    const int kind = PyUnicode_KIND(self);
9552    void *data = PyUnicode_DATA(self);
9553    int touched = 0;
9554    Py_UCS4 maxchar = 0;
9555    Py_ssize_t i = 0;
9556    Py_UCS4 ch;
9557
9558    if (len == 0)
9559        return 0;
9560
9561    ch = PyUnicode_READ(kind, data, i);
9562    if (!Py_UNICODE_ISUPPER(ch)) {
9563        maxchar = Py_UNICODE_TOUPPER(ch);
9564        PyUnicode_WRITE(kind, data, i, maxchar);
9565        touched = 1;
9566    }
9567    ++i;
9568    for(; i < len; ++i) {
9569        ch = PyUnicode_READ(kind, data, i);
9570        if (!Py_UNICODE_ISLOWER(ch)) {
9571            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9572            if (lo > maxchar)
9573                maxchar = lo;
9574            PyUnicode_WRITE(kind, data, i, lo);
9575            touched = 1;
9576        }
9577        else if (ch > maxchar)
9578            maxchar = ch;
9579    }
9580
9581    if (touched)
9582        return maxchar;
9583    else
9584        return 0;
9585}
9586
9587static Py_UCS4
9588fixtitle(PyObject *self)
9589{
9590    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9591    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9592    const int kind = PyUnicode_KIND(self);
9593    void *data = PyUnicode_DATA(self);
9594    Py_UCS4 maxchar = 0;
9595    Py_ssize_t i = 0;
9596    int previous_is_cased;
9597
9598    /* Shortcut for single character strings */
9599    if (len == 1) {
9600        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9601        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9602        if (ti != ch) {
9603            PyUnicode_WRITE(kind, data, i, ti);
9604            return ti;
9605        }
9606        else
9607            return 0;
9608    }
9609    previous_is_cased = 0;
9610    for(; i < len; ++i) {
9611        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9612        Py_UCS4 nu;
9613
9614        if (previous_is_cased)
9615            nu = Py_UNICODE_TOLOWER(ch);
9616        else
9617            nu = Py_UNICODE_TOTITLE(ch);
9618
9619        if (nu > maxchar)
9620            maxchar = nu;
9621        PyUnicode_WRITE(kind, data, i, nu);
9622
9623        if (Py_UNICODE_ISLOWER(ch) ||
9624            Py_UNICODE_ISUPPER(ch) ||
9625            Py_UNICODE_ISTITLE(ch))
9626            previous_is_cased = 1;
9627        else
9628            previous_is_cased = 0;
9629    }
9630    return maxchar;
9631}
9632
9633PyObject *
9634PyUnicode_Join(PyObject *separator, PyObject *seq)
9635{
9636    PyObject *sep = NULL;
9637    Py_ssize_t seplen;
9638    PyObject *res = NULL; /* the result */
9639    PyObject *fseq;          /* PySequence_Fast(seq) */
9640    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9641    PyObject **items;
9642    PyObject *item;
9643    Py_ssize_t sz, i, res_offset;
9644    Py_UCS4 maxchar;
9645    Py_UCS4 item_maxchar;
9646    int use_memcpy;
9647    unsigned char *res_data = NULL, *sep_data = NULL;
9648    PyObject *last_obj;
9649    unsigned int kind = 0;
9650
9651    fseq = PySequence_Fast(seq, "");
9652    if (fseq == NULL) {
9653        return NULL;
9654    }
9655
9656    /* NOTE: the following code can't call back into Python code,
9657     * so we are sure that fseq won't be mutated.
9658     */
9659
9660    seqlen = PySequence_Fast_GET_SIZE(fseq);
9661    /* If empty sequence, return u"". */
9662    if (seqlen == 0) {
9663        Py_DECREF(fseq);
9664        Py_INCREF(unicode_empty);
9665        res = unicode_empty;
9666        return res;
9667    }
9668
9669    /* If singleton sequence with an exact Unicode, return that. */
9670    last_obj = NULL;
9671    items = PySequence_Fast_ITEMS(fseq);
9672    if (seqlen == 1) {
9673        if (PyUnicode_CheckExact(items[0])) {
9674            res = items[0];
9675            Py_INCREF(res);
9676            Py_DECREF(fseq);
9677            return res;
9678        }
9679        seplen = 0;
9680        maxchar = 0;
9681    }
9682    else {
9683        /* Set up sep and seplen */
9684        if (separator == NULL) {
9685            /* fall back to a blank space separator */
9686            sep = PyUnicode_FromOrdinal(' ');
9687            if (!sep)
9688                goto onError;
9689            seplen = 1;
9690            maxchar = 32;
9691        }
9692        else {
9693            if (!PyUnicode_Check(separator)) {
9694                PyErr_Format(PyExc_TypeError,
9695                             "separator: expected str instance,"
9696                             " %.80s found",
9697                             Py_TYPE(separator)->tp_name);
9698                goto onError;
9699            }
9700            if (PyUnicode_READY(separator))
9701                goto onError;
9702            sep = separator;
9703            seplen = PyUnicode_GET_LENGTH(separator);
9704            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9705            /* inc refcount to keep this code path symmetric with the
9706               above case of a blank separator */
9707            Py_INCREF(sep);
9708        }
9709        last_obj = sep;
9710    }
9711
9712    /* There are at least two things to join, or else we have a subclass
9713     * of str in the sequence.
9714     * Do a pre-pass to figure out the total amount of space we'll
9715     * need (sz), and see whether all argument are strings.
9716     */
9717    sz = 0;
9718#ifdef Py_DEBUG
9719    use_memcpy = 0;
9720#else
9721    use_memcpy = 1;
9722#endif
9723    for (i = 0; i < seqlen; i++) {
9724        const Py_ssize_t old_sz = sz;
9725        item = items[i];
9726        if (!PyUnicode_Check(item)) {
9727            PyErr_Format(PyExc_TypeError,
9728                         "sequence item %zd: expected str instance,"
9729                         " %.80s found",
9730                         i, Py_TYPE(item)->tp_name);
9731            goto onError;
9732        }
9733        if (PyUnicode_READY(item) == -1)
9734            goto onError;
9735        sz += PyUnicode_GET_LENGTH(item);
9736        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9737        maxchar = Py_MAX(maxchar, item_maxchar);
9738        if (i != 0)
9739            sz += seplen;
9740        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9741            PyErr_SetString(PyExc_OverflowError,
9742                            "join() result is too long for a Python string");
9743            goto onError;
9744        }
9745        if (use_memcpy && last_obj != NULL) {
9746            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9747                use_memcpy = 0;
9748        }
9749        last_obj = item;
9750    }
9751
9752    res = PyUnicode_New(sz, maxchar);
9753    if (res == NULL)
9754        goto onError;
9755
9756    /* Catenate everything. */
9757#ifdef Py_DEBUG
9758    use_memcpy = 0;
9759#else
9760    if (use_memcpy) {
9761        res_data = PyUnicode_1BYTE_DATA(res);
9762        kind = PyUnicode_KIND(res);
9763        if (seplen != 0)
9764            sep_data = PyUnicode_1BYTE_DATA(sep);
9765    }
9766#endif
9767    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9768        Py_ssize_t itemlen;
9769        item = items[i];
9770        /* Copy item, and maybe the separator. */
9771        if (i && seplen != 0) {
9772            if (use_memcpy) {
9773                Py_MEMCPY(res_data,
9774                          sep_data,
9775                          kind * seplen);
9776                res_data += kind * seplen;
9777            }
9778            else {
9779                copy_characters(res, res_offset, sep, 0, seplen);
9780                res_offset += seplen;
9781            }
9782        }
9783        itemlen = PyUnicode_GET_LENGTH(item);
9784        if (itemlen != 0) {
9785            if (use_memcpy) {
9786                Py_MEMCPY(res_data,
9787                          PyUnicode_DATA(item),
9788                          kind * itemlen);
9789                res_data += kind * itemlen;
9790            }
9791            else {
9792                copy_characters(res, res_offset, item, 0, itemlen);
9793                res_offset += itemlen;
9794            }
9795        }
9796    }
9797    if (use_memcpy)
9798        assert(res_data == PyUnicode_1BYTE_DATA(res)
9799                           + kind * PyUnicode_GET_LENGTH(res));
9800    else
9801        assert(res_offset == PyUnicode_GET_LENGTH(res));
9802
9803    Py_DECREF(fseq);
9804    Py_XDECREF(sep);
9805    assert(_PyUnicode_CheckConsistency(res, 1));
9806    return res;
9807
9808  onError:
9809    Py_DECREF(fseq);
9810    Py_XDECREF(sep);
9811    Py_XDECREF(res);
9812    return NULL;
9813}
9814
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive code units of the buffer `data`, beginning at code-unit
   index `start`.  `kind` selects the element width:
     - PyUnicode_1BYTE_KIND: bytes, written with memset();
     - PyUnicode_2BYTE_KIND: Py_UCS2 elements, written in a loop;
     - any other kind is treated as Py_UCS4 elements.
   `kind` must not be PyUnicode_WCHAR_KIND (asserted).  Arguments are
   expanded more than once, and `value`/`length` are not parenthesised in
   the memset() call, so pass simple side-effect-free expressions.  The
   macro introduces the local names i_ and to_. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert(kind != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)value, length); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9837
9838static PyObject *
9839pad(PyObject *self,
9840    Py_ssize_t left,
9841    Py_ssize_t right,
9842    Py_UCS4 fill)
9843{
9844    PyObject *u;
9845    Py_UCS4 maxchar;
9846    int kind;
9847    void *data;
9848
9849    if (left < 0)
9850        left = 0;
9851    if (right < 0)
9852        right = 0;
9853
9854    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
9855        Py_INCREF(self);
9856        return self;
9857    }
9858
9859    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9860        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9861        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9862        return NULL;
9863    }
9864    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9865    if (fill > maxchar)
9866        maxchar = fill;
9867    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9868    if (!u)
9869        return NULL;
9870
9871    kind = PyUnicode_KIND(u);
9872    data = PyUnicode_DATA(u);
9873    if (left)
9874        FILL(kind, data, fill, 0, left);
9875    if (right)
9876        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9877    copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
9878    assert(_PyUnicode_CheckConsistency(u, 1));
9879    return u;
9880}
9881#undef FILL
9882
9883PyObject *
9884PyUnicode_Splitlines(PyObject *string, int keepends)
9885{
9886    PyObject *list;
9887
9888    string = PyUnicode_FromObject(string);
9889    if (string == NULL || PyUnicode_READY(string) == -1)
9890        return NULL;
9891
9892    switch(PyUnicode_KIND(string)) {
9893    case PyUnicode_1BYTE_KIND:
9894        if (PyUnicode_IS_ASCII(string))
9895            list = asciilib_splitlines(
9896                string, PyUnicode_1BYTE_DATA(string),
9897                PyUnicode_GET_LENGTH(string), keepends);
9898        else
9899            list = ucs1lib_splitlines(
9900                string, PyUnicode_1BYTE_DATA(string),
9901                PyUnicode_GET_LENGTH(string), keepends);
9902        break;
9903    case PyUnicode_2BYTE_KIND:
9904        list = ucs2lib_splitlines(
9905            string, PyUnicode_2BYTE_DATA(string),
9906            PyUnicode_GET_LENGTH(string), keepends);
9907        break;
9908    case PyUnicode_4BYTE_KIND:
9909        list = ucs4lib_splitlines(
9910            string, PyUnicode_4BYTE_DATA(string),
9911            PyUnicode_GET_LENGTH(string), keepends);
9912        break;
9913    default:
9914        assert(0);
9915        list = 0;
9916    }
9917    Py_DECREF(string);
9918    return list;
9919}
9920
9921static PyObject *
9922split(PyObject *self,
9923      PyObject *substring,
9924      Py_ssize_t maxcount)
9925{
9926    int kind1, kind2, kind;
9927    void *buf1, *buf2;
9928    Py_ssize_t len1, len2;
9929    PyObject* out;
9930
9931    if (maxcount < 0)
9932        maxcount = PY_SSIZE_T_MAX;
9933
9934    if (PyUnicode_READY(self) == -1)
9935        return NULL;
9936
9937    if (substring == NULL)
9938        switch(PyUnicode_KIND(self)) {
9939        case PyUnicode_1BYTE_KIND:
9940            if (PyUnicode_IS_ASCII(self))
9941                return asciilib_split_whitespace(
9942                    self,  PyUnicode_1BYTE_DATA(self),
9943                    PyUnicode_GET_LENGTH(self), maxcount
9944                    );
9945            else
9946                return ucs1lib_split_whitespace(
9947                    self,  PyUnicode_1BYTE_DATA(self),
9948                    PyUnicode_GET_LENGTH(self), maxcount
9949                    );
9950        case PyUnicode_2BYTE_KIND:
9951            return ucs2lib_split_whitespace(
9952                self,  PyUnicode_2BYTE_DATA(self),
9953                PyUnicode_GET_LENGTH(self), maxcount
9954                );
9955        case PyUnicode_4BYTE_KIND:
9956            return ucs4lib_split_whitespace(
9957                self,  PyUnicode_4BYTE_DATA(self),
9958                PyUnicode_GET_LENGTH(self), maxcount
9959                );
9960        default:
9961            assert(0);
9962            return NULL;
9963        }
9964
9965    if (PyUnicode_READY(substring) == -1)
9966        return NULL;
9967
9968    kind1 = PyUnicode_KIND(self);
9969    kind2 = PyUnicode_KIND(substring);
9970    kind = kind1 > kind2 ? kind1 : kind2;
9971    buf1 = PyUnicode_DATA(self);
9972    buf2 = PyUnicode_DATA(substring);
9973    if (kind1 != kind)
9974        buf1 = _PyUnicode_AsKind(self, kind);
9975    if (!buf1)
9976        return NULL;
9977    if (kind2 != kind)
9978        buf2 = _PyUnicode_AsKind(substring, kind);
9979    if (!buf2) {
9980        if (kind1 != kind) PyMem_Free(buf1);
9981        return NULL;
9982    }
9983    len1 = PyUnicode_GET_LENGTH(self);
9984    len2 = PyUnicode_GET_LENGTH(substring);
9985
9986    switch(kind) {
9987    case PyUnicode_1BYTE_KIND:
9988        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9989            out = asciilib_split(
9990                self,  buf1, len1, buf2, len2, maxcount);
9991        else
9992            out = ucs1lib_split(
9993                self,  buf1, len1, buf2, len2, maxcount);
9994        break;
9995    case PyUnicode_2BYTE_KIND:
9996        out = ucs2lib_split(
9997            self,  buf1, len1, buf2, len2, maxcount);
9998        break;
9999    case PyUnicode_4BYTE_KIND:
10000        out = ucs4lib_split(
10001            self,  buf1, len1, buf2, len2, maxcount);
10002        break;
10003    default:
10004        out = NULL;
10005    }
10006    if (kind1 != kind)
10007        PyMem_Free(buf1);
10008    if (kind2 != kind)
10009        PyMem_Free(buf2);
10010    return out;
10011}
10012
10013static PyObject *
10014rsplit(PyObject *self,
10015       PyObject *substring,
10016       Py_ssize_t maxcount)
10017{
10018    int kind1, kind2, kind;
10019    void *buf1, *buf2;
10020    Py_ssize_t len1, len2;
10021    PyObject* out;
10022
10023    if (maxcount < 0)
10024        maxcount = PY_SSIZE_T_MAX;
10025
10026    if (PyUnicode_READY(self) == -1)
10027        return NULL;
10028
10029    if (substring == NULL)
10030        switch(PyUnicode_KIND(self)) {
10031        case PyUnicode_1BYTE_KIND:
10032            if (PyUnicode_IS_ASCII(self))
10033                return asciilib_rsplit_whitespace(
10034                    self,  PyUnicode_1BYTE_DATA(self),
10035                    PyUnicode_GET_LENGTH(self), maxcount
10036                    );
10037            else
10038                return ucs1lib_rsplit_whitespace(
10039                    self,  PyUnicode_1BYTE_DATA(self),
10040                    PyUnicode_GET_LENGTH(self), maxcount
10041                    );
10042        case PyUnicode_2BYTE_KIND:
10043            return ucs2lib_rsplit_whitespace(
10044                self,  PyUnicode_2BYTE_DATA(self),
10045                PyUnicode_GET_LENGTH(self), maxcount
10046                );
10047        case PyUnicode_4BYTE_KIND:
10048            return ucs4lib_rsplit_whitespace(
10049                self,  PyUnicode_4BYTE_DATA(self),
10050                PyUnicode_GET_LENGTH(self), maxcount
10051                );
10052        default:
10053            assert(0);
10054            return NULL;
10055        }
10056
10057    if (PyUnicode_READY(substring) == -1)
10058        return NULL;
10059
10060    kind1 = PyUnicode_KIND(self);
10061    kind2 = PyUnicode_KIND(substring);
10062    kind = kind1 > kind2 ? kind1 : kind2;
10063    buf1 = PyUnicode_DATA(self);
10064    buf2 = PyUnicode_DATA(substring);
10065    if (kind1 != kind)
10066        buf1 = _PyUnicode_AsKind(self, kind);
10067    if (!buf1)
10068        return NULL;
10069    if (kind2 != kind)
10070        buf2 = _PyUnicode_AsKind(substring, kind);
10071    if (!buf2) {
10072        if (kind1 != kind) PyMem_Free(buf1);
10073        return NULL;
10074    }
10075    len1 = PyUnicode_GET_LENGTH(self);
10076    len2 = PyUnicode_GET_LENGTH(substring);
10077
10078    switch(kind) {
10079    case PyUnicode_1BYTE_KIND:
10080        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10081            out = asciilib_rsplit(
10082                self,  buf1, len1, buf2, len2, maxcount);
10083        else
10084            out = ucs1lib_rsplit(
10085                self,  buf1, len1, buf2, len2, maxcount);
10086        break;
10087    case PyUnicode_2BYTE_KIND:
10088        out = ucs2lib_rsplit(
10089            self,  buf1, len1, buf2, len2, maxcount);
10090        break;
10091    case PyUnicode_4BYTE_KIND:
10092        out = ucs4lib_rsplit(
10093            self,  buf1, len1, buf2, len2, maxcount);
10094        break;
10095    default:
10096        out = NULL;
10097    }
10098    if (kind1 != kind)
10099        PyMem_Free(buf1);
10100    if (kind2 != kind)
10101        PyMem_Free(buf2);
10102    return out;
10103}
10104
10105static Py_ssize_t
10106anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10107            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10108{
10109    switch(kind) {
10110    case PyUnicode_1BYTE_KIND:
10111        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10112            return asciilib_find(buf1, len1, buf2, len2, offset);
10113        else
10114            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10115    case PyUnicode_2BYTE_KIND:
10116        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10117    case PyUnicode_4BYTE_KIND:
10118        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10119    }
10120    assert(0);
10121    return -1;
10122}
10123
10124static Py_ssize_t
10125anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10126             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10127{
10128        switch(kind) {
10129        case PyUnicode_1BYTE_KIND:
10130            if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10131                return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10132            else
10133                return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10134        case PyUnicode_2BYTE_KIND:
10135            return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10136        case PyUnicode_4BYTE_KIND:
10137            return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10138        }
10139        assert(0);
10140        return 0;
10141}
10142
10143static PyObject *
10144replace(PyObject *self, PyObject *str1,
10145        PyObject *str2, Py_ssize_t maxcount)
10146{
10147    PyObject *u;
10148    char *sbuf = PyUnicode_DATA(self);
10149    char *buf1 = PyUnicode_DATA(str1);
10150    char *buf2 = PyUnicode_DATA(str2);
10151    int srelease = 0, release1 = 0, release2 = 0;
10152    int skind = PyUnicode_KIND(self);
10153    int kind1 = PyUnicode_KIND(str1);
10154    int kind2 = PyUnicode_KIND(str2);
10155    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10156    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10157    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10158    int mayshrink;
10159    Py_UCS4 maxchar, maxchar_str2;
10160
10161    if (maxcount < 0)
10162        maxcount = PY_SSIZE_T_MAX;
10163    else if (maxcount == 0 || slen == 0)
10164        goto nothing;
10165
10166    if (str1 == str2)
10167        goto nothing;
10168    if (skind < kind1)
10169        /* substring too wide to be present */
10170        goto nothing;
10171
10172    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10173    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10174    /* Replacing str1 with str2 may cause a maxchar reduction in the
10175       result string. */
10176    mayshrink = (maxchar_str2 < maxchar);
10177    maxchar = Py_MAX(maxchar, maxchar_str2);
10178
10179    if (len1 == len2) {
10180        Py_ssize_t i;
10181        /* same length */
10182        if (len1 == 0)
10183            goto nothing;
10184        if (len1 == 1) {
10185            /* replace characters */
10186            Py_UCS4 u1, u2;
10187            int rkind;
10188            u1 = PyUnicode_READ_CHAR(str1, 0);
10189            if (findchar(sbuf, PyUnicode_KIND(self),
10190                         slen, u1, 1) < 0)
10191                goto nothing;
10192            u2 = PyUnicode_READ_CHAR(str2, 0);
10193            u = PyUnicode_New(slen, maxchar);
10194            if (!u)
10195                goto error;
10196            copy_characters(u, 0, self, 0, slen);
10197            rkind = PyUnicode_KIND(u);
10198            for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10199                if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
10200                    if (--maxcount < 0)
10201                        break;
10202                    PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
10203                }
10204        }
10205        else {
10206            int rkind = skind;
10207            char *res;
10208
10209            if (kind1 < rkind) {
10210                /* widen substring */
10211                buf1 = _PyUnicode_AsKind(str1, rkind);
10212                if (!buf1) goto error;
10213                release1 = 1;
10214            }
10215            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10216            if (i < 0)
10217                goto nothing;
10218            if (rkind > kind2) {
10219                /* widen replacement */
10220                buf2 = _PyUnicode_AsKind(str2, rkind);
10221                if (!buf2) goto error;
10222                release2 = 1;
10223            }
10224            else if (rkind < kind2) {
10225                /* widen self and buf1 */
10226                rkind = kind2;
10227                if (release1) PyMem_Free(buf1);
10228                sbuf = _PyUnicode_AsKind(self, rkind);
10229                if (!sbuf) goto error;
10230                srelease = 1;
10231                buf1 = _PyUnicode_AsKind(str1, rkind);
10232                if (!buf1) goto error;
10233                release1 = 1;
10234            }
10235            u = PyUnicode_New(slen, maxchar);
10236            if (!u)
10237                goto error;
10238            assert(PyUnicode_KIND(u) == rkind);
10239            res = PyUnicode_DATA(u);
10240
10241            memcpy(res, sbuf, rkind * slen);
10242            /* change everything in-place, starting with this one */
10243            memcpy(res + rkind * i,
10244                   buf2,
10245                   rkind * len2);
10246            i += len1;
10247
10248            while ( --maxcount > 0) {
10249                i = anylib_find(rkind, self,
10250                                sbuf+rkind*i, slen-i,
10251                                str1, buf1, len1, i);
10252                if (i == -1)
10253                    break;
10254                memcpy(res + rkind * i,
10255                       buf2,
10256                       rkind * len2);
10257                i += len1;
10258            }
10259        }
10260    }
10261    else {
10262        Py_ssize_t n, i, j, ires;
10263        Py_ssize_t product, new_size;
10264        int rkind = skind;
10265        char *res;
10266
10267        if (kind1 < rkind) {
10268            /* widen substring */
10269            buf1 = _PyUnicode_AsKind(str1, rkind);
10270            if (!buf1) goto error;
10271            release1 = 1;
10272        }
10273        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10274        if (n == 0)
10275            goto nothing;
10276        if (kind2 < rkind) {
10277            /* widen replacement */
10278            buf2 = _PyUnicode_AsKind(str2, rkind);
10279            if (!buf2) goto error;
10280            release2 = 1;
10281        }
10282        else if (kind2 > rkind) {
10283            /* widen self and buf1 */
10284            rkind = kind2;
10285            sbuf = _PyUnicode_AsKind(self, rkind);
10286            if (!sbuf) goto error;
10287            srelease = 1;
10288            if (release1) PyMem_Free(buf1);
10289            buf1 = _PyUnicode_AsKind(str1, rkind);
10290            if (!buf1) goto error;
10291            release1 = 1;
10292        }
10293        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10294           PyUnicode_GET_LENGTH(str1))); */
10295        product = n * (len2-len1);
10296        if ((product / (len2-len1)) != n) {
10297                PyErr_SetString(PyExc_OverflowError,
10298                                "replace string is too long");
10299                goto error;
10300        }
10301        new_size = slen + product;
10302        if (new_size == 0) {
10303            Py_INCREF(unicode_empty);
10304            u = unicode_empty;
10305            goto done;
10306        }
10307        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10308            PyErr_SetString(PyExc_OverflowError,
10309                            "replace string is too long");
10310            goto error;
10311        }
10312        u = PyUnicode_New(new_size, maxchar);
10313        if (!u)
10314            goto error;
10315        assert(PyUnicode_KIND(u) == rkind);
10316        res = PyUnicode_DATA(u);
10317        ires = i = 0;
10318        if (len1 > 0) {
10319            while (n-- > 0) {
10320                /* look for next match */
10321                j = anylib_find(rkind, self,
10322                                sbuf + rkind * i, slen-i,
10323                                str1, buf1, len1, i);
10324                if (j == -1)
10325                    break;
10326                else if (j > i) {
10327                    /* copy unchanged part [i:j] */
10328                    memcpy(res + rkind * ires,
10329                           sbuf + rkind * i,
10330                           rkind * (j-i));
10331                    ires += j - i;
10332                }
10333                /* copy substitution string */
10334                if (len2 > 0) {
10335                    memcpy(res + rkind * ires,
10336                           buf2,
10337                           rkind * len2);
10338                    ires += len2;
10339                }
10340                i = j + len1;
10341            }
10342            if (i < slen)
10343                /* copy tail [i:] */
10344                memcpy(res + rkind * ires,
10345                       sbuf + rkind * i,
10346                       rkind * (slen-i));
10347        }
10348        else {
10349            /* interleave */
10350            while (n > 0) {
10351                memcpy(res + rkind * ires,
10352                       buf2,
10353                       rkind * len2);
10354                ires += len2;
10355                if (--n <= 0)
10356                    break;
10357                memcpy(res + rkind * ires,
10358                       sbuf + rkind * i,
10359                       rkind);
10360                ires++;
10361                i++;
10362            }
10363            memcpy(res + rkind * ires,
10364                   sbuf + rkind * i,
10365                   rkind * (slen-i));
10366        }
10367    }
10368
10369    if (mayshrink) {
10370        unicode_adjust_maxchar(&u);
10371        if (u == NULL)
10372            goto error;
10373    }
10374
10375  done:
10376    if (srelease)
10377        PyMem_FREE(sbuf);
10378    if (release1)
10379        PyMem_FREE(buf1);
10380    if (release2)
10381        PyMem_FREE(buf2);
10382    assert(_PyUnicode_CheckConsistency(u, 1));
10383    return u;
10384
10385  nothing:
10386    /* nothing to replace; return original string (when possible) */
10387    if (srelease)
10388        PyMem_FREE(sbuf);
10389    if (release1)
10390        PyMem_FREE(buf1);
10391    if (release2)
10392        PyMem_FREE(buf2);
10393    if (PyUnicode_CheckExact(self)) {
10394        Py_INCREF(self);
10395        return self;
10396    }
10397    return PyUnicode_Copy(self);
10398  error:
10399    if (srelease && sbuf)
10400        PyMem_FREE(sbuf);
10401    if (release1 && buf1)
10402        PyMem_FREE(buf1);
10403    if (release2 && buf2)
10404        PyMem_FREE(buf2);
10405    return NULL;
10406}
10407
10408/* --- Unicode Object Methods --------------------------------------------- */
10409
10410PyDoc_STRVAR(title__doc__,
10411             "S.title() -> str\n\
10412\n\
10413Return a titlecased version of S, i.e. words start with title case\n\
10414characters, all remaining cased characters have lower case.");
10415
10416static PyObject*
10417unicode_title(PyObject *self)
10418{
10419    return fixup(self, fixtitle);
10420}
10421
10422PyDoc_STRVAR(capitalize__doc__,
10423             "S.capitalize() -> str\n\
10424\n\
10425Return a capitalized version of S, i.e. make the first character\n\
10426have upper case and the rest lower case.");
10427
10428static PyObject*
10429unicode_capitalize(PyObject *self)
10430{
10431    return fixup(self, fixcapitalize);
10432}
10433
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of str.capwords(): split SELF on whitespace,
   run fixup(..., fixcapitalize) on every word, then join the words with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, idx),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old item, so do it here. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10471
10472/* Argument converter.  Coerces to a single unicode character */
10473
10474static int
10475convert_uc(PyObject *obj, void *addr)
10476{
10477    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10478    PyObject *uniobj;
10479
10480    uniobj = PyUnicode_FromObject(obj);
10481    if (uniobj == NULL) {
10482        PyErr_SetString(PyExc_TypeError,
10483                        "The fill character cannot be converted to Unicode");
10484        return 0;
10485    }
10486    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10487        PyErr_SetString(PyExc_TypeError,
10488                        "The fill character must be exactly one character long");
10489        Py_DECREF(uniobj);
10490        return 0;
10491    }
10492    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10493    Py_DECREF(uniobj);
10494    return 1;
10495}
10496
10497PyDoc_STRVAR(center__doc__,
10498             "S.center(width[, fillchar]) -> str\n\
10499\n\
10500Return S centered in a string of length width. Padding is\n\
10501done using the specified fill character (default is a space)");
10502
10503static PyObject *
10504unicode_center(PyObject *self, PyObject *args)
10505{
10506    Py_ssize_t marg, left;
10507    Py_ssize_t width;
10508    Py_UCS4 fillchar = ' ';
10509
10510    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10511        return NULL;
10512
10513    if (PyUnicode_READY(self) == -1)
10514        return NULL;
10515
10516    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
10517        Py_INCREF(self);
10518        return self;
10519    }
10520
10521    marg = width - _PyUnicode_LENGTH(self);
10522    left = marg / 2 + (marg & width & 1);
10523
10524    return pad(self, left, marg - left, fillchar);
10525}
10526
10527/* This function assumes that str1 and str2 are readied by the caller. */
10528
10529static int
10530unicode_compare(PyObject *str1, PyObject *str2)
10531{
10532    int kind1, kind2;
10533    void *data1, *data2;
10534    Py_ssize_t len1, len2, i;
10535
10536    kind1 = PyUnicode_KIND(str1);
10537    kind2 = PyUnicode_KIND(str2);
10538    data1 = PyUnicode_DATA(str1);
10539    data2 = PyUnicode_DATA(str2);
10540    len1 = PyUnicode_GET_LENGTH(str1);
10541    len2 = PyUnicode_GET_LENGTH(str2);
10542
10543    for (i = 0; i < len1 && i < len2; ++i) {
10544        Py_UCS4 c1, c2;
10545        c1 = PyUnicode_READ(kind1, data1, i);
10546        c2 = PyUnicode_READ(kind2, data2, i);
10547
10548        if (c1 != c2)
10549            return (c1 < c2) ? -1 : 1;
10550    }
10551
10552    return (len1 < len2) ? -1 : (len1 != len2);
10553}
10554
10555int
10556PyUnicode_Compare(PyObject *left, PyObject *right)
10557{
10558    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10559        if (PyUnicode_READY(left) == -1 ||
10560            PyUnicode_READY(right) == -1)
10561            return -1;
10562        return unicode_compare(left, right);
10563    }
10564    PyErr_Format(PyExc_TypeError,
10565                 "Can't compare %.100s and %.100s",
10566                 left->ob_type->tp_name,
10567                 right->ob_type->tp_name);
10568    return -1;
10569}
10570
10571int
10572PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10573{
10574    Py_ssize_t i;
10575    int kind;
10576    void *data;
10577    Py_UCS4 chr;
10578
10579    assert(_PyUnicode_CHECK(uni));
10580    if (PyUnicode_READY(uni) == -1)
10581        return -1;
10582    kind = PyUnicode_KIND(uni);
10583    data = PyUnicode_DATA(uni);
10584    /* Compare Unicode string and source character set string */
10585    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10586        if (chr != str[i])
10587            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10588    /* This check keeps Python strings that end in '\0' from comparing equal
10589     to C strings identical up to that point. */
10590    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10591        return 1; /* uni is longer */
10592    if (str[i])
10593        return -1; /* str is longer */
10594    return 0;
10595}
10596
10597
10598#define TEST_COND(cond)                         \
10599    ((cond) ? Py_True : Py_False)
10600
10601PyObject *
10602PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10603{
10604    int result;
10605
10606    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10607        PyObject *v;
10608        if (PyUnicode_READY(left) == -1 ||
10609            PyUnicode_READY(right) == -1)
10610            return NULL;
10611        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10612            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
10613            if (op == Py_EQ) {
10614                Py_INCREF(Py_False);
10615                return Py_False;
10616            }
10617            if (op == Py_NE) {
10618                Py_INCREF(Py_True);
10619                return Py_True;
10620            }
10621        }
10622        if (left == right)
10623            result = 0;
10624        else
10625            result = unicode_compare(left, right);
10626
10627        /* Convert the return value to a Boolean */
10628        switch (op) {
10629        case Py_EQ:
10630            v = TEST_COND(result == 0);
10631            break;
10632        case Py_NE:
10633            v = TEST_COND(result != 0);
10634            break;
10635        case Py_LE:
10636            v = TEST_COND(result <= 0);
10637            break;
10638        case Py_GE:
10639            v = TEST_COND(result >= 0);
10640            break;
10641        case Py_LT:
10642            v = TEST_COND(result == -1);
10643            break;
10644        case Py_GT:
10645            v = TEST_COND(result == 1);
10646            break;
10647        default:
10648            PyErr_BadArgument();
10649            return NULL;
10650        }
10651        Py_INCREF(v);
10652        return v;
10653    }
10654
10655    Py_RETURN_NOTIMPLEMENTED;
10656}
10657
10658int
10659PyUnicode_Contains(PyObject *container, PyObject *element)
10660{
10661    PyObject *str, *sub;
10662    int kind1, kind2, kind;
10663    void *buf1, *buf2;
10664    Py_ssize_t len1, len2;
10665    int result;
10666
10667    /* Coerce the two arguments */
10668    sub = PyUnicode_FromObject(element);
10669    if (!sub) {
10670        PyErr_Format(PyExc_TypeError,
10671                     "'in <string>' requires string as left operand, not %s",
10672                     element->ob_type->tp_name);
10673        return -1;
10674    }
10675    if (PyUnicode_READY(sub) == -1)
10676        return -1;
10677
10678    str = PyUnicode_FromObject(container);
10679    if (!str || PyUnicode_READY(str) == -1) {
10680        Py_DECREF(sub);
10681        return -1;
10682    }
10683
10684    kind1 = PyUnicode_KIND(str);
10685    kind2 = PyUnicode_KIND(sub);
10686    kind = kind1 > kind2 ? kind1 : kind2;
10687    buf1 = PyUnicode_DATA(str);
10688    buf2 = PyUnicode_DATA(sub);
10689    if (kind1 != kind)
10690        buf1 = _PyUnicode_AsKind(str, kind);
10691    if (!buf1) {
10692        Py_DECREF(sub);
10693        return -1;
10694    }
10695    if (kind2 != kind)
10696        buf2 = _PyUnicode_AsKind(sub, kind);
10697    if (!buf2) {
10698        Py_DECREF(sub);
10699        if (kind1 != kind) PyMem_Free(buf1);
10700        return -1;
10701    }
10702    len1 = PyUnicode_GET_LENGTH(str);
10703    len2 = PyUnicode_GET_LENGTH(sub);
10704
10705    switch(kind) {
10706    case PyUnicode_1BYTE_KIND:
10707        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10708        break;
10709    case PyUnicode_2BYTE_KIND:
10710        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10711        break;
10712    case PyUnicode_4BYTE_KIND:
10713        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10714        break;
10715    default:
10716        result = -1;
10717        assert(0);
10718    }
10719
10720    Py_DECREF(str);
10721    Py_DECREF(sub);
10722
10723    if (kind1 != kind)
10724        PyMem_Free(buf1);
10725    if (kind2 != kind)
10726        PyMem_Free(buf2);
10727
10728    return result;
10729}
10730
10731/* Concat to string or Unicode object giving a new Unicode object. */
10732
10733PyObject *
10734PyUnicode_Concat(PyObject *left, PyObject *right)
10735{
10736    PyObject *u = NULL, *v = NULL, *w;
10737    Py_UCS4 maxchar, maxchar2;
10738
10739    /* Coerce the two arguments */
10740    u = PyUnicode_FromObject(left);
10741    if (u == NULL)
10742        goto onError;
10743    v = PyUnicode_FromObject(right);
10744    if (v == NULL)
10745        goto onError;
10746
10747    /* Shortcuts */
10748    if (v == unicode_empty) {
10749        Py_DECREF(v);
10750        return u;
10751    }
10752    if (u == unicode_empty) {
10753        Py_DECREF(u);
10754        return v;
10755    }
10756
10757    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10758    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10759    maxchar = Py_MAX(maxchar, maxchar2);
10760
10761    /* Concat the two Unicode strings */
10762    w = PyUnicode_New(
10763        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10764        maxchar);
10765    if (w == NULL)
10766        goto onError;
10767    copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10768    copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
10769    Py_DECREF(u);
10770    Py_DECREF(v);
10771    assert(_PyUnicode_CheckConsistency(w, 1));
10772    return w;
10773
10774  onError:
10775    Py_XDECREF(u);
10776    Py_XDECREF(v);
10777    return NULL;
10778}
10779
10780static void
10781unicode_append_inplace(PyObject **p_left, PyObject *right)
10782{
10783    Py_ssize_t left_len, right_len, new_len;
10784
10785    assert(PyUnicode_IS_READY(*p_left));
10786    assert(PyUnicode_IS_READY(right));
10787
10788    left_len = PyUnicode_GET_LENGTH(*p_left);
10789    right_len = PyUnicode_GET_LENGTH(right);
10790    if (left_len > PY_SSIZE_T_MAX - right_len) {
10791        PyErr_SetString(PyExc_OverflowError,
10792                        "strings are too large to concat");
10793        goto error;
10794    }
10795    new_len = left_len + right_len;
10796
10797    /* Now we own the last reference to 'left', so we can resize it
10798     * in-place.
10799     */
10800    if (unicode_resize(p_left, new_len) != 0) {
10801        /* XXX if _PyUnicode_Resize() fails, 'left' has been
10802         * deallocated so it cannot be put back into
10803         * 'variable'.  The MemoryError is raised when there
10804         * is no value in 'variable', which might (very
10805         * remotely) be a cause of incompatibilities.
10806         */
10807        goto error;
10808    }
10809    /* copy 'right' into the newly allocated area of 'left' */
10810    copy_characters(*p_left, left_len, right, 0, right_len);
10811    _PyUnicode_DIRTY(*p_left);
10812    return;
10813
10814error:
10815    Py_DECREF(*p_left);
10816    *p_left = NULL;
10817}
10818
10819void
10820PyUnicode_Append(PyObject **p_left, PyObject *right)
10821{
10822    PyObject *left, *res;
10823
10824    if (p_left == NULL) {
10825        if (!PyErr_Occurred())
10826            PyErr_BadInternalCall();
10827        return;
10828    }
10829    left = *p_left;
10830    if (right == NULL || !PyUnicode_Check(left)) {
10831        if (!PyErr_Occurred())
10832            PyErr_BadInternalCall();
10833        goto error;
10834    }
10835
10836    if (PyUnicode_READY(left))
10837        goto error;
10838    if (PyUnicode_READY(right))
10839        goto error;
10840
10841    if (PyUnicode_CheckExact(left) && left != unicode_empty
10842        && PyUnicode_CheckExact(right) && right != unicode_empty
10843        && unicode_resizable(left)
10844        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10845            || _PyUnicode_WSTR(left) != NULL))
10846    {
10847        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10848           to change the structure size, but characters are stored just after
10849           the structure, and so it requires to move all characters which is
10850           not so different than duplicating the string. */
10851        if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10852        {
10853            unicode_append_inplace(p_left, right);
10854            if (p_left != NULL)
10855                assert(_PyUnicode_CheckConsistency(*p_left, 1));
10856            return;
10857        }
10858    }
10859
10860    res = PyUnicode_Concat(left, right);
10861    if (res == NULL)
10862        goto error;
10863    Py_DECREF(left);
10864    *p_left = res;
10865    return;
10866
10867error:
10868    Py_DECREF(*p_left);
10869    *p_left = NULL;
10870}
10871
10872void
10873PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10874{
10875    PyUnicode_Append(pleft, right);
10876    Py_XDECREF(right);
10877}
10878
10879PyDoc_STRVAR(count__doc__,
10880             "S.count(sub[, start[, end]]) -> int\n\
10881\n\
10882Return the number of non-overlapping occurrences of substring sub in\n\
10883string S[start:end].  Optional arguments start and end are\n\
10884interpreted as in slice notation.");
10885
10886static PyObject *
10887unicode_count(PyObject *self, PyObject *args)
10888{
10889    PyObject *substring;
10890    Py_ssize_t start = 0;
10891    Py_ssize_t end = PY_SSIZE_T_MAX;
10892    PyObject *result;
10893    int kind1, kind2, kind;
10894    void *buf1, *buf2;
10895    Py_ssize_t len1, len2, iresult;
10896
10897    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10898                                            &start, &end))
10899        return NULL;
10900
10901    kind1 = PyUnicode_KIND(self);
10902    kind2 = PyUnicode_KIND(substring);
10903    kind = kind1 > kind2 ? kind1 : kind2;
10904    buf1 = PyUnicode_DATA(self);
10905    buf2 = PyUnicode_DATA(substring);
10906    if (kind1 != kind)
10907        buf1 = _PyUnicode_AsKind(self, kind);
10908    if (!buf1) {
10909        Py_DECREF(substring);
10910        return NULL;
10911    }
10912    if (kind2 != kind)
10913        buf2 = _PyUnicode_AsKind(substring, kind);
10914    if (!buf2) {
10915        Py_DECREF(substring);
10916        if (kind1 != kind) PyMem_Free(buf1);
10917        return NULL;
10918    }
10919    len1 = PyUnicode_GET_LENGTH(self);
10920    len2 = PyUnicode_GET_LENGTH(substring);
10921
10922    ADJUST_INDICES(start, end, len1);
10923    switch(kind) {
10924    case PyUnicode_1BYTE_KIND:
10925        iresult = ucs1lib_count(
10926            ((Py_UCS1*)buf1) + start, end - start,
10927            buf2, len2, PY_SSIZE_T_MAX
10928            );
10929        break;
10930    case PyUnicode_2BYTE_KIND:
10931        iresult = ucs2lib_count(
10932            ((Py_UCS2*)buf1) + start, end - start,
10933            buf2, len2, PY_SSIZE_T_MAX
10934            );
10935        break;
10936    case PyUnicode_4BYTE_KIND:
10937        iresult = ucs4lib_count(
10938            ((Py_UCS4*)buf1) + start, end - start,
10939            buf2, len2, PY_SSIZE_T_MAX
10940            );
10941        break;
10942    default:
10943        assert(0); iresult = 0;
10944    }
10945
10946    result = PyLong_FromSsize_t(iresult);
10947
10948    if (kind1 != kind)
10949        PyMem_Free(buf1);
10950    if (kind2 != kind)
10951        PyMem_Free(buf2);
10952
10953    Py_DECREF(substring);
10954
10955    return result;
10956}
10957
10958PyDoc_STRVAR(encode__doc__,
10959             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10960\n\
10961Encode S using the codec registered for encoding. Default encoding\n\
10962is 'utf-8'. errors may be given to set a different error\n\
10963handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10964a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10965'xmlcharrefreplace' as well as any other name registered with\n\
10966codecs.register_error that can handle UnicodeEncodeErrors.");
10967
10968static PyObject *
10969unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10970{
10971    static char *kwlist[] = {"encoding", "errors", 0};
10972    char *encoding = NULL;
10973    char *errors = NULL;
10974
10975    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10976                                     kwlist, &encoding, &errors))
10977        return NULL;
10978    return PyUnicode_AsEncodedString(self, encoding, errors);
10979}
10980
10981PyDoc_STRVAR(expandtabs__doc__,
10982             "S.expandtabs([tabsize]) -> str\n\
10983\n\
10984Return a copy of S where all tab characters are expanded using spaces.\n\
10985If tabsize is not given, a tab size of 8 characters is assumed.");
10986
10987static PyObject*
10988unicode_expandtabs(PyObject *self, PyObject *args)
10989{
10990    Py_ssize_t i, j, line_pos, src_len, incr;
10991    Py_UCS4 ch;
10992    PyObject *u;
10993    void *src_data, *dest_data;
10994    int tabsize = 8;
10995    int kind;
10996    int found;
10997
10998    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10999        return NULL;
11000
11001    if (PyUnicode_READY(self) == -1)
11002        return NULL;
11003
11004    /* First pass: determine size of output string */
11005    src_len = PyUnicode_GET_LENGTH(self);
11006    i = j = line_pos = 0;
11007    kind = PyUnicode_KIND(self);
11008    src_data = PyUnicode_DATA(self);
11009    found = 0;
11010    for (; i < src_len; i++) {
11011        ch = PyUnicode_READ(kind, src_data, i);
11012        if (ch == '\t') {
11013            found = 1;
11014            if (tabsize > 0) {
11015                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11016                if (j > PY_SSIZE_T_MAX - incr)
11017                    goto overflow;
11018                line_pos += incr;
11019                j += incr;
11020            }
11021        }
11022        else {
11023            if (j > PY_SSIZE_T_MAX - 1)
11024                goto overflow;
11025            line_pos++;
11026            j++;
11027            if (ch == '\n' || ch == '\r')
11028                line_pos = 0;
11029        }
11030    }
11031    if (!found && PyUnicode_CheckExact(self)) {
11032        Py_INCREF(self);
11033        return self;
11034    }
11035
11036    /* Second pass: create output string and fill it */
11037    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11038    if (!u)
11039        return NULL;
11040    dest_data = PyUnicode_DATA(u);
11041
11042    i = j = line_pos = 0;
11043
11044    for (; i < src_len; i++) {
11045        ch = PyUnicode_READ(kind, src_data, i);
11046        if (ch == '\t') {
11047            if (tabsize > 0) {
11048                incr = tabsize - (line_pos % tabsize);
11049                line_pos += incr;
11050                while (incr--) {
11051                    PyUnicode_WRITE(kind, dest_data, j, ' ');
11052                    j++;
11053                }
11054            }
11055        }
11056        else {
11057            line_pos++;
11058            PyUnicode_WRITE(kind, dest_data, j, ch);
11059            j++;
11060            if (ch == '\n' || ch == '\r')
11061                line_pos = 0;
11062        }
11063    }
11064    assert (j == PyUnicode_GET_LENGTH(u));
11065#ifndef DONT_MAKE_RESULT_READY
11066    if (_PyUnicode_READY_REPLACE(&u)) {
11067        Py_DECREF(u);
11068        return NULL;
11069    }
11070#endif
11071    assert(_PyUnicode_CheckConsistency(u, 1));
11072    return u;
11073
11074  overflow:
11075    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11076    return NULL;
11077}
11078
11079PyDoc_STRVAR(find__doc__,
11080             "S.find(sub[, start[, end]]) -> int\n\
11081\n\
11082Return the lowest index in S where substring sub is found,\n\
11083such that sub is contained within S[start:end].  Optional\n\
11084arguments start and end are interpreted as in slice notation.\n\
11085\n\
11086Return -1 on failure.");
11087
11088static PyObject *
11089unicode_find(PyObject *self, PyObject *args)
11090{
11091    PyObject *substring;
11092    Py_ssize_t start;
11093    Py_ssize_t end;
11094    Py_ssize_t result;
11095
11096    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11097                                            &start, &end))
11098        return NULL;
11099
11100    if (PyUnicode_READY(self) == -1)
11101        return NULL;
11102    if (PyUnicode_READY(substring) == -1)
11103        return NULL;
11104
11105    result = any_find_slice(1, self, substring, start, end);
11106
11107    Py_DECREF(substring);
11108
11109    if (result == -2)
11110        return NULL;
11111
11112    return PyLong_FromSsize_t(result);
11113}
11114
11115static PyObject *
11116unicode_getitem(PyObject *self, Py_ssize_t index)
11117{
11118    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11119    if (ch == (Py_UCS4)-1)
11120        return NULL;
11121    return PyUnicode_FromOrdinal(ch);
11122}
11123
11124/* Believe it or not, this produces the same value for ASCII strings
11125   as bytes_hash(). */
11126static Py_hash_t
11127unicode_hash(PyObject *self)
11128{
11129    Py_ssize_t len;
11130    Py_uhash_t x;
11131
11132    if (_PyUnicode_HASH(self) != -1)
11133        return _PyUnicode_HASH(self);
11134    if (PyUnicode_READY(self) == -1)
11135        return -1;
11136    len = PyUnicode_GET_LENGTH(self);
11137
11138    /* The hash function as a macro, gets expanded three times below. */
11139#define HASH(P) \
11140    x = (Py_uhash_t)*P << 7; \
11141    while (--len >= 0) \
11142        x = (1000003*x) ^ (Py_uhash_t)*P++;
11143
11144    switch (PyUnicode_KIND(self)) {
11145    case PyUnicode_1BYTE_KIND: {
11146        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11147        HASH(c);
11148        break;
11149    }
11150    case PyUnicode_2BYTE_KIND: {
11151        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11152        HASH(s);
11153        break;
11154    }
11155    default: {
11156        Py_UCS4 *l;
11157        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11158               "Impossible switch case in unicode_hash");
11159        l = PyUnicode_4BYTE_DATA(self);
11160        HASH(l);
11161        break;
11162    }
11163    }
11164    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11165
11166    if (x == -1)
11167        x = -2;
11168    _PyUnicode_HASH(self) = x;
11169    return x;
11170}
11171#undef HASH
11172
11173PyDoc_STRVAR(index__doc__,
11174             "S.index(sub[, start[, end]]) -> int\n\
11175\n\
11176Like S.find() but raise ValueError when the substring is not found.");
11177
11178static PyObject *
11179unicode_index(PyObject *self, PyObject *args)
11180{
11181    Py_ssize_t result;
11182    PyObject *substring;
11183    Py_ssize_t start;
11184    Py_ssize_t end;
11185
11186    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11187                                            &start, &end))
11188        return NULL;
11189
11190    if (PyUnicode_READY(self) == -1)
11191        return NULL;
11192    if (PyUnicode_READY(substring) == -1)
11193        return NULL;
11194
11195    result = any_find_slice(1, self, substring, start, end);
11196
11197    Py_DECREF(substring);
11198
11199    if (result == -2)
11200        return NULL;
11201
11202    if (result < 0) {
11203        PyErr_SetString(PyExc_ValueError, "substring not found");
11204        return NULL;
11205    }
11206
11207    return PyLong_FromSsize_t(result);
11208}
11209
11210PyDoc_STRVAR(islower__doc__,
11211             "S.islower() -> bool\n\
11212\n\
11213Return True if all cased characters in S are lowercase and there is\n\
11214at least one cased character in S, False otherwise.");
11215
11216static PyObject*
11217unicode_islower(PyObject *self)
11218{
11219    Py_ssize_t i, length;
11220    int kind;
11221    void *data;
11222    int cased;
11223
11224    if (PyUnicode_READY(self) == -1)
11225        return NULL;
11226    length = PyUnicode_GET_LENGTH(self);
11227    kind = PyUnicode_KIND(self);
11228    data = PyUnicode_DATA(self);
11229
11230    /* Shortcut for single character strings */
11231    if (length == 1)
11232        return PyBool_FromLong(
11233            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11234
11235    /* Special case for empty strings */
11236    if (length == 0)
11237        return PyBool_FromLong(0);
11238
11239    cased = 0;
11240    for (i = 0; i < length; i++) {
11241        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11242
11243        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11244            return PyBool_FromLong(0);
11245        else if (!cased && Py_UNICODE_ISLOWER(ch))
11246            cased = 1;
11247    }
11248    return PyBool_FromLong(cased);
11249}
11250
11251PyDoc_STRVAR(isupper__doc__,
11252             "S.isupper() -> bool\n\
11253\n\
11254Return True if all cased characters in S are uppercase and there is\n\
11255at least one cased character in S, False otherwise.");
11256
11257static PyObject*
11258unicode_isupper(PyObject *self)
11259{
11260    Py_ssize_t i, length;
11261    int kind;
11262    void *data;
11263    int cased;
11264
11265    if (PyUnicode_READY(self) == -1)
11266        return NULL;
11267    length = PyUnicode_GET_LENGTH(self);
11268    kind = PyUnicode_KIND(self);
11269    data = PyUnicode_DATA(self);
11270
11271    /* Shortcut for single character strings */
11272    if (length == 1)
11273        return PyBool_FromLong(
11274            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11275
11276    /* Special case for empty strings */
11277    if (length == 0)
11278        return PyBool_FromLong(0);
11279
11280    cased = 0;
11281    for (i = 0; i < length; i++) {
11282        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11283
11284        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11285            return PyBool_FromLong(0);
11286        else if (!cased && Py_UNICODE_ISUPPER(ch))
11287            cased = 1;
11288    }
11289    return PyBool_FromLong(cased);
11290}
11291
11292PyDoc_STRVAR(istitle__doc__,
11293             "S.istitle() -> bool\n\
11294\n\
11295Return True if S is a titlecased string and there is at least one\n\
11296character in S, i.e. upper- and titlecase characters may only\n\
11297follow uncased characters and lowercase characters only cased ones.\n\
11298Return False otherwise.");
11299
11300static PyObject*
11301unicode_istitle(PyObject *self)
11302{
11303    Py_ssize_t i, length;
11304    int kind;
11305    void *data;
11306    int cased, previous_is_cased;
11307
11308    if (PyUnicode_READY(self) == -1)
11309        return NULL;
11310    length = PyUnicode_GET_LENGTH(self);
11311    kind = PyUnicode_KIND(self);
11312    data = PyUnicode_DATA(self);
11313
11314    /* Shortcut for single character strings */
11315    if (length == 1) {
11316        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11317        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11318                               (Py_UNICODE_ISUPPER(ch) != 0));
11319    }
11320
11321    /* Special case for empty strings */
11322    if (length == 0)
11323        return PyBool_FromLong(0);
11324
11325    cased = 0;
11326    previous_is_cased = 0;
11327    for (i = 0; i < length; i++) {
11328        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11329
11330        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11331            if (previous_is_cased)
11332                return PyBool_FromLong(0);
11333            previous_is_cased = 1;
11334            cased = 1;
11335        }
11336        else if (Py_UNICODE_ISLOWER(ch)) {
11337            if (!previous_is_cased)
11338                return PyBool_FromLong(0);
11339            previous_is_cased = 1;
11340            cased = 1;
11341        }
11342        else
11343            previous_is_cased = 0;
11344    }
11345    return PyBool_FromLong(cased);
11346}
11347
11348PyDoc_STRVAR(isspace__doc__,
11349             "S.isspace() -> bool\n\
11350\n\
11351Return True if all characters in S are whitespace\n\
11352and there is at least one character in S, False otherwise.");
11353
11354static PyObject*
11355unicode_isspace(PyObject *self)
11356{
11357    Py_ssize_t i, length;
11358    int kind;
11359    void *data;
11360
11361    if (PyUnicode_READY(self) == -1)
11362        return NULL;
11363    length = PyUnicode_GET_LENGTH(self);
11364    kind = PyUnicode_KIND(self);
11365    data = PyUnicode_DATA(self);
11366
11367    /* Shortcut for single character strings */
11368    if (length == 1)
11369        return PyBool_FromLong(
11370            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11371
11372    /* Special case for empty strings */
11373    if (length == 0)
11374        return PyBool_FromLong(0);
11375
11376    for (i = 0; i < length; i++) {
11377        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11378        if (!Py_UNICODE_ISSPACE(ch))
11379            return PyBool_FromLong(0);
11380    }
11381    return PyBool_FromLong(1);
11382}
11383
11384PyDoc_STRVAR(isalpha__doc__,
11385             "S.isalpha() -> bool\n\
11386\n\
11387Return True if all characters in S are alphabetic\n\
11388and there is at least one character in S, False otherwise.");
11389
11390static PyObject*
11391unicode_isalpha(PyObject *self)
11392{
11393    Py_ssize_t i, length;
11394    int kind;
11395    void *data;
11396
11397    if (PyUnicode_READY(self) == -1)
11398        return NULL;
11399    length = PyUnicode_GET_LENGTH(self);
11400    kind = PyUnicode_KIND(self);
11401    data = PyUnicode_DATA(self);
11402
11403    /* Shortcut for single character strings */
11404    if (length == 1)
11405        return PyBool_FromLong(
11406            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11407
11408    /* Special case for empty strings */
11409    if (length == 0)
11410        return PyBool_FromLong(0);
11411
11412    for (i = 0; i < length; i++) {
11413        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11414            return PyBool_FromLong(0);
11415    }
11416    return PyBool_FromLong(1);
11417}
11418
11419PyDoc_STRVAR(isalnum__doc__,
11420             "S.isalnum() -> bool\n\
11421\n\
11422Return True if all characters in S are alphanumeric\n\
11423and there is at least one character in S, False otherwise.");
11424
11425static PyObject*
11426unicode_isalnum(PyObject *self)
11427{
11428    int kind;
11429    void *data;
11430    Py_ssize_t len, i;
11431
11432    if (PyUnicode_READY(self) == -1)
11433        return NULL;
11434
11435    kind = PyUnicode_KIND(self);
11436    data = PyUnicode_DATA(self);
11437    len = PyUnicode_GET_LENGTH(self);
11438
11439    /* Shortcut for single character strings */
11440    if (len == 1) {
11441        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11442        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11443    }
11444
11445    /* Special case for empty strings */
11446    if (len == 0)
11447        return PyBool_FromLong(0);
11448
11449    for (i = 0; i < len; i++) {
11450        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11451        if (!Py_UNICODE_ISALNUM(ch))
11452            return PyBool_FromLong(0);
11453    }
11454    return PyBool_FromLong(1);
11455}
11456
11457PyDoc_STRVAR(isdecimal__doc__,
11458             "S.isdecimal() -> bool\n\
11459\n\
11460Return True if there are only decimal characters in S,\n\
11461False otherwise.");
11462
11463static PyObject*
11464unicode_isdecimal(PyObject *self)
11465{
11466    Py_ssize_t i, length;
11467    int kind;
11468    void *data;
11469
11470    if (PyUnicode_READY(self) == -1)
11471        return NULL;
11472    length = PyUnicode_GET_LENGTH(self);
11473    kind = PyUnicode_KIND(self);
11474    data = PyUnicode_DATA(self);
11475
11476    /* Shortcut for single character strings */
11477    if (length == 1)
11478        return PyBool_FromLong(
11479            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11480
11481    /* Special case for empty strings */
11482    if (length == 0)
11483        return PyBool_FromLong(0);
11484
11485    for (i = 0; i < length; i++) {
11486        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11487            return PyBool_FromLong(0);
11488    }
11489    return PyBool_FromLong(1);
11490}
11491
11492PyDoc_STRVAR(isdigit__doc__,
11493             "S.isdigit() -> bool\n\
11494\n\
11495Return True if all characters in S are digits\n\
11496and there is at least one character in S, False otherwise.");
11497
11498static PyObject*
11499unicode_isdigit(PyObject *self)
11500{
11501    Py_ssize_t i, length;
11502    int kind;
11503    void *data;
11504
11505    if (PyUnicode_READY(self) == -1)
11506        return NULL;
11507    length = PyUnicode_GET_LENGTH(self);
11508    kind = PyUnicode_KIND(self);
11509    data = PyUnicode_DATA(self);
11510
11511    /* Shortcut for single character strings */
11512    if (length == 1) {
11513        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11514        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11515    }
11516
11517    /* Special case for empty strings */
11518    if (length == 0)
11519        return PyBool_FromLong(0);
11520
11521    for (i = 0; i < length; i++) {
11522        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11523            return PyBool_FromLong(0);
11524    }
11525    return PyBool_FromLong(1);
11526}
11527
11528PyDoc_STRVAR(isnumeric__doc__,
11529             "S.isnumeric() -> bool\n\
11530\n\
11531Return True if there are only numeric characters in S,\n\
11532False otherwise.");
11533
11534static PyObject*
11535unicode_isnumeric(PyObject *self)
11536{
11537    Py_ssize_t i, length;
11538    int kind;
11539    void *data;
11540
11541    if (PyUnicode_READY(self) == -1)
11542        return NULL;
11543    length = PyUnicode_GET_LENGTH(self);
11544    kind = PyUnicode_KIND(self);
11545    data = PyUnicode_DATA(self);
11546
11547    /* Shortcut for single character strings */
11548    if (length == 1)
11549        return PyBool_FromLong(
11550            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11551
11552    /* Special case for empty strings */
11553    if (length == 0)
11554        return PyBool_FromLong(0);
11555
11556    for (i = 0; i < length; i++) {
11557        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11558            return PyBool_FromLong(0);
11559    }
11560    return PyBool_FromLong(1);
11561}
11562
11563int
11564PyUnicode_IsIdentifier(PyObject *self)
11565{
11566    int kind;
11567    void *data;
11568    Py_ssize_t i;
11569    Py_UCS4 first;
11570
11571    if (PyUnicode_READY(self) == -1) {
11572        Py_FatalError("identifier not ready");
11573        return 0;
11574    }
11575
11576    /* Special case for empty strings */
11577    if (PyUnicode_GET_LENGTH(self) == 0)
11578        return 0;
11579    kind = PyUnicode_KIND(self);
11580    data = PyUnicode_DATA(self);
11581
11582    /* PEP 3131 says that the first character must be in
11583       XID_Start and subsequent characters in XID_Continue,
11584       and for the ASCII range, the 2.x rules apply (i.e
11585       start with letters and underscore, continue with
11586       letters, digits, underscore). However, given the current
11587       definition of XID_Start and XID_Continue, it is sufficient
11588       to check just for these, except that _ must be allowed
11589       as starting an identifier.  */
11590    first = PyUnicode_READ(kind, data, 0);
11591    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11592        return 0;
11593
11594    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11595        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11596            return 0;
11597    return 1;
11598}
11599
11600PyDoc_STRVAR(isidentifier__doc__,
11601             "S.isidentifier() -> bool\n\
11602\n\
11603Return True if S is a valid identifier according\n\
11604to the language definition.");
11605
11606static PyObject*
11607unicode_isidentifier(PyObject *self)
11608{
11609    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11610}
11611
11612PyDoc_STRVAR(isprintable__doc__,
11613             "S.isprintable() -> bool\n\
11614\n\
11615Return True if all characters in S are considered\n\
11616printable in repr() or S is empty, False otherwise.");
11617
11618static PyObject*
11619unicode_isprintable(PyObject *self)
11620{
11621    Py_ssize_t i, length;
11622    int kind;
11623    void *data;
11624
11625    if (PyUnicode_READY(self) == -1)
11626        return NULL;
11627    length = PyUnicode_GET_LENGTH(self);
11628    kind = PyUnicode_KIND(self);
11629    data = PyUnicode_DATA(self);
11630
11631    /* Shortcut for single character strings */
11632    if (length == 1)
11633        return PyBool_FromLong(
11634            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11635
11636    for (i = 0; i < length; i++) {
11637        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11638            Py_RETURN_FALSE;
11639        }
11640    }
11641    Py_RETURN_TRUE;
11642}
11643
11644PyDoc_STRVAR(join__doc__,
11645             "S.join(iterable) -> str\n\
11646\n\
11647Return a string which is the concatenation of the strings in the\n\
11648iterable.  The separator between elements is S.");
11649
11650static PyObject*
11651unicode_join(PyObject *self, PyObject *data)
11652{
11653    return PyUnicode_Join(self, data);
11654}
11655
11656static Py_ssize_t
11657unicode_length(PyObject *self)
11658{
11659    if (PyUnicode_READY(self) == -1)
11660        return -1;
11661    return PyUnicode_GET_LENGTH(self);
11662}
11663
11664PyDoc_STRVAR(ljust__doc__,
11665             "S.ljust(width[, fillchar]) -> str\n\
11666\n\
11667Return S left-justified in a Unicode string of length width. Padding is\n\
11668done using the specified fill character (default is a space).");
11669
11670static PyObject *
11671unicode_ljust(PyObject *self, PyObject *args)
11672{
11673    Py_ssize_t width;
11674    Py_UCS4 fillchar = ' ';
11675
11676    if (PyUnicode_READY(self) == -1)
11677        return NULL;
11678
11679    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11680        return NULL;
11681
11682    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11683        Py_INCREF(self);
11684        return self;
11685    }
11686
11687    return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
11688}
11689
11690PyDoc_STRVAR(lower__doc__,
11691             "S.lower() -> str\n\
11692\n\
11693Return a copy of the string S converted to lowercase.");
11694
11695static PyObject*
11696unicode_lower(PyObject *self)
11697{
11698    return fixup(self, fixlower);
11699}
11700
/* Strip types.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format string for each strip type, indexed by the
   constants above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string past its
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
11709
11710/* externally visible for str.strip(unicode) */
11711PyObject *
11712_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11713{
11714    void *data;
11715    int kind;
11716    Py_ssize_t i, j, len;
11717    BLOOM_MASK sepmask;
11718
11719    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11720        return NULL;
11721
11722    kind = PyUnicode_KIND(self);
11723    data = PyUnicode_DATA(self);
11724    len = PyUnicode_GET_LENGTH(self);
11725    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11726                              PyUnicode_DATA(sepobj),
11727                              PyUnicode_GET_LENGTH(sepobj));
11728
11729    i = 0;
11730    if (striptype != RIGHTSTRIP) {
11731        while (i < len &&
11732               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11733            i++;
11734        }
11735    }
11736
11737    j = len;
11738    if (striptype != LEFTSTRIP) {
11739        do {
11740            j--;
11741        } while (j >= i &&
11742                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11743        j++;
11744    }
11745
11746    return PyUnicode_Substring(self, i, j);
11747}
11748
11749PyObject*
11750PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11751{
11752    unsigned char *data;
11753    int kind;
11754    Py_ssize_t length;
11755
11756    if (PyUnicode_READY(self) == -1)
11757        return NULL;
11758
11759    end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11760
11761    if (start == 0 && end == PyUnicode_GET_LENGTH(self))
11762    {
11763        if (PyUnicode_CheckExact(self)) {
11764            Py_INCREF(self);
11765            return self;
11766        }
11767        else
11768            return PyUnicode_Copy(self);
11769    }
11770
11771    length = end - start;
11772    if (length == 1)
11773        return unicode_getitem(self, start);
11774
11775    if (start < 0 || end < 0) {
11776        PyErr_SetString(PyExc_IndexError, "string index out of range");
11777        return NULL;
11778    }
11779
11780    if (PyUnicode_IS_ASCII(self)) {
11781        kind = PyUnicode_KIND(self);
11782        data = PyUnicode_1BYTE_DATA(self);
11783        return unicode_fromascii(data + start, length);
11784    }
11785    else {
11786        kind = PyUnicode_KIND(self);
11787        data = PyUnicode_1BYTE_DATA(self);
11788        return PyUnicode_FromKindAndData(kind,
11789                                         data + kind * start,
11790                                         length);
11791    }
11792}
11793
11794static PyObject *
11795do_strip(PyObject *self, int striptype)
11796{
11797    int kind;
11798    void *data;
11799    Py_ssize_t len, i, j;
11800
11801    if (PyUnicode_READY(self) == -1)
11802        return NULL;
11803
11804    kind = PyUnicode_KIND(self);
11805    data = PyUnicode_DATA(self);
11806    len = PyUnicode_GET_LENGTH(self);
11807
11808    i = 0;
11809    if (striptype != RIGHTSTRIP) {
11810        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11811            i++;
11812        }
11813    }
11814
11815    j = len;
11816    if (striptype != LEFTSTRIP) {
11817        do {
11818            j--;
11819        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11820        j++;
11821    }
11822
11823    return PyUnicode_Substring(self, i, j);
11824}
11825
11826
11827static PyObject *
11828do_argstrip(PyObject *self, int striptype, PyObject *args)
11829{
11830    PyObject *sep = NULL;
11831
11832    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11833        return NULL;
11834
11835    if (sep != NULL && sep != Py_None) {
11836        if (PyUnicode_Check(sep))
11837            return _PyUnicode_XStrip(self, striptype, sep);
11838        else {
11839            PyErr_Format(PyExc_TypeError,
11840                         "%s arg must be None or str",
11841                         STRIPNAME(striptype));
11842            return NULL;
11843        }
11844    }
11845
11846    return do_strip(self, striptype);
11847}
11848
11849
11850PyDoc_STRVAR(strip__doc__,
11851             "S.strip([chars]) -> str\n\
11852\n\
11853Return a copy of the string S with leading and trailing\n\
11854whitespace removed.\n\
11855If chars is given and not None, remove characters in chars instead.");
11856
11857static PyObject *
11858unicode_strip(PyObject *self, PyObject *args)
11859{
11860    if (PyTuple_GET_SIZE(args) == 0)
11861        return do_strip(self, BOTHSTRIP); /* Common case */
11862    else
11863        return do_argstrip(self, BOTHSTRIP, args);
11864}
11865
11866
11867PyDoc_STRVAR(lstrip__doc__,
11868             "S.lstrip([chars]) -> str\n\
11869\n\
11870Return a copy of the string S with leading whitespace removed.\n\
11871If chars is given and not None, remove characters in chars instead.");
11872
11873static PyObject *
11874unicode_lstrip(PyObject *self, PyObject *args)
11875{
11876    if (PyTuple_GET_SIZE(args) == 0)
11877        return do_strip(self, LEFTSTRIP); /* Common case */
11878    else
11879        return do_argstrip(self, LEFTSTRIP, args);
11880}
11881
11882
11883PyDoc_STRVAR(rstrip__doc__,
11884             "S.rstrip([chars]) -> str\n\
11885\n\
11886Return a copy of the string S with trailing whitespace removed.\n\
11887If chars is given and not None, remove characters in chars instead.");
11888
11889static PyObject *
11890unicode_rstrip(PyObject *self, PyObject *args)
11891{
11892    if (PyTuple_GET_SIZE(args) == 0)
11893        return do_strip(self, RIGHTSTRIP); /* Common case */
11894    else
11895        return do_argstrip(self, RIGHTSTRIP, args);
11896}
11897
11898
11899static PyObject*
11900unicode_repeat(PyObject *str, Py_ssize_t len)
11901{
11902    PyObject *u;
11903    Py_ssize_t nchars, n;
11904
11905    if (len < 1) {
11906        Py_INCREF(unicode_empty);
11907        return unicode_empty;
11908    }
11909
11910    if (len == 1 && PyUnicode_CheckExact(str)) {
11911        /* no repeat, return original string */
11912        Py_INCREF(str);
11913        return str;
11914    }
11915
11916    if (PyUnicode_READY(str) == -1)
11917        return NULL;
11918
11919    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11920        PyErr_SetString(PyExc_OverflowError,
11921                        "repeated string is too long");
11922        return NULL;
11923    }
11924    nchars = len * PyUnicode_GET_LENGTH(str);
11925
11926    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11927    if (!u)
11928        return NULL;
11929    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11930
11931    if (PyUnicode_GET_LENGTH(str) == 1) {
11932        const int kind = PyUnicode_KIND(str);
11933        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11934        void *to = PyUnicode_DATA(u);
11935        if (kind == PyUnicode_1BYTE_KIND)
11936            memset(to, (unsigned char)fill_char, len);
11937        else {
11938            for (n = 0; n < len; ++n)
11939                PyUnicode_WRITE(kind, to, n, fill_char);
11940        }
11941    }
11942    else {
11943        /* number of characters copied this far */
11944        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11945        const Py_ssize_t char_size = PyUnicode_KIND(str);
11946        char *to = (char *) PyUnicode_DATA(u);
11947        Py_MEMCPY(to, PyUnicode_DATA(str),
11948                  PyUnicode_GET_LENGTH(str) * char_size);
11949        while (done < nchars) {
11950            n = (done <= nchars-done) ? done : nchars-done;
11951            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11952            done += n;
11953        }
11954    }
11955
11956    assert(_PyUnicode_CheckConsistency(u, 1));
11957    return u;
11958}
11959
11960PyObject *
11961PyUnicode_Replace(PyObject *obj,
11962                  PyObject *subobj,
11963                  PyObject *replobj,
11964                  Py_ssize_t maxcount)
11965{
11966    PyObject *self;
11967    PyObject *str1;
11968    PyObject *str2;
11969    PyObject *result;
11970
11971    self = PyUnicode_FromObject(obj);
11972    if (self == NULL || PyUnicode_READY(self) == -1)
11973        return NULL;
11974    str1 = PyUnicode_FromObject(subobj);
11975    if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11976        Py_DECREF(self);
11977        return NULL;
11978    }
11979    str2 = PyUnicode_FromObject(replobj);
11980    if (str2 == NULL || PyUnicode_READY(str2)) {
11981        Py_DECREF(self);
11982        Py_DECREF(str1);
11983        return NULL;
11984    }
11985    result = replace(self, str1, str2, maxcount);
11986    Py_DECREF(self);
11987    Py_DECREF(str1);
11988    Py_DECREF(str2);
11989    return result;
11990}
11991
11992PyDoc_STRVAR(replace__doc__,
11993             "S.replace(old, new[, count]) -> str\n\
11994\n\
11995Return a copy of S with all occurrences of substring\n\
11996old replaced by new.  If the optional argument count is\n\
11997given, only the first count occurrences are replaced.");
11998
11999static PyObject*
12000unicode_replace(PyObject *self, PyObject *args)
12001{
12002    PyObject *str1;
12003    PyObject *str2;
12004    Py_ssize_t maxcount = -1;
12005    PyObject *result;
12006
12007    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
12008        return NULL;
12009    if (!PyUnicode_READY(self) == -1)
12010        return NULL;
12011    str1 = PyUnicode_FromObject(str1);
12012    if (str1 == NULL || PyUnicode_READY(str1) == -1)
12013        return NULL;
12014    str2 = PyUnicode_FromObject(str2);
12015    if (str2 == NULL || PyUnicode_READY(str2) == -1) {
12016        Py_DECREF(str1);
12017        return NULL;
12018    }
12019
12020    result = replace(self, str1, str2, maxcount);
12021
12022    Py_DECREF(str1);
12023    Py_DECREF(str2);
12024    return result;
12025}
12026
12027static PyObject *
12028unicode_repr(PyObject *unicode)
12029{
12030    PyObject *repr;
12031    Py_ssize_t isize;
12032    Py_ssize_t osize, squote, dquote, i, o;
12033    Py_UCS4 max, quote;
12034    int ikind, okind;
12035    void *idata, *odata;
12036
12037    if (PyUnicode_READY(unicode) == -1)
12038        return NULL;
12039
12040    isize = PyUnicode_GET_LENGTH(unicode);
12041    idata = PyUnicode_DATA(unicode);
12042
12043    /* Compute length of output, quote characters, and
12044       maximum character */
12045    osize = 2; /* quotes */
12046    max = 127;
12047    squote = dquote = 0;
12048    ikind = PyUnicode_KIND(unicode);
12049    for (i = 0; i < isize; i++) {
12050        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12051        switch (ch) {
12052        case '\'': squote++; osize++; break;
12053        case '"':  dquote++; osize++; break;
12054        case '\\': case '\t': case '\r': case '\n':
12055            osize += 2; break;
12056        default:
12057            /* Fast-path ASCII */
12058            if (ch < ' ' || ch == 0x7f)
12059                osize += 4; /* \xHH */
12060            else if (ch < 0x7f)
12061                osize++;
12062            else if (Py_UNICODE_ISPRINTABLE(ch)) {
12063                osize++;
12064                max = ch > max ? ch : max;
12065            }
12066            else if (ch < 0x100)
12067                osize += 4; /* \xHH */
12068            else if (ch < 0x10000)
12069                osize += 6; /* \uHHHH */
12070            else
12071                osize += 10; /* \uHHHHHHHH */
12072        }
12073    }
12074
12075    quote = '\'';
12076    if (squote) {
12077        if (dquote)
12078            /* Both squote and dquote present. Use squote,
12079               and escape them */
12080            osize += squote;
12081        else
12082            quote = '"';
12083    }
12084
12085    repr = PyUnicode_New(osize, max);
12086    if (repr == NULL)
12087        return NULL;
12088    okind = PyUnicode_KIND(repr);
12089    odata = PyUnicode_DATA(repr);
12090
12091    PyUnicode_WRITE(okind, odata, 0, quote);
12092    PyUnicode_WRITE(okind, odata, osize-1, quote);
12093
12094    for (i = 0, o = 1; i < isize; i++) {
12095        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12096
12097        /* Escape quotes and backslashes */
12098        if ((ch == quote) || (ch == '\\')) {
12099            PyUnicode_WRITE(okind, odata, o++, '\\');
12100            PyUnicode_WRITE(okind, odata, o++, ch);
12101            continue;
12102        }
12103
12104        /* Map special whitespace to '\t', \n', '\r' */
12105        if (ch == '\t') {
12106            PyUnicode_WRITE(okind, odata, o++, '\\');
12107            PyUnicode_WRITE(okind, odata, o++, 't');
12108        }
12109        else if (ch == '\n') {
12110            PyUnicode_WRITE(okind, odata, o++, '\\');
12111            PyUnicode_WRITE(okind, odata, o++, 'n');
12112        }
12113        else if (ch == '\r') {
12114            PyUnicode_WRITE(okind, odata, o++, '\\');
12115            PyUnicode_WRITE(okind, odata, o++, 'r');
12116        }
12117
12118        /* Map non-printable US ASCII to '\xhh' */
12119        else if (ch < ' ' || ch == 0x7F) {
12120            PyUnicode_WRITE(okind, odata, o++, '\\');
12121            PyUnicode_WRITE(okind, odata, o++, 'x');
12122            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12123            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12124        }
12125
12126        /* Copy ASCII characters as-is */
12127        else if (ch < 0x7F) {
12128            PyUnicode_WRITE(okind, odata, o++, ch);
12129        }
12130
12131        /* Non-ASCII characters */
12132        else {
12133            /* Map Unicode whitespace and control characters
12134               (categories Z* and C* except ASCII space)
12135            */
12136            if (!Py_UNICODE_ISPRINTABLE(ch)) {
12137                /* Map 8-bit characters to '\xhh' */
12138                if (ch <= 0xff) {
12139                    PyUnicode_WRITE(okind, odata, o++, '\\');
12140                    PyUnicode_WRITE(okind, odata, o++, 'x');
12141                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12142                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12143                }
12144                /* Map 21-bit characters to '\U00xxxxxx' */
12145                else if (ch >= 0x10000) {
12146                    PyUnicode_WRITE(okind, odata, o++, '\\');
12147                    PyUnicode_WRITE(okind, odata, o++, 'U');
12148                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12149                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12150                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12151                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12152                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12153                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12154                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12155                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12156                }
12157                /* Map 16-bit characters to '\uxxxx' */
12158                else {
12159                    PyUnicode_WRITE(okind, odata, o++, '\\');
12160                    PyUnicode_WRITE(okind, odata, o++, 'u');
12161                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12162                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12163                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12164                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12165                }
12166            }
12167            /* Copy characters as-is */
12168            else {
12169                PyUnicode_WRITE(okind, odata, o++, ch);
12170            }
12171        }
12172    }
12173    /* Closing quote already added at the beginning */
12174    assert(_PyUnicode_CheckConsistency(repr, 1));
12175    return repr;
12176}
12177
12178PyDoc_STRVAR(rfind__doc__,
12179             "S.rfind(sub[, start[, end]]) -> int\n\
12180\n\
12181Return the highest index in S where substring sub is found,\n\
12182such that sub is contained within S[start:end].  Optional\n\
12183arguments start and end are interpreted as in slice notation.\n\
12184\n\
12185Return -1 on failure.");
12186
12187static PyObject *
12188unicode_rfind(PyObject *self, PyObject *args)
12189{
12190    PyObject *substring;
12191    Py_ssize_t start;
12192    Py_ssize_t end;
12193    Py_ssize_t result;
12194
12195    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12196                                            &start, &end))
12197        return NULL;
12198
12199    if (PyUnicode_READY(self) == -1)
12200        return NULL;
12201    if (PyUnicode_READY(substring) == -1)
12202        return NULL;
12203
12204    result = any_find_slice(-1, self, substring, start, end);
12205
12206    Py_DECREF(substring);
12207
12208    if (result == -2)
12209        return NULL;
12210
12211    return PyLong_FromSsize_t(result);
12212}
12213
12214PyDoc_STRVAR(rindex__doc__,
12215             "S.rindex(sub[, start[, end]]) -> int\n\
12216\n\
12217Like S.rfind() but raise ValueError when the substring is not found.");
12218
12219static PyObject *
12220unicode_rindex(PyObject *self, PyObject *args)
12221{
12222    PyObject *substring;
12223    Py_ssize_t start;
12224    Py_ssize_t end;
12225    Py_ssize_t result;
12226
12227    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12228                                            &start, &end))
12229        return NULL;
12230
12231    if (PyUnicode_READY(self) == -1)
12232        return NULL;
12233    if (PyUnicode_READY(substring) == -1)
12234        return NULL;
12235
12236    result = any_find_slice(-1, self, substring, start, end);
12237
12238    Py_DECREF(substring);
12239
12240    if (result == -2)
12241        return NULL;
12242
12243    if (result < 0) {
12244        PyErr_SetString(PyExc_ValueError, "substring not found");
12245        return NULL;
12246    }
12247
12248    return PyLong_FromSsize_t(result);
12249}
12250
12251PyDoc_STRVAR(rjust__doc__,
12252             "S.rjust(width[, fillchar]) -> str\n\
12253\n\
12254Return S right-justified in a string of length width. Padding is\n\
12255done using the specified fill character (default is a space).");
12256
12257static PyObject *
12258unicode_rjust(PyObject *self, PyObject *args)
12259{
12260    Py_ssize_t width;
12261    Py_UCS4 fillchar = ' ';
12262
12263    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12264        return NULL;
12265
12266    if (PyUnicode_READY(self) == -1)
12267        return NULL;
12268
12269    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
12270        Py_INCREF(self);
12271        return self;
12272    }
12273
12274    return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
12275}
12276
12277PyObject *
12278PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12279{
12280    PyObject *result;
12281
12282    s = PyUnicode_FromObject(s);
12283    if (s == NULL)
12284        return NULL;
12285    if (sep != NULL) {
12286        sep = PyUnicode_FromObject(sep);
12287        if (sep == NULL) {
12288            Py_DECREF(s);
12289            return NULL;
12290        }
12291    }
12292
12293    result = split(s, sep, maxsplit);
12294
12295    Py_DECREF(s);
12296    Py_XDECREF(sep);
12297    return result;
12298}
12299
12300PyDoc_STRVAR(split__doc__,
12301             "S.split([sep[, maxsplit]]) -> list of strings\n\
12302\n\
12303Return a list of the words in S, using sep as the\n\
12304delimiter string.  If maxsplit is given, at most maxsplit\n\
12305splits are done. If sep is not specified or is None, any\n\
12306whitespace string is a separator and empty strings are\n\
12307removed from the result.");
12308
12309static PyObject*
12310unicode_split(PyObject *self, PyObject *args)
12311{
12312    PyObject *substring = Py_None;
12313    Py_ssize_t maxcount = -1;
12314
12315    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
12316        return NULL;
12317
12318    if (substring == Py_None)
12319        return split(self, NULL, maxcount);
12320    else if (PyUnicode_Check(substring))
12321        return split(self, substring, maxcount);
12322    else
12323        return PyUnicode_Split(self, substring, maxcount);
12324}
12325
12326PyObject *
12327PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12328{
12329    PyObject* str_obj;
12330    PyObject* sep_obj;
12331    PyObject* out;
12332    int kind1, kind2, kind;
12333    void *buf1 = NULL, *buf2 = NULL;
12334    Py_ssize_t len1, len2;
12335
12336    str_obj = PyUnicode_FromObject(str_in);
12337    if (!str_obj || PyUnicode_READY(str_obj) == -1)
12338        return NULL;
12339    sep_obj = PyUnicode_FromObject(sep_in);
12340    if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
12341        Py_DECREF(str_obj);
12342        return NULL;
12343    }
12344
12345    kind1 = PyUnicode_KIND(str_obj);
12346    kind2 = PyUnicode_KIND(sep_obj);
12347    kind = Py_MAX(kind1, kind2);
12348    buf1 = PyUnicode_DATA(str_obj);
12349    if (kind1 != kind)
12350        buf1 = _PyUnicode_AsKind(str_obj, kind);
12351    if (!buf1)
12352        goto onError;
12353    buf2 = PyUnicode_DATA(sep_obj);
12354    if (kind2 != kind)
12355        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12356    if (!buf2)
12357        goto onError;
12358    len1 = PyUnicode_GET_LENGTH(str_obj);
12359    len2 = PyUnicode_GET_LENGTH(sep_obj);
12360
12361    switch(PyUnicode_KIND(str_obj)) {
12362    case PyUnicode_1BYTE_KIND:
12363        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12364            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12365        else
12366            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12367        break;
12368    case PyUnicode_2BYTE_KIND:
12369        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12370        break;
12371    case PyUnicode_4BYTE_KIND:
12372        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12373        break;
12374    default:
12375        assert(0);
12376        out = 0;
12377    }
12378
12379    Py_DECREF(sep_obj);
12380    Py_DECREF(str_obj);
12381    if (kind1 != kind)
12382        PyMem_Free(buf1);
12383    if (kind2 != kind)
12384        PyMem_Free(buf2);
12385
12386    return out;
12387  onError:
12388    Py_DECREF(sep_obj);
12389    Py_DECREF(str_obj);
12390    if (kind1 != kind && buf1)
12391        PyMem_Free(buf1);
12392    if (kind2 != kind && buf2)
12393        PyMem_Free(buf2);
12394    return NULL;
12395}
12396
12397
12398PyObject *
12399PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12400{
12401    PyObject* str_obj;
12402    PyObject* sep_obj;
12403    PyObject* out;
12404    int kind1, kind2, kind;
12405    void *buf1 = NULL, *buf2 = NULL;
12406    Py_ssize_t len1, len2;
12407
12408    str_obj = PyUnicode_FromObject(str_in);
12409    if (!str_obj)
12410        return NULL;
12411    sep_obj = PyUnicode_FromObject(sep_in);
12412    if (!sep_obj) {
12413        Py_DECREF(str_obj);
12414        return NULL;
12415    }
12416
12417    kind1 = PyUnicode_KIND(str_in);
12418    kind2 = PyUnicode_KIND(sep_obj);
12419    kind = Py_MAX(kind1, kind2);
12420    buf1 = PyUnicode_DATA(str_in);
12421    if (kind1 != kind)
12422        buf1 = _PyUnicode_AsKind(str_in, kind);
12423    if (!buf1)
12424        goto onError;
12425    buf2 = PyUnicode_DATA(sep_obj);
12426    if (kind2 != kind)
12427        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12428    if (!buf2)
12429        goto onError;
12430    len1 = PyUnicode_GET_LENGTH(str_obj);
12431    len2 = PyUnicode_GET_LENGTH(sep_obj);
12432
12433    switch(PyUnicode_KIND(str_in)) {
12434    case PyUnicode_1BYTE_KIND:
12435        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12436            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12437        else
12438            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12439        break;
12440    case PyUnicode_2BYTE_KIND:
12441        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12442        break;
12443    case PyUnicode_4BYTE_KIND:
12444        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12445        break;
12446    default:
12447        assert(0);
12448        out = 0;
12449    }
12450
12451    Py_DECREF(sep_obj);
12452    Py_DECREF(str_obj);
12453    if (kind1 != kind)
12454        PyMem_Free(buf1);
12455    if (kind2 != kind)
12456        PyMem_Free(buf2);
12457
12458    return out;
12459  onError:
12460    Py_DECREF(sep_obj);
12461    Py_DECREF(str_obj);
12462    if (kind1 != kind && buf1)
12463        PyMem_Free(buf1);
12464    if (kind2 != kind && buf2)
12465        PyMem_Free(buf2);
12466    return NULL;
12467}
12468
12469PyDoc_STRVAR(partition__doc__,
12470             "S.partition(sep) -> (head, sep, tail)\n\
12471\n\
12472Search for the separator sep in S, and return the part before it,\n\
12473the separator itself, and the part after it.  If the separator is not\n\
12474found, return S and two empty strings.");
12475
12476static PyObject*
12477unicode_partition(PyObject *self, PyObject *separator)
12478{
12479    return PyUnicode_Partition(self, separator);
12480}
12481
12482PyDoc_STRVAR(rpartition__doc__,
12483             "S.rpartition(sep) -> (head, sep, tail)\n\
12484\n\
12485Search for the separator sep in S, starting at the end of S, and return\n\
12486the part before it, the separator itself, and the part after it.  If the\n\
12487separator is not found, return two empty strings and S.");
12488
12489static PyObject*
12490unicode_rpartition(PyObject *self, PyObject *separator)
12491{
12492    return PyUnicode_RPartition(self, separator);
12493}
12494
12495PyObject *
12496PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12497{
12498    PyObject *result;
12499
12500    s = PyUnicode_FromObject(s);
12501    if (s == NULL)
12502        return NULL;
12503    if (sep != NULL) {
12504        sep = PyUnicode_FromObject(sep);
12505        if (sep == NULL) {
12506            Py_DECREF(s);
12507            return NULL;
12508        }
12509    }
12510
12511    result = rsplit(s, sep, maxsplit);
12512
12513    Py_DECREF(s);
12514    Py_XDECREF(sep);
12515    return result;
12516}
12517
12518PyDoc_STRVAR(rsplit__doc__,
12519             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
12520\n\
12521Return a list of the words in S, using sep as the\n\
12522delimiter string, starting at the end of the string and\n\
12523working to the front.  If maxsplit is given, at most maxsplit\n\
12524splits are done. If sep is not specified, any whitespace string\n\
12525is a separator.");
12526
12527static PyObject*
12528unicode_rsplit(PyObject *self, PyObject *args)
12529{
12530    PyObject *substring = Py_None;
12531    Py_ssize_t maxcount = -1;
12532
12533    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
12534        return NULL;
12535
12536    if (substring == Py_None)
12537        return rsplit(self, NULL, maxcount);
12538    else if (PyUnicode_Check(substring))
12539        return rsplit(self, substring, maxcount);
12540    else
12541        return PyUnicode_RSplit(self, substring, maxcount);
12542}
12543
12544PyDoc_STRVAR(splitlines__doc__,
12545             "S.splitlines([keepends]) -> list of strings\n\
12546\n\
12547Return a list of the lines in S, breaking at line boundaries.\n\
12548Line breaks are not included in the resulting list unless keepends\n\
12549is given and true.");
12550
12551static PyObject*
12552unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12553{
12554    static char *kwlist[] = {"keepends", 0};
12555    int keepends = 0;
12556
12557    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12558                                     kwlist, &keepends))
12559        return NULL;
12560
12561    return PyUnicode_Splitlines(self, keepends);
12562}
12563
12564static
12565PyObject *unicode_str(PyObject *self)
12566{
12567    if (PyUnicode_CheckExact(self)) {
12568        Py_INCREF(self);
12569        return self;
12570    } else
12571        /* Subtype -- return genuine unicode string with the same value. */
12572        return PyUnicode_Copy(self);
12573}
12574
12575PyDoc_STRVAR(swapcase__doc__,
12576             "S.swapcase() -> str\n\
12577\n\
12578Return a copy of S with uppercase characters converted to lowercase\n\
12579and vice versa.");
12580
12581static PyObject*
12582unicode_swapcase(PyObject *self)
12583{
12584    return fixup(self, fixswapcase);
12585}
12586
12587PyDoc_STRVAR(maketrans__doc__,
12588             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12589\n\
12590Return a translation table usable for str.translate().\n\
12591If there is only one argument, it must be a dictionary mapping Unicode\n\
12592ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12593Character keys will be then converted to ordinals.\n\
12594If there are two arguments, they must be strings of equal length, and\n\
12595in the resulting dictionary, each character in x will be mapped to the\n\
12596character at the same position in y. If there is a third argument, it\n\
12597must be a string, whose characters will be mapped to None in the result.");
12598
12599static PyObject*
12600unicode_maketrans(PyObject *null, PyObject *args)
12601{
12602    PyObject *x, *y = NULL, *z = NULL;
12603    PyObject *new = NULL, *key, *value;
12604    Py_ssize_t i = 0;
12605    int res;
12606
12607    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12608        return NULL;
12609    new = PyDict_New();
12610    if (!new)
12611        return NULL;
12612    if (y != NULL) {
12613        int x_kind, y_kind, z_kind;
12614        void *x_data, *y_data, *z_data;
12615
12616        /* x must be a string too, of equal length */
12617        if (!PyUnicode_Check(x)) {
12618            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12619                            "be a string if there is a second argument");
12620            goto err;
12621        }
12622        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12623            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12624                            "arguments must have equal length");
12625            goto err;
12626        }
12627        /* create entries for translating chars in x to those in y */
12628        x_kind = PyUnicode_KIND(x);
12629        y_kind = PyUnicode_KIND(y);
12630        x_data = PyUnicode_DATA(x);
12631        y_data = PyUnicode_DATA(y);
12632        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12633            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12634            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12635            if (!key || !value)
12636                goto err;
12637            res = PyDict_SetItem(new, key, value);
12638            Py_DECREF(key);
12639            Py_DECREF(value);
12640            if (res < 0)
12641                goto err;
12642        }
12643        /* create entries for deleting chars in z */
12644        if (z != NULL) {
12645            z_kind = PyUnicode_KIND(z);
12646            z_data = PyUnicode_DATA(z);
12647            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12648                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12649                if (!key)
12650                    goto err;
12651                res = PyDict_SetItem(new, key, Py_None);
12652                Py_DECREF(key);
12653                if (res < 0)
12654                    goto err;
12655            }
12656        }
12657    } else {
12658        int kind;
12659        void *data;
12660
12661        /* x must be a dict */
12662        if (!PyDict_CheckExact(x)) {
12663            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12664                            "to maketrans it must be a dict");
12665            goto err;
12666        }
12667        /* copy entries into the new dict, converting string keys to int keys */
12668        while (PyDict_Next(x, &i, &key, &value)) {
12669            if (PyUnicode_Check(key)) {
12670                /* convert string keys to integer keys */
12671                PyObject *newkey;
12672                if (PyUnicode_GET_LENGTH(key) != 1) {
12673                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12674                                    "table must be of length 1");
12675                    goto err;
12676                }
12677                kind = PyUnicode_KIND(key);
12678                data = PyUnicode_DATA(key);
12679                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12680                if (!newkey)
12681                    goto err;
12682                res = PyDict_SetItem(new, newkey, value);
12683                Py_DECREF(newkey);
12684                if (res < 0)
12685                    goto err;
12686            } else if (PyLong_Check(key)) {
12687                /* just keep integer keys */
12688                if (PyDict_SetItem(new, key, value) < 0)
12689                    goto err;
12690            } else {
12691                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12692                                "be strings or integers");
12693                goto err;
12694            }
12695        }
12696    }
12697    return new;
12698  err:
12699    Py_DECREF(new);
12700    return NULL;
12701}
12702
12703PyDoc_STRVAR(translate__doc__,
12704             "S.translate(table) -> str\n\
12705\n\
12706Return a copy of the string S, where all characters have been mapped\n\
12707through the given translation table, which must be a mapping of\n\
12708Unicode ordinals to Unicode ordinals, strings, or None.\n\
12709Unmapped characters are left untouched. Characters mapped to None\n\
12710are deleted.");
12711
12712static PyObject*
12713unicode_translate(PyObject *self, PyObject *table)
12714{
12715    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12716}
12717
12718PyDoc_STRVAR(upper__doc__,
12719             "S.upper() -> str\n\
12720\n\
12721Return a copy of S converted to uppercase.");
12722
12723static PyObject*
12724unicode_upper(PyObject *self)
12725{
12726    return fixup(self, fixupper);
12727}
12728
12729PyDoc_STRVAR(zfill__doc__,
12730             "S.zfill(width) -> str\n\
12731\n\
12732Pad a numeric string S with zeros on the left, to fill a field\n\
12733of the specified width. The string S is never truncated.");
12734
12735static PyObject *
12736unicode_zfill(PyObject *self, PyObject *args)
12737{
12738    Py_ssize_t fill;
12739    PyObject *u;
12740    Py_ssize_t width;
12741    int kind;
12742    void *data;
12743    Py_UCS4 chr;
12744
12745    if (PyUnicode_READY(self) == -1)
12746        return NULL;
12747
12748    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12749        return NULL;
12750
12751    if (PyUnicode_GET_LENGTH(self) >= width) {
12752        if (PyUnicode_CheckExact(self)) {
12753            Py_INCREF(self);
12754            return self;
12755        }
12756        else
12757            return PyUnicode_Copy(self);
12758    }
12759
12760    fill = width - _PyUnicode_LENGTH(self);
12761
12762    u = pad(self, fill, 0, '0');
12763
12764    if (u == NULL)
12765        return NULL;
12766
12767    kind = PyUnicode_KIND(u);
12768    data = PyUnicode_DATA(u);
12769    chr = PyUnicode_READ(kind, data, fill);
12770
12771    if (chr == '+' || chr == '-') {
12772        /* move sign to beginning of string */
12773        PyUnicode_WRITE(kind, data, 0, chr);
12774        PyUnicode_WRITE(kind, data, fill, '0');
12775    }
12776
12777    assert(_PyUnicode_CheckConsistency(u, 1));
12778    return u;
12779}
12780
#if 0
/* Compiled out by the surrounding #if 0.  Forwards `self` to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12788
12789PyDoc_STRVAR(startswith__doc__,
12790             "S.startswith(prefix[, start[, end]]) -> bool\n\
12791\n\
12792Return True if S starts with the specified prefix, False otherwise.\n\
12793With optional start, test S beginning at that position.\n\
12794With optional end, stop comparing S at that position.\n\
12795prefix can also be a tuple of strings to try.");
12796
12797static PyObject *
12798unicode_startswith(PyObject *self,
12799                   PyObject *args)
12800{
12801    PyObject *subobj;
12802    PyObject *substring;
12803    Py_ssize_t start = 0;
12804    Py_ssize_t end = PY_SSIZE_T_MAX;
12805    int result;
12806
12807    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12808        return NULL;
12809    if (PyTuple_Check(subobj)) {
12810        Py_ssize_t i;
12811        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12812            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12813            if (substring == NULL)
12814                return NULL;
12815            result = tailmatch(self, substring, start, end, -1);
12816            Py_DECREF(substring);
12817            if (result) {
12818                Py_RETURN_TRUE;
12819            }
12820        }
12821        /* nothing matched */
12822        Py_RETURN_FALSE;
12823    }
12824    substring = PyUnicode_FromObject(subobj);
12825    if (substring == NULL) {
12826        if (PyErr_ExceptionMatches(PyExc_TypeError))
12827            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12828                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12829        return NULL;
12830    }
12831    result = tailmatch(self, substring, start, end, -1);
12832    Py_DECREF(substring);
12833    return PyBool_FromLong(result);
12834}
12835
12836
12837PyDoc_STRVAR(endswith__doc__,
12838             "S.endswith(suffix[, start[, end]]) -> bool\n\
12839\n\
12840Return True if S ends with the specified suffix, False otherwise.\n\
12841With optional start, test S beginning at that position.\n\
12842With optional end, stop comparing S at that position.\n\
12843suffix can also be a tuple of strings to try.");
12844
12845static PyObject *
12846unicode_endswith(PyObject *self,
12847                 PyObject *args)
12848{
12849    PyObject *subobj;
12850    PyObject *substring;
12851    Py_ssize_t start = 0;
12852    Py_ssize_t end = PY_SSIZE_T_MAX;
12853    int result;
12854
12855    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12856        return NULL;
12857    if (PyTuple_Check(subobj)) {
12858        Py_ssize_t i;
12859        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12860            substring = PyUnicode_FromObject(
12861                PyTuple_GET_ITEM(subobj, i));
12862            if (substring == NULL)
12863                return NULL;
12864            result = tailmatch(self, substring, start, end, +1);
12865            Py_DECREF(substring);
12866            if (result) {
12867                Py_RETURN_TRUE;
12868            }
12869        }
12870        Py_RETURN_FALSE;
12871    }
12872    substring = PyUnicode_FromObject(subobj);
12873    if (substring == NULL) {
12874        if (PyErr_ExceptionMatches(PyExc_TypeError))
12875            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12876                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12877        return NULL;
12878    }
12879    result = tailmatch(self, substring, start, end, +1);
12880    Py_DECREF(substring);
12881    return PyBool_FromLong(result);
12882}
12883
12884#include "stringlib/unicode_format.h"
12885
12886PyDoc_STRVAR(format__doc__,
12887             "S.format(*args, **kwargs) -> str\n\
12888\n\
12889Return a formatted version of S, using substitutions from args and kwargs.\n\
12890The substitutions are identified by braces ('{' and '}').");
12891
12892PyDoc_STRVAR(format_map__doc__,
12893             "S.format_map(mapping) -> str\n\
12894\n\
12895Return a formatted version of S, using substitutions from mapping.\n\
12896The substitutions are identified by braces ('{' and '}').");
12897
12898static PyObject *
12899unicode__format__(PyObject* self, PyObject* args)
12900{
12901    PyObject *format_spec, *out;
12902
12903    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12904        return NULL;
12905
12906    out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
12907                                     PyUnicode_GET_LENGTH(format_spec));
12908    return out;
12909}
12910
12911PyDoc_STRVAR(p_format__doc__,
12912             "S.__format__(format_spec) -> str\n\
12913\n\
12914Return a formatted version of S as described by format_spec.");
12915
12916static PyObject *
12917unicode__sizeof__(PyObject *v)
12918{
12919    Py_ssize_t size;
12920
12921    /* If it's a compact object, account for base structure +
12922       character data. */
12923    if (PyUnicode_IS_COMPACT_ASCII(v))
12924        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12925    else if (PyUnicode_IS_COMPACT(v))
12926        size = sizeof(PyCompactUnicodeObject) +
12927            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
12928    else {
12929        /* If it is a two-block object, account for base object, and
12930           for character block if present. */
12931        size = sizeof(PyUnicodeObject);
12932        if (_PyUnicode_DATA_ANY(v))
12933            size += (PyUnicode_GET_LENGTH(v) + 1) *
12934                PyUnicode_KIND(v);
12935    }
12936    /* If the wstr pointer is present, account for it unless it is shared
12937       with the data pointer. Check if the data is not shared. */
12938    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12939        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12940    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12941        size += PyUnicode_UTF8_LENGTH(v) + 1;
12942
12943    return PyLong_FromSsize_t(size);
12944}
12945
12946PyDoc_STRVAR(sizeof__doc__,
12947             "S.__sizeof__() -> size of S in memory, in bytes");
12948
12949static PyObject *
12950unicode_getnewargs(PyObject *v)
12951{
12952    PyObject *copy = PyUnicode_Copy(v);
12953    if (!copy)
12954        return NULL;
12955    return Py_BuildValue("(N)", copy);
12956}
12957
12958static PyMethodDef unicode_methods[] = {
12959
12960    /* Order is according to common usage: often used methods should
12961       appear first, since lookup is done sequentially. */
12962
12963    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
12964    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12965    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
12966    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
12967    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12968    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12969    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12970    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12971    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12972    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12973    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
12974    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
12975    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12976    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12977    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
12978    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
12979    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12980    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12981    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
12982    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
12983    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
12984    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
12985    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
12986    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12987    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12988    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12989    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12990    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12991    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12992    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12993    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12994    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12995    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12996    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12997    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12998    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12999    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
13000    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
13001    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
13002    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
13003    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13004    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13005    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
13006    {"maketrans", (PyCFunction) unicode_maketrans,
13007     METH_VARARGS | METH_STATIC, maketrans__doc__},
13008    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
13009#if 0
13010    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
13011#endif
13012
13013#if 0
13014    /* These methods are just used for debugging the implementation. */
13015    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13016#endif
13017
13018    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13019    {NULL, NULL}
13020};
13021
13022static PyObject *
13023unicode_mod(PyObject *v, PyObject *w)
13024{
13025    if (!PyUnicode_Check(v))
13026        Py_RETURN_NOTIMPLEMENTED;
13027    return PyUnicode_Format(v, w);
13028}
13029
13030static PyNumberMethods unicode_as_number = {
13031    0,              /*nb_add*/
13032    0,              /*nb_subtract*/
13033    0,              /*nb_multiply*/
13034    unicode_mod,            /*nb_remainder*/
13035};
13036
13037static PySequenceMethods unicode_as_sequence = {
13038    (lenfunc) unicode_length,       /* sq_length */
13039    PyUnicode_Concat,           /* sq_concat */
13040    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13041    (ssizeargfunc) unicode_getitem,     /* sq_item */
13042    0,                  /* sq_slice */
13043    0,                  /* sq_ass_item */
13044    0,                  /* sq_ass_slice */
13045    PyUnicode_Contains,         /* sq_contains */
13046};
13047
13048static PyObject*
13049unicode_subscript(PyObject* self, PyObject* item)
13050{
13051    if (PyUnicode_READY(self) == -1)
13052        return NULL;
13053
13054    if (PyIndex_Check(item)) {
13055        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13056        if (i == -1 && PyErr_Occurred())
13057            return NULL;
13058        if (i < 0)
13059            i += PyUnicode_GET_LENGTH(self);
13060        return unicode_getitem(self, i);
13061    } else if (PySlice_Check(item)) {
13062        Py_ssize_t start, stop, step, slicelength, cur, i;
13063        PyObject *result;
13064        void *src_data, *dest_data;
13065        int src_kind, dest_kind;
13066        Py_UCS4 ch, max_char, kind_limit;
13067
13068        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13069                                 &start, &stop, &step, &slicelength) < 0) {
13070            return NULL;
13071        }
13072
13073        if (slicelength <= 0) {
13074            return PyUnicode_New(0, 0);
13075        } else if (start == 0 && step == 1 &&
13076                   slicelength == PyUnicode_GET_LENGTH(self) &&
13077                   PyUnicode_CheckExact(self)) {
13078            Py_INCREF(self);
13079            return self;
13080        } else if (step == 1) {
13081            return PyUnicode_Substring(self,
13082                                       start, start + slicelength);
13083        }
13084        /* General case */
13085        src_kind = PyUnicode_KIND(self);
13086        src_data = PyUnicode_DATA(self);
13087        if (!PyUnicode_IS_ASCII(self)) {
13088            kind_limit = kind_maxchar_limit(src_kind);
13089            max_char = 0;
13090            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13091                ch = PyUnicode_READ(src_kind, src_data, cur);
13092                if (ch > max_char) {
13093                    max_char = ch;
13094                    if (max_char >= kind_limit)
13095                        break;
13096                }
13097            }
13098        }
13099        else
13100            max_char = 127;
13101        result = PyUnicode_New(slicelength, max_char);
13102        if (result == NULL)
13103            return NULL;
13104        dest_kind = PyUnicode_KIND(result);
13105        dest_data = PyUnicode_DATA(result);
13106
13107        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13108            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13109            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13110        }
13111        assert(_PyUnicode_CheckConsistency(result, 1));
13112        return result;
13113    } else {
13114        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13115        return NULL;
13116    }
13117}
13118
13119static PyMappingMethods unicode_as_mapping = {
13120    (lenfunc)unicode_length,        /* mp_length */
13121    (binaryfunc)unicode_subscript,  /* mp_subscript */
13122    (objobjargproc)0,           /* mp_ass_subscript */
13123};
13124
13125
13126/* Helpers for PyUnicode_Format() */
13127
13128static PyObject *
13129getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
13130{
13131    Py_ssize_t argidx = *p_argidx;
13132    if (argidx < arglen) {
13133        (*p_argidx)++;
13134        if (arglen < 0)
13135            return args;
13136        else
13137            return PyTuple_GetItem(args, argidx);
13138    }
13139    PyErr_SetString(PyExc_TypeError,
13140                    "not enough arguments for format string");
13141    return NULL;
13142}
13143
13144/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13145
13146static PyObject *
13147formatfloat(PyObject *v, int flags, int prec, int type)
13148{
13149    char *p;
13150    PyObject *result;
13151    double x;
13152
13153    x = PyFloat_AsDouble(v);
13154    if (x == -1.0 && PyErr_Occurred())
13155        return NULL;
13156
13157    if (prec < 0)
13158        prec = 6;
13159
13160    p = PyOS_double_to_string(x, type, prec,
13161                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
13162    if (p == NULL)
13163        return NULL;
13164    result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
13165    PyMem_Free(p);
13166    return result;
13167}
13168
13169static PyObject*
13170formatlong(PyObject *val, int flags, int prec, int type)
13171{
13172    char *buf;
13173    int len;
13174    PyObject *str; /* temporary string object. */
13175    PyObject *result;
13176
13177    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13178    if (!str)
13179        return NULL;
13180    result = PyUnicode_DecodeASCII(buf, len, NULL);
13181    Py_DECREF(str);
13182    return result;
13183}
13184
13185static Py_UCS4
13186formatchar(PyObject *v)
13187{
13188    /* presume that the buffer is at least 3 characters long */
13189    if (PyUnicode_Check(v)) {
13190        if (PyUnicode_GET_LENGTH(v) == 1) {
13191            return PyUnicode_READ_CHAR(v, 0);
13192        }
13193        goto onError;
13194    }
13195    else {
13196        /* Integer input truncated to a character */
13197        long x;
13198        x = PyLong_AsLong(v);
13199        if (x == -1 && PyErr_Occurred())
13200            goto onError;
13201
13202        if (x < 0 || x > 0x10ffff) {
13203            PyErr_SetString(PyExc_OverflowError,
13204                            "%c arg not in range(0x110000)");
13205            return (Py_UCS4) -1;
13206        }
13207
13208        return (Py_UCS4) x;
13209    }
13210
13211  onError:
13212    PyErr_SetString(PyExc_TypeError,
13213                    "%c requires int or char");
13214    return (Py_UCS4) -1;
13215}
13216
13217static int
13218repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13219{
13220    int r;
13221    assert(count > 0);
13222    assert(PyUnicode_Check(obj));
13223    if (count > 5) {
13224        PyObject *repeated = unicode_repeat(obj, count);
13225        if (repeated == NULL)
13226            return -1;
13227        r = _PyAccu_Accumulate(acc, repeated);
13228        Py_DECREF(repeated);
13229        return r;
13230    }
13231    else {
13232        do {
13233            if (_PyAccu_Accumulate(acc, obj))
13234                return -1;
13235        } while (--count);
13236        return 0;
13237    }
13238}
13239
13240PyObject *
13241PyUnicode_Format(PyObject *format, PyObject *args)
13242{
13243    void *fmt;
13244    int fmtkind;
13245    PyObject *result;
13246    int kind;
13247    int r;
13248    Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
13249    int args_owned = 0;
13250    PyObject *dict = NULL;
13251    PyObject *temp = NULL;
13252    PyObject *second = NULL;
13253    PyObject *uformat;
13254    _PyAccu acc;
13255    static PyObject *plus, *minus, *blank, *zero, *percent;
13256
13257    if (!plus && !(plus = get_latin1_char('+')))
13258        return NULL;
13259    if (!minus && !(minus = get_latin1_char('-')))
13260        return NULL;
13261    if (!blank && !(blank = get_latin1_char(' ')))
13262        return NULL;
13263    if (!zero && !(zero = get_latin1_char('0')))
13264        return NULL;
13265    if (!percent && !(percent = get_latin1_char('%')))
13266        return NULL;
13267
13268    if (format == NULL || args == NULL) {
13269        PyErr_BadInternalCall();
13270        return NULL;
13271    }
13272    uformat = PyUnicode_FromObject(format);
13273    if (uformat == NULL || PyUnicode_READY(uformat) == -1)
13274        return NULL;
13275    if (_PyAccu_Init(&acc))
13276        goto onError;
13277    fmt = PyUnicode_DATA(uformat);
13278    fmtkind = PyUnicode_KIND(uformat);
13279    fmtcnt = PyUnicode_GET_LENGTH(uformat);
13280    fmtpos = 0;
13281
13282    if (PyTuple_Check(args)) {
13283        arglen = PyTuple_Size(args);
13284        argidx = 0;
13285    }
13286    else {
13287        arglen = -1;
13288        argidx = -2;
13289    }
13290    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
13291        !PyUnicode_Check(args))
13292        dict = args;
13293
13294    while (--fmtcnt >= 0) {
13295        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13296            PyObject *nonfmt;
13297            Py_ssize_t nonfmtpos;
13298            nonfmtpos = fmtpos++;
13299            while (fmtcnt >= 0 &&
13300                   PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13301                fmtpos++;
13302                fmtcnt--;
13303            }
13304            nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
13305            if (nonfmt == NULL)
13306                goto onError;
13307            r = _PyAccu_Accumulate(&acc, nonfmt);
13308            Py_DECREF(nonfmt);
13309            if (r)
13310                goto onError;
13311        }
13312        else {
13313            /* Got a format specifier */
13314            int flags = 0;
13315            Py_ssize_t width = -1;
13316            int prec = -1;
13317            Py_UCS4 c = '\0';
13318            Py_UCS4 fill, sign;
13319            int isnumok;
13320            PyObject *v = NULL;
13321            void *pbuf = NULL;
13322            Py_ssize_t pindex, len;
13323            PyObject *signobj = NULL, *fillobj = NULL;
13324
13325            fmtpos++;
13326            if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13327                Py_ssize_t keystart;
13328                Py_ssize_t keylen;
13329                PyObject *key;
13330                int pcount = 1;
13331
13332                if (dict == NULL) {
13333                    PyErr_SetString(PyExc_TypeError,
13334                                    "format requires a mapping");
13335                    goto onError;
13336                }
13337                ++fmtpos;
13338                --fmtcnt;
13339                keystart = fmtpos;
13340                /* Skip over balanced parentheses */
13341                while (pcount > 0 && --fmtcnt >= 0) {
13342                    if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
13343                        --pcount;
13344                    else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
13345                        ++pcount;
13346                    fmtpos++;
13347                }
13348                keylen = fmtpos - keystart - 1;
13349                if (fmtcnt < 0 || pcount > 0) {
13350                    PyErr_SetString(PyExc_ValueError,
13351                                    "incomplete format key");
13352                    goto onError;
13353                }
13354                key = PyUnicode_Substring(uformat,
13355                                          keystart, keystart + keylen);
13356                if (key == NULL)
13357                    goto onError;
13358                if (args_owned) {
13359                    Py_DECREF(args);
13360                    args_owned = 0;
13361                }
13362                args = PyObject_GetItem(dict, key);
13363                Py_DECREF(key);
13364                if (args == NULL) {
13365                    goto onError;
13366                }
13367                args_owned = 1;
13368                arglen = -1;
13369                argidx = -2;
13370            }
13371            while (--fmtcnt >= 0) {
13372                switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
13373                case '-': flags |= F_LJUST; continue;
13374                case '+': flags |= F_SIGN; continue;
13375                case ' ': flags |= F_BLANK; continue;
13376                case '#': flags |= F_ALT; continue;
13377                case '0': flags |= F_ZERO; continue;
13378                }
13379                break;
13380            }
13381            if (c == '*') {
13382                v = getnextarg(args, arglen, &argidx);
13383                if (v == NULL)
13384                    goto onError;
13385                if (!PyLong_Check(v)) {
13386                    PyErr_SetString(PyExc_TypeError,
13387                                    "* wants int");
13388                    goto onError;
13389                }
13390                width = PyLong_AsLong(v);
13391                if (width == -1 && PyErr_Occurred())
13392                    goto onError;
13393                if (width < 0) {
13394                    flags |= F_LJUST;
13395                    width = -width;
13396                }
13397                if (--fmtcnt >= 0)
13398                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13399            }
13400            else if (c >= '0' && c <= '9') {
13401                width = c - '0';
13402                while (--fmtcnt >= 0) {
13403                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13404                    if (c < '0' || c > '9')
13405                        break;
13406                    if ((width*10) / 10 != width) {
13407                        PyErr_SetString(PyExc_ValueError,
13408                                        "width too big");
13409                        goto onError;
13410                    }
13411                    width = width*10 + (c - '0');
13412                }
13413            }
13414            if (c == '.') {
13415                prec = 0;
13416                if (--fmtcnt >= 0)
13417                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13418                if (c == '*') {
13419                    v = getnextarg(args, arglen, &argidx);
13420                    if (v == NULL)
13421                        goto onError;
13422                    if (!PyLong_Check(v)) {
13423                        PyErr_SetString(PyExc_TypeError,
13424                                        "* wants int");
13425                        goto onError;
13426                    }
13427                    prec = PyLong_AsLong(v);
13428                    if (prec == -1 && PyErr_Occurred())
13429                        goto onError;
13430                    if (prec < 0)
13431                        prec = 0;
13432                    if (--fmtcnt >= 0)
13433                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13434                }
13435                else if (c >= '0' && c <= '9') {
13436                    prec = c - '0';
13437                    while (--fmtcnt >= 0) {
13438                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13439                        if (c < '0' || c > '9')
13440                            break;
13441                        if ((prec*10) / 10 != prec) {
13442                            PyErr_SetString(PyExc_ValueError,
13443                                            "prec too big");
13444                            goto onError;
13445                        }
13446                        prec = prec*10 + (c - '0');
13447                    }
13448                }
13449            } /* prec */
13450            if (fmtcnt >= 0) {
13451                if (c == 'h' || c == 'l' || c == 'L') {
13452                    if (--fmtcnt >= 0)
13453                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13454                }
13455            }
13456            if (fmtcnt < 0) {
13457                PyErr_SetString(PyExc_ValueError,
13458                                "incomplete format");
13459                goto onError;
13460            }
13461            if (c != '%') {
13462                v = getnextarg(args, arglen, &argidx);
13463                if (v == NULL)
13464                    goto onError;
13465            }
13466            sign = 0;
13467            fill = ' ';
13468            fillobj = blank;
13469            switch (c) {
13470
13471            case '%':
13472                _PyAccu_Accumulate(&acc, percent);
13473                continue;
13474
13475            case 's':
13476            case 'r':
13477            case 'a':
13478                if (PyUnicode_CheckExact(v) && c == 's') {
13479                    temp = v;
13480                    Py_INCREF(temp);
13481                }
13482                else {
13483                    if (c == 's')
13484                        temp = PyObject_Str(v);
13485                    else if (c == 'r')
13486                        temp = PyObject_Repr(v);
13487                    else
13488                        temp = PyObject_ASCII(v);
13489                    if (temp == NULL)
13490                        goto onError;
13491                    if (PyUnicode_Check(temp))
13492                        /* nothing to do */;
13493                    else {
13494                        Py_DECREF(temp);
13495                        PyErr_SetString(PyExc_TypeError,
13496                                        "%s argument has non-string str()");
13497                        goto onError;
13498                    }
13499                }
13500                if (PyUnicode_READY(temp) == -1) {
13501                    Py_CLEAR(temp);
13502                    goto onError;
13503                }
13504                pbuf = PyUnicode_DATA(temp);
13505                kind = PyUnicode_KIND(temp);
13506                len = PyUnicode_GET_LENGTH(temp);
13507                if (prec >= 0 && len > prec)
13508                    len = prec;
13509                break;
13510
13511            case 'i':
13512            case 'd':
13513            case 'u':
13514            case 'o':
13515            case 'x':
13516            case 'X':
13517                isnumok = 0;
13518                if (PyNumber_Check(v)) {
13519                    PyObject *iobj=NULL;
13520
13521                    if (PyLong_Check(v)) {
13522                        iobj = v;
13523                        Py_INCREF(iobj);
13524                    }
13525                    else {
13526                        iobj = PyNumber_Long(v);
13527                    }
13528                    if (iobj!=NULL) {
13529                        if (PyLong_Check(iobj)) {
13530                            isnumok = 1;
13531                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
13532                            Py_DECREF(iobj);
13533                            if (!temp)
13534                                goto onError;
13535                            if (PyUnicode_READY(temp) == -1) {
13536                                Py_CLEAR(temp);
13537                                goto onError;
13538                            }
13539                            pbuf = PyUnicode_DATA(temp);
13540                            kind = PyUnicode_KIND(temp);
13541                            len = PyUnicode_GET_LENGTH(temp);
13542                            sign = 1;
13543                        }
13544                        else {
13545                            Py_DECREF(iobj);
13546                        }
13547                    }
13548                }
13549                if (!isnumok) {
13550                    PyErr_Format(PyExc_TypeError,
13551                                 "%%%c format: a number is required, "
13552                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13553                    goto onError;
13554                }
13555                if (flags & F_ZERO) {
13556                    fill = '0';
13557                    fillobj = zero;
13558                }
13559                break;
13560
13561            case 'e':
13562            case 'E':
13563            case 'f':
13564            case 'F':
13565            case 'g':
13566            case 'G':
13567                temp = formatfloat(v, flags, prec, c);
13568                if (!temp)
13569                    goto onError;
13570                if (PyUnicode_READY(temp) == -1) {
13571                    Py_CLEAR(temp);
13572                    goto onError;
13573                }
13574                pbuf = PyUnicode_DATA(temp);
13575                kind = PyUnicode_KIND(temp);
13576                len = PyUnicode_GET_LENGTH(temp);
13577                sign = 1;
13578                if (flags & F_ZERO) {
13579                    fill = '0';
13580                    fillobj = zero;
13581                }
13582                break;
13583
13584            case 'c':
13585            {
13586                Py_UCS4 ch = formatchar(v);
13587                if (ch == (Py_UCS4) -1)
13588                    goto onError;
13589                temp = _PyUnicode_FromUCS4(&ch, 1);
13590                if (temp == NULL)
13591                    goto onError;
13592                pbuf = PyUnicode_DATA(temp);
13593                kind = PyUnicode_KIND(temp);
13594                len = PyUnicode_GET_LENGTH(temp);
13595                break;
13596            }
13597
13598            default:
13599                PyErr_Format(PyExc_ValueError,
13600                             "unsupported format character '%c' (0x%x) "
13601                             "at index %zd",
13602                             (31<=c && c<=126) ? (char)c : '?',
13603                             (int)c,
13604                             fmtpos - 1);
13605                goto onError;
13606            }
13607            /* pbuf is initialized here. */
13608            pindex = 0;
13609            if (sign) {
13610                if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13611                    signobj = minus;
13612                    len--;
13613                    pindex++;
13614                }
13615                else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13616                    signobj = plus;
13617                    len--;
13618                    pindex++;
13619                }
13620                else if (flags & F_SIGN)
13621                    signobj = plus;
13622                else if (flags & F_BLANK)
13623                    signobj = blank;
13624                else
13625                    sign = 0;
13626            }
13627            if (width < len)
13628                width = len;
13629            if (sign) {
13630                if (fill != ' ') {
13631                    assert(signobj != NULL);
13632                    if (_PyAccu_Accumulate(&acc, signobj))
13633                        goto onError;
13634                }
13635                if (width > len)
13636                    width--;
13637            }
13638            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13639                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13640                assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
13641                if (fill != ' ') {
13642                    second = get_latin1_char(
13643                        PyUnicode_READ(kind, pbuf, pindex + 1));
13644                    pindex += 2;
13645                    if (second == NULL ||
13646                        _PyAccu_Accumulate(&acc, zero) ||
13647                        _PyAccu_Accumulate(&acc, second))
13648                        goto onError;
13649                    Py_CLEAR(second);
13650                }
13651                width -= 2;
13652                if (width < 0)
13653                    width = 0;
13654                len -= 2;
13655            }
13656            if (width > len && !(flags & F_LJUST)) {
13657                assert(fillobj != NULL);
13658                if (repeat_accumulate(&acc, fillobj, width - len))
13659                    goto onError;
13660                width = len;
13661            }
13662            if (fill == ' ') {
13663                if (sign) {
13664                    assert(signobj != NULL);
13665                    if (_PyAccu_Accumulate(&acc, signobj))
13666                        goto onError;
13667                }
13668                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13669                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13670                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13671                    second = get_latin1_char(
13672                        PyUnicode_READ(kind, pbuf, pindex + 1));
13673                    pindex += 2;
13674                    if (second == NULL ||
13675                        _PyAccu_Accumulate(&acc, zero) ||
13676                        _PyAccu_Accumulate(&acc, second))
13677                        goto onError;
13678                    Py_CLEAR(second);
13679                }
13680            }
13681            /* Copy all characters, preserving len */
13682            if (temp != NULL) {
13683                assert(pbuf == PyUnicode_DATA(temp));
13684                v = PyUnicode_Substring(temp, pindex, pindex + len);
13685            }
13686            else {
13687                const char *p = (const char *) pbuf;
13688                assert(pbuf != NULL);
13689                p += kind * pindex;
13690                v = PyUnicode_FromKindAndData(kind, p, len);
13691            }
13692            if (v == NULL)
13693                goto onError;
13694            r = _PyAccu_Accumulate(&acc, v);
13695            Py_DECREF(v);
13696            if (r)
13697                goto onError;
13698            if (width > len && repeat_accumulate(&acc, blank, width - len))
13699                goto onError;
13700            if (dict && (argidx < arglen) && c != '%') {
13701                PyErr_SetString(PyExc_TypeError,
13702                                "not all arguments converted during string formatting");
13703                goto onError;
13704            }
13705            Py_CLEAR(temp);
13706        } /* '%' */
13707    } /* until end */
13708    if (argidx < arglen && !dict) {
13709        PyErr_SetString(PyExc_TypeError,
13710                        "not all arguments converted during string formatting");
13711        goto onError;
13712    }
13713
13714    result = _PyAccu_Finish(&acc);
13715    if (args_owned) {
13716        Py_DECREF(args);
13717    }
13718    Py_DECREF(uformat);
13719    Py_XDECREF(temp);
13720    Py_XDECREF(second);
13721    return result;
13722
13723  onError:
13724    Py_DECREF(uformat);
13725    Py_XDECREF(temp);
13726    Py_XDECREF(second);
13727    _PyAccu_Destroy(&acc);
13728    if (args_owned) {
13729        Py_DECREF(args);
13730    }
13731    return NULL;
13732}
13733
13734static PyObject *
13735unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13736
13737static PyObject *
13738unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13739{
13740    PyObject *x = NULL;
13741    static char *kwlist[] = {"object", "encoding", "errors", 0};
13742    char *encoding = NULL;
13743    char *errors = NULL;
13744
13745    if (type != &PyUnicode_Type)
13746        return unicode_subtype_new(type, args, kwds);
13747    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
13748                                     kwlist, &x, &encoding, &errors))
13749        return NULL;
13750    if (x == NULL)
13751        return PyUnicode_New(0, 0);
13752    if (encoding == NULL && errors == NULL)
13753        return PyObject_Str(x);
13754    else
13755        return PyUnicode_FromEncodedObject(x, encoding, errors);
13756}
13757
13758static PyObject *
13759unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13760{
13761    PyObject *unicode, *self;
13762    Py_ssize_t length, char_size;
13763    int share_wstr, share_utf8;
13764    unsigned int kind;
13765    void *data;
13766
13767    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13768
13769    unicode = unicode_new(&PyUnicode_Type, args, kwds);
13770    if (unicode == NULL)
13771        return NULL;
13772    assert(_PyUnicode_CHECK(unicode));
13773    if (PyUnicode_READY(unicode))
13774        return NULL;
13775
13776    self = type->tp_alloc(type, 0);
13777    if (self == NULL) {
13778        Py_DECREF(unicode);
13779        return NULL;
13780    }
13781    kind = PyUnicode_KIND(unicode);
13782    length = PyUnicode_GET_LENGTH(unicode);
13783
13784    _PyUnicode_LENGTH(self) = length;
13785#ifdef Py_DEBUG
13786    _PyUnicode_HASH(self) = -1;
13787#else
13788    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13789#endif
13790    _PyUnicode_STATE(self).interned = 0;
13791    _PyUnicode_STATE(self).kind = kind;
13792    _PyUnicode_STATE(self).compact = 0;
13793    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13794    _PyUnicode_STATE(self).ready = 1;
13795    _PyUnicode_WSTR(self) = NULL;
13796    _PyUnicode_UTF8_LENGTH(self) = 0;
13797    _PyUnicode_UTF8(self) = NULL;
13798    _PyUnicode_WSTR_LENGTH(self) = 0;
13799    _PyUnicode_DATA_ANY(self) = NULL;
13800
13801    share_utf8 = 0;
13802    share_wstr = 0;
13803    if (kind == PyUnicode_1BYTE_KIND) {
13804        char_size = 1;
13805        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13806            share_utf8 = 1;
13807    }
13808    else if (kind == PyUnicode_2BYTE_KIND) {
13809        char_size = 2;
13810        if (sizeof(wchar_t) == 2)
13811            share_wstr = 1;
13812    }
13813    else {
13814        assert(kind == PyUnicode_4BYTE_KIND);
13815        char_size = 4;
13816        if (sizeof(wchar_t) == 4)
13817            share_wstr = 1;
13818    }
13819
13820    /* Ensure we won't overflow the length. */
13821    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13822        PyErr_NoMemory();
13823        goto onError;
13824    }
13825    data = PyObject_MALLOC((length + 1) * char_size);
13826    if (data == NULL) {
13827        PyErr_NoMemory();
13828        goto onError;
13829    }
13830
13831    _PyUnicode_DATA_ANY(self) = data;
13832    if (share_utf8) {
13833        _PyUnicode_UTF8_LENGTH(self) = length;
13834        _PyUnicode_UTF8(self) = data;
13835    }
13836    if (share_wstr) {
13837        _PyUnicode_WSTR_LENGTH(self) = length;
13838        _PyUnicode_WSTR(self) = (wchar_t *)data;
13839    }
13840
13841    Py_MEMCPY(data, PyUnicode_DATA(unicode),
13842              kind * (length + 1));
13843    assert(_PyUnicode_CheckConsistency(self, 1));
13844#ifdef Py_DEBUG
13845    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13846#endif
13847    Py_DECREF(unicode);
13848    return self;
13849
13850onError:
13851    Py_DECREF(unicode);
13852    Py_DECREF(self);
13853    return NULL;
13854}
13855
/* Docstring of the str type; referenced below as the tp_doc slot of
   PyUnicode_Type. */
13856PyDoc_STRVAR(unicode_doc,
13857             "str(string[, encoding[, errors]]) -> str\n\
13858\n\
13859Create a new string object from the given encoded string.\n\
13860encoding defaults to the current default string encoding.\n\
13861errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
13862
13863static PyObject *unicode_iter(PyObject *seq);
13864
/* Type object for str.  The type can be subclassed (Py_TPFLAGS_BASETYPE);
   instances are created through unicode_new and iterated with
   unicode_iter. */
13865PyTypeObject PyUnicode_Type = {
13866    PyVarObject_HEAD_INIT(&PyType_Type, 0)
13867    "str",              /* tp_name */
13868    sizeof(PyUnicodeObject),        /* tp_basicsize */
13869    0,                  /* tp_itemsize */
13870    /* Slots */
13871    (destructor)unicode_dealloc,    /* tp_dealloc */
13872    0,                  /* tp_print */
13873    0,                  /* tp_getattr */
13874    0,                  /* tp_setattr */
13875    0,                  /* tp_reserved */
13876    unicode_repr,           /* tp_repr */
13877    &unicode_as_number,         /* tp_as_number */
13878    &unicode_as_sequence,       /* tp_as_sequence */
13879    &unicode_as_mapping,        /* tp_as_mapping */
13880    (hashfunc) unicode_hash,        /* tp_hash*/
13881    0,                  /* tp_call*/
13882    (reprfunc) unicode_str,     /* tp_str */
13883    PyObject_GenericGetAttr,        /* tp_getattro */
13884    0,                  /* tp_setattro */
13885    0,                  /* tp_as_buffer */
13886    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
13887    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
13888    unicode_doc,            /* tp_doc */
13889    0,                  /* tp_traverse */
13890    0,                  /* tp_clear */
13891    PyUnicode_RichCompare,      /* tp_richcompare */
13892    0,                  /* tp_weaklistoffset */
13893    unicode_iter,           /* tp_iter */
13894    0,                  /* tp_iternext */
13895    unicode_methods,            /* tp_methods */
13896    0,                  /* tp_members */
13897    0,                  /* tp_getset */
13898    &PyBaseObject_Type,         /* tp_base */
13899    0,                  /* tp_dict */
13900    0,                  /* tp_descr_get */
13901    0,                  /* tp_descr_set */
13902    0,                  /* tp_dictoffset */
13903    0,                  /* tp_init */
13904    0,                  /* tp_alloc */
13905    unicode_new,            /* tp_new */
13906    PyObject_Del,           /* tp_free */
13907};
13908
13909/* Initialize the Unicode implementation */
13910
13911int _PyUnicode_Init(void)
13912{
13913    int i;
13914
13915    /* XXX - move this array to unicodectype.c ? */
13916    Py_UCS2 linebreak[] = {
13917        0x000A, /* LINE FEED */
13918        0x000D, /* CARRIAGE RETURN */
13919        0x001C, /* FILE SEPARATOR */
13920        0x001D, /* GROUP SEPARATOR */
13921        0x001E, /* RECORD SEPARATOR */
13922        0x0085, /* NEXT LINE */
13923        0x2028, /* LINE SEPARATOR */
13924        0x2029, /* PARAGRAPH SEPARATOR */
13925    };
13926
13927    /* Init the implementation */
13928    unicode_empty = PyUnicode_New(0, 0);
13929    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
13930    if (!unicode_empty)
13931        Py_FatalError("Can't create empty string");
13932
13933    for (i = 0; i < 256; i++)
13934        unicode_latin1[i] = NULL;
13935    if (PyType_Ready(&PyUnicode_Type) < 0)
13936        Py_FatalError("Can't initialize 'unicode'");
13937
13938    /* initialize the linebreak bloom filter */
13939    bloom_linebreak = make_bloom_mask(
13940        PyUnicode_2BYTE_KIND, linebreak,
13941        Py_ARRAY_LENGTH(linebreak));
13942
13943    PyType_Ready(&EncodingMapType);
13944
13945#ifdef HAVE_MBCS
13946    winver.dwOSVersionInfoSize = sizeof(winver);
13947    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13948        PyErr_SetFromWindowsErr(0);
13949        return -1;
13950    }
13951#endif
13952    return 0;
13953}
13954
13955/* Finalize the Unicode implementation */
13956
/* Free-list clearing entry point.  This implementation releases nothing
   and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13962
13963void
13964_PyUnicode_Fini(void)
13965{
13966    int i;
13967
13968    Py_XDECREF(unicode_empty);
13969    unicode_empty = NULL;
13970
13971    for (i = 0; i < 256; i++) {
13972        if (unicode_latin1[i]) {
13973            Py_DECREF(unicode_latin1[i]);
13974            unicode_latin1[i] = NULL;
13975        }
13976    }
13977    _PyUnicode_ClearStaticStrings();
13978    (void)PyUnicode_ClearFreeList();
13979}
13980
/* Intern the string referenced by *p, in place.

   If an equal string is already stored in the `interned` dict, *p is
   replaced by a new reference to that string and the reference to the
   original object is dropped.  Otherwise the string is inserted into the
   dict and marked SSTATE_INTERNED_MORTAL.

   Instances of str subclasses and strings that are already interned are
   left untouched.  Errors raised while creating or updating the dict are
   cleared, in which case the string is simply left un-interned. */
13981void
13982PyUnicode_InternInPlace(PyObject **p)
13983{
13984    register PyObject *s = *p;
13985    PyObject *t;
13986#ifdef Py_DEBUG
13987    assert(s != NULL);
13988    assert(_PyUnicode_CHECK(s));
13989#else
    /* Release builds silently ignore NULL and non-str arguments. */
13990    if (s == NULL || !PyUnicode_Check(s))
13991        return;
13992#endif
13993    /* If it's a subclass, we don't really know what putting
13994       it in the interned dict might do. */
13995    if (!PyUnicode_CheckExact(s))
13996        return;
13997    if (PyUnicode_CHECK_INTERNED(s))
13998        return;
13999    if (_PyUnicode_READY_REPLACE(p)) {
14000        assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
14001        return;
14002    }
    /* Re-read *p: it is passed by address to _PyUnicode_READY_REPLACE above,
       so it is not assumed to still be the object loaded at entry. */
14003    s = *p;
    /* The dict of interned strings is created lazily on first use. */
14004    if (interned == NULL) {
14005        interned = PyDict_New();
14006        if (interned == NULL) {
14007            PyErr_Clear(); /* Don't leave an exception */
14008            return;
14009        }
14010    }
14011    /* It might be that the GetItem call fails even
14012       though the key is present in the dictionary,
14013       namely when this happens during a stack overflow. */
14014    Py_ALLOW_RECURSION
14015    t = PyDict_GetItem(interned, s);
14016    Py_END_ALLOW_RECURSION
14017
        /* NOTE(review): the extra indentation below is only cosmetic; this
           is ordinary function-level code. */
        /* An equal string is already interned: hand that one back. */
14018        if (t) {
14019            Py_INCREF(t);
14020            Py_DECREF(*p);
14021            *p = t;
14022            return;
14023        }
14024
    /* The string serves as both key and value of its own dict entry. */
14025    PyThreadState_GET()->recursion_critical = 1;
14026    if (PyDict_SetItem(interned, s, s) < 0) {
14027        PyErr_Clear();
14028        PyThreadState_GET()->recursion_critical = 0;
14029        return;
14030    }
14031    PyThreadState_GET()->recursion_critical = 0;
14032    /* The two references in interned are not counted by refcnt.
14033       The deallocator will take care of this */
14034    Py_REFCNT(s) -= 2;
14035    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
14036}
14037
14038void
14039PyUnicode_InternImmortal(PyObject **p)
14040{
14041    PyUnicode_InternInPlace(p);
14042    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
14043        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
14044        Py_INCREF(*p);
14045    }
14046}
14047
14048PyObject *
14049PyUnicode_InternFromString(const char *cp)
14050{
14051    PyObject *s = PyUnicode_FromString(cp);
14052    if (s == NULL)
14053        return NULL;
14054    PyUnicode_InternInPlace(&s);
14055    return s;
14056}
14057
/* Undo interning for every key of the `interned` dict, then clear and
   release the dict itself.

   For each string, the references that interning left uncounted are added
   back (+1 for immortal strings, +2 for mortal ones) and the string is
   marked SSTATE_NOT_INTERNED.  The number of strings and the accumulated
   lengths of mortal/immortal strings are written to stderr.  Does nothing
   when no dict exists or its keys cannot be listed. */
14058void
14059_Py_ReleaseInternedUnicodeStrings(void)
14060{
14061    PyObject *keys;
14062    PyObject *s;
14063    Py_ssize_t i, n;
14064    Py_ssize_t immortal_size = 0, mortal_size = 0;
14065
14066    if (interned == NULL || !PyDict_Check(interned))
14067        return;
14068    keys = PyDict_Keys(interned);
14069    if (keys == NULL || !PyList_Check(keys)) {
14070        PyErr_Clear();
14071        return;
14072    }
14073
14074    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14075       detector, interned unicode strings are not forcibly deallocated;
14076       rather, we give them their stolen references back, and then clear
14077       and DECREF the interned dict. */
14078
14079    n = PyList_GET_SIZE(keys);
14080    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
14081            n);
14082    for (i = 0; i < n; i++) {
        /* Borrowed reference; `keys` keeps the string alive. */
14083        s = PyList_GET_ITEM(keys, i);
14084        if (PyUnicode_READY(s) == -1) {
14085            assert(0 && "could not ready string");
14086            fprintf(stderr, "could not ready string\n");
14087        }
14088        switch (PyUnicode_CHECK_INTERNED(s)) {
14089        case SSTATE_NOT_INTERNED:
14090            /* XXX Shouldn't happen */
14091            break;
14092        case SSTATE_INTERNED_IMMORTAL:
            /* One reference is restored for an immortal string. */
14093            Py_REFCNT(s) += 1;
14094            immortal_size += PyUnicode_GET_LENGTH(s);
14095            break;
14096        case SSTATE_INTERNED_MORTAL:
            /* Two references are restored for a mortal string. */
14097            Py_REFCNT(s) += 2;
14098            mortal_size += PyUnicode_GET_LENGTH(s);
14099            break;
14100        default:
14101            Py_FatalError("Inconsistent interned string state.");
14102        }
14103        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
14104    }
14105    fprintf(stderr, "total size of all interned strings: "
14106            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14107            "mortal/immortal\n", mortal_size, immortal_size);
14108    Py_DECREF(keys);
14109    PyDict_Clear(interned);
14110    Py_DECREF(interned);
14111    interned = NULL;
14112}
14113
14114
14115/********************* Unicode Iterator **************************/
14116
/* State of an iterator over a str object. */
14117typedef struct {
14118    PyObject_HEAD
    /* Position within it_seq of the next character to return */
14119    Py_ssize_t it_index;
14120    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14121} unicodeiterobject;
14122
/* Destructor for str iterators: stop GC tracking, drop the reference to
   the iterated string (it_seq may already be NULL) and free the iterator. */
14123static void
14124unicodeiter_dealloc(unicodeiterobject *it)
14125{
14126    _PyObject_GC_UNTRACK(it);
14127    Py_XDECREF(it->it_seq);
14128    PyObject_GC_Del(it);
14129}
14130
/* GC traversal hook: reports it_seq to the visit callback.  Returns 0
   when done. */
14131static int
14132unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14133{
14134    Py_VISIT(it->it_seq);
14135    return 0;
14136}
14137
14138static PyObject *
14139unicodeiter_next(unicodeiterobject *it)
14140{
14141    PyObject *seq, *item;
14142
14143    assert(it != NULL);
14144    seq = it->it_seq;
14145    if (seq == NULL)
14146        return NULL;
14147    assert(_PyUnicode_CHECK(seq));
14148
14149    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14150        int kind = PyUnicode_KIND(seq);
14151        void *data = PyUnicode_DATA(seq);
14152        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14153        item = PyUnicode_FromOrdinal(chr);
14154        if (item != NULL)
14155            ++it->it_index;
14156        return item;
14157    }
14158
14159    Py_DECREF(seq);
14160    it->it_seq = NULL;
14161    return NULL;
14162}
14163
14164static PyObject *
14165unicodeiter_len(unicodeiterobject *it)
14166{
14167    Py_ssize_t len = 0;
14168    if (it->it_seq)
14169        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14170    return PyLong_FromSsize_t(len);
14171}
14172
14173PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14174
/* Method table of str iterators; it only provides __length_hint__,
   implemented by unicodeiter_len. */
14175static PyMethodDef unicodeiter_methods[] = {
14176    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
14177     length_hint_doc},
14178    {NULL,      NULL}       /* sentinel */
14179};
14180
/* Type object of the iterator created by unicode_iter.  Instances are
   GC-aware (Py_TPFLAGS_HAVE_GC) and are their own iterator. */
14181PyTypeObject PyUnicodeIter_Type = {
14182    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14183    "str_iterator",         /* tp_name */
14184    sizeof(unicodeiterobject),      /* tp_basicsize */
14185    0,                  /* tp_itemsize */
14186    /* methods */
14187    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
14188    0,                  /* tp_print */
14189    0,                  /* tp_getattr */
14190    0,                  /* tp_setattr */
14191    0,                  /* tp_reserved */
14192    0,                  /* tp_repr */
14193    0,                  /* tp_as_number */
14194    0,                  /* tp_as_sequence */
14195    0,                  /* tp_as_mapping */
14196    0,                  /* tp_hash */
14197    0,                  /* tp_call */
14198    0,                  /* tp_str */
14199    PyObject_GenericGetAttr,        /* tp_getattro */
14200    0,                  /* tp_setattro */
14201    0,                  /* tp_as_buffer */
14202    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14203    0,                  /* tp_doc */
14204    (traverseproc)unicodeiter_traverse, /* tp_traverse */
14205    0,                  /* tp_clear */
14206    0,                  /* tp_richcompare */
14207    0,                  /* tp_weaklistoffset */
14208    PyObject_SelfIter,          /* tp_iter */
14209    (iternextfunc)unicodeiter_next,     /* tp_iternext */
14210    unicodeiter_methods,            /* tp_methods */
14211    0,                  /* tp_members */
14212};
14213
14214static PyObject *
14215unicode_iter(PyObject *seq)
14216{
14217    unicodeiterobject *it;
14218
14219    if (!PyUnicode_Check(seq)) {
14220        PyErr_BadInternalCall();
14221        return NULL;
14222    }
14223    if (PyUnicode_READY(seq) == -1)
14224        return NULL;
14225    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14226    if (it == NULL)
14227        return NULL;
14228    it->it_index = 0;
14229    Py_INCREF(seq);
14230    it->it_seq = seq;
14231    _PyObject_GC_TRACK(it);
14232    return (PyObject *)it;
14233}
14234
14235
14236size_t
14237Py_UNICODE_strlen(const Py_UNICODE *u)
14238{
14239    int res = 0;
14240    while(*u++)
14241        res++;
14242    return res;
14243}
14244
14245Py_UNICODE*
14246Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14247{
14248    Py_UNICODE *u = s1;
14249    while ((*u++ = *s2++));
14250    return s1;
14251}
14252
14253Py_UNICODE*
14254Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14255{
14256    Py_UNICODE *u = s1;
14257    while ((*u++ = *s2++))
14258        if (n-- == 0)
14259            break;
14260    return s1;
14261}
14262
14263Py_UNICODE*
14264Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14265{
14266    Py_UNICODE *u1 = s1;
14267    u1 += Py_UNICODE_strlen(u1);
14268    Py_UNICODE_strcpy(u1, s2);
14269    return s1;
14270}
14271
14272int
14273Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14274{
14275    while (*s1 && *s2 && *s1 == *s2)
14276        s1++, s2++;
14277    if (*s1 && *s2)
14278        return (*s1 < *s2) ? -1 : +1;
14279    if (*s1)
14280        return 1;
14281    if (*s2)
14282        return -1;
14283    return 0;
14284}
14285
14286int
14287Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14288{
14289    register Py_UNICODE u1, u2;
14290    for (; n != 0; n--) {
14291        u1 = *s1;
14292        u2 = *s2;
14293        if (u1 != u2)
14294            return (u1 < u2) ? -1 : +1;
14295        if (u1 == '\0')
14296            return 0;
14297        s1++;
14298        s2++;
14299    }
14300    return 0;
14301}
14302
14303Py_UNICODE*
14304Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14305{
14306    const Py_UNICODE *p;
14307    for (p = s; *p; p++)
14308        if (*p == c)
14309            return (Py_UNICODE*)p;
14310    return NULL;
14311}
14312
14313Py_UNICODE*
14314Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14315{
14316    const Py_UNICODE *p;
14317    p = s + Py_UNICODE_strlen(s);
14318    while (p != s) {
14319        p--;
14320        if (*p == c)
14321            return (Py_UNICODE*)p;
14322    }
14323    return NULL;
14324}
14325
14326Py_UNICODE*
14327PyUnicode_AsUnicodeCopy(PyObject *unicode)
14328{
14329    Py_UNICODE *u, *copy;
14330    Py_ssize_t len, size;
14331
14332    if (!PyUnicode_Check(unicode)) {
14333        PyErr_BadArgument();
14334        return NULL;
14335    }
14336    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14337    if (u == NULL)
14338        return NULL;
14339    /* Ensure we won't overflow the size. */
14340    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14341        PyErr_NoMemory();
14342        return NULL;
14343    }
14344    size = len + 1; /* copy the null character */
14345    size *= sizeof(Py_UNICODE);
14346    copy = PyMem_Malloc(size);
14347    if (copy == NULL) {
14348        PyErr_NoMemory();
14349        return NULL;
14350    }
14351    memcpy(copy, u, size);
14352    return copy;
14353}
14354
14355/* A _string module, to export formatter_parser and formatter_field_name_split
14356   to the string.Formatter class implemented in Python. */
14357
/* Function table of the _string module; both entries take exactly one
   argument (METH_O). */
14358static PyMethodDef _string_methods[] = {
14359    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14360     METH_O, PyDoc_STR("split the argument as a field name")},
14361    {"formatter_parser", (PyCFunction) formatter_parser,
14362     METH_O, PyDoc_STR("parse the argument as a format string")},
14363    {NULL, NULL}
14364};
14365
/* Module definition of _string, passed to PyModule_Create() by
   PyInit__string. */
14366static struct PyModuleDef _string_module = {
14367    PyModuleDef_HEAD_INIT,
14368    "_string",
14369    PyDoc_STR("string helper module"),
14370    0,
14371    _string_methods,
14372    NULL,
14373    NULL,
14374    NULL,
14375    NULL
14376};
14377
14378PyMODINIT_FUNC
14379PyInit__string(void)
14380{
14381    return PyModule_Create(&_string_module);
14382}
14383
14384
14385#ifdef __cplusplus
14386}
14387#endif
14388