unicodeobject.c revision 84def3774d2079ea2a812e0220507ff0e27247e7
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44
45#ifdef MS_WINDOWS
46#include <windows.h>
47#endif
48
/* Byte-order switches: exactly one of BYTEORDER_IS_BIG_ENDIAN or
   BYTEORDER_IS_LITTLE_ENDIAN is defined.  Little endian is the default
   whenever WORDS_BIGENDIAN is not defined. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
56
57/* --- Globals ------------------------------------------------------------
58
59   The globals are initialized by the _PyUnicode_Init() API and should
60   not be used before calling that API.
61
62*/
63
64
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111 decimal). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the macros below: a plain type check in regular
   builds, a structural consistency check (without content scan) when
   Py_DEBUG is defined. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
77
/* --- Raw field accessors -------------------------------------------------
   These expand to the struct member itself (usable as an lvalue) and do
   not perform any check on the object. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors ---------------------------------------------------
   Same fields, but guarded by assertions on the object. */

/* UTF-8 pointer of a ready string.  For a compact ASCII string this is
   the memory that immediately follows the PyASCIIObject header;
   otherwise it is the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
          ? ((char*)((PyASCIIObject*)(op) + 1)) \
          : _PyUnicode_UTF8(op)))

/* UTF-8 length of a ready string.  For a compact ASCII string this is
   the length field; otherwise it is the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
          ? ((PyASCIIObject*)(op))->length \
          : _PyUnicode_UTF8_LENGTH(op)))

/* state.kind, after asserting that op passes _PyUnicode_CHECK. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)

/* length field, after asserting that op passes _PyUnicode_CHECK. */
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)
112
/* Local override of PyUnicode_READY: any earlier definition is dropped and
   replaced by one whose sanity check is _PyUnicode_CHECK.  Evaluates to 0
   when op is already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
119
/* True if the utf8 pointer of op aliases the canonical character data
   (PyUnicode_DATA).  Must not be used on compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op aliases the canonical character data
   (PyUnicode_DATA).  Only the macro parameter is referenced, so the macro
   works whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a UTF-8 memory block of its own, i.e.
   the object is not compact ASCII, its utf8 pointer is set, and that
   pointer is not shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) && \
      _PyUnicode_UTF8(op) != NULL && \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own, i.e.
   its wstr pointer is set and is not a ready string's character data.
   PyUnicode_DATA is only evaluated when the string is ready. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) != NULL && \
      !(PyUnicode_IS_READY(op) && \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once.  "to" is parenthesized before
   the cast, so an expression such as "buf + offset" is evaluated in its
   own pointer type first and only then converted to "to_type *".  The
   main loop copies four characters per iteration; the tail loop handles
   the remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
167
168/* The Unicode string has been modified: reset the hash */
169#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
170
171/* This dictionary holds all interned unicode strings.  Note that references
172   to strings in this dictionary are *not* counted in the string's ob_refcnt.
173   When the interned string reaches a refcnt of 0 the string deallocation
174   function will delete the reference from this dictionary.
175
176   Another way to look at this is that to say that the actual reference
177   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
178*/
179static PyObject *interned;
180
181/* The empty Unicode object is shared to improve performance. */
182static PyObject *unicode_empty;
183
184/* List of static strings. */
185static _Py_Identifier *static_strings;
186
187/* Single character Unicode strings in the Latin-1 range are being
188   shared as well. */
189static PyObject *unicode_latin1[256];
190
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry i is 1 when ASCII code i is whitespace, else 0.
   Sixteen entries per row; the row's first code is given on the left.

   Whitespace codes:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN        0x1C FILE SEPARATOR
     0x1D GROUP SEPARATOR        0x1E RECORD SEPARATOR
     0x1F UNIT SEPARATOR         0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
221
222/* forward */
223static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
224static PyObject* get_latin1_char(unsigned char ch);
225static void copy_characters(
226    PyObject *to, Py_ssize_t to_start,
227    PyObject *from, Py_ssize_t from_start,
228    Py_ssize_t how_many);
229
230static PyObject *
231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
240unicode_encode_call_errorhandler(const char *errors,
241       PyObject **errorHandler,const char *encoding, const char *reason,
242       PyObject *unicode, PyObject **exceptionObject,
243       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
245static void
246raise_encode_exception(PyObject **exceptionObject,
247                       const char *encoding,
248                       PyObject *unicode,
249                       Py_ssize_t startpos, Py_ssize_t endpos,
250                       const char *reason);
251
/* Lookup table for fast detection of line break characters: entry i is 1
   when ASCII code i is a line break, else 0.  Sixteen entries per row;
   the row's first code is given on the left.

   Line break codes:
     0x0A LINE FEED          0x0B LINE TABULATION
     0x0C FORM FEED          0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR     0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
279
280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281   This function is kept for backward compatibility with the old API. */
282Py_UNICODE
283PyUnicode_GetMax(void)
284{
285#ifdef Py_UNICODE_WIDE
286    return 0x10FFFF;
287#else
288    /* This is actually an illegal character, so it should
289       not be passed to unichr. */
290    return 0xFFFF;
291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a Unicode object.

   op            -- object to inspect; must pass PyUnicode_Check().
   check_content -- when non-zero (and the string is not in the wchar
                    kind), also scan every character and verify that the
                    narrowest possible kind is in use.

   Every violated invariant triggers an assert(); the function itself
   always returns 1, so it can be wrapped in assert() by callers. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        /* Everything except compact ASCII has the compact header fields. */
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* The kind whose character width equals sizeof(wchar_t). */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact != 1) {
            /* Legacy (non-compact) object: data lives behind a pointer. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* Legacy string that has been made ready. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii)
                    assert (compact->utf8 != data);
                else {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
            }
            else {
                /* Not ready yet: only the wstr buffer is populated. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* Compact non-ASCII: characters follow the compact header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        /* wstr aliases the data exactly when the kind is wchar_t-sized. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* A missing cache must have a zero cached length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    else {
        /* Compact ASCII. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i = 0;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
            i++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
405
406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410    Py_ssize_t len;
411
412    assert(Py_REFCNT(unicode) == 1);
413
414    len = _PyUnicode_WSTR_LENGTH(unicode);
415    if (len == 0) {
416        Py_INCREF(unicode_empty);
417        Py_DECREF(unicode);
418        return unicode_empty;
419    }
420
421    if (len == 1) {
422        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423        if (ch < 256) {
424            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425            Py_DECREF(unicode);
426            return latin1_char;
427        }
428    }
429
430    if (_PyUnicode_Ready(unicode) < 0) {
431        Py_XDECREF(unicode);
432        return NULL;
433    }
434#else
435    /* don't make the result ready in debug mode to ensure that the caller
436       makes the string ready before using it */
437    assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439    return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445    Py_ssize_t length;
446
447    length = PyUnicode_GET_LENGTH(unicode);
448    if (length == 0) {
449        if (unicode != unicode_empty) {
450            Py_INCREF(unicode_empty);
451            Py_DECREF(unicode);
452        }
453        return unicode_empty;
454    }
455
456    if (length == 1) {
457        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458        if (ch < 256) {
459            PyObject *latin1_char = unicode_latin1[ch];
460            if (latin1_char != NULL) {
461                if (unicode != latin1_char) {
462                    Py_INCREF(latin1_char);
463                    Py_DECREF(unicode);
464                }
465                return latin1_char;
466            }
467            else {
468                assert(_PyUnicode_CheckConsistency(unicode, 1));
469                Py_INCREF(unicode);
470                unicode_latin1[ch] = unicode;
471                return unicode;
472            }
473        }
474    }
475
476    assert(_PyUnicode_CheckConsistency(unicode, 1));
477    return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483    assert(_PyUnicode_CHECK(unicode));
484    if (PyUnicode_IS_READY(unicode))
485        return unicode_result_ready(unicode);
486    else
487        return unicode_result_wchar(unicode);
488}
489
/* OSVERSIONINFOEX record; only compiled in when HAVE_MBCS is defined. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
493
494/* --- Bloom Filters ----------------------------------------------------- */
495
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the low
   log2(BLOOM_WIDTH) bits of each Unicode character (5, 6 or 7 bits for a
   width of 32, 64 or 128) as the bit index. */
499
500/* the linebreak mask is set up by Unicode_Init below */
501
502#if LONG_BIT >= 128
503#define BLOOM_WIDTH 128
504#elif LONG_BIT >= 64
505#define BLOOM_WIDTH 64
506#elif LONG_BIT >= 32
507#define BLOOM_WIDTH 32
508#else
509#error "LONG_BIT is smaller than 32"
510#endif
511
/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask used by BLOOM_LINEBREAK. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   (ch modulo BLOOM_WIDTH).  BLOOM evaluates to a non-zero value when the
   bit is set.  The mask argument is parenthesized so that any expression
   can be passed (BLOOM_ADD still needs a modifiable lvalue). */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak[]; for the others the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
522
523Py_LOCAL_INLINE(BLOOM_MASK)
524make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
525{
526    /* calculate simple bloom-style bitmask for a given unicode string */
527
528    BLOOM_MASK mask;
529    Py_ssize_t i;
530
531    mask = 0;
532    for (i = 0; i < len; i++)
533        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
534
535    return mask;
536}
537
/* Membership test: true if chr occurs in the Unicode object str.  The
   bloom mask is checked first, so PyUnicode_FindChar() only runs when the
   mask bit for chr is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) != 0                                              \
     && (PyUnicode_FindChar(str, chr, 0,                                \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
541
542/* Compilation of templated routines */
543
544#include "stringlib/asciilib.h"
545#include "stringlib/fastsearch.h"
546#include "stringlib/partition.h"
547#include "stringlib/split.h"
548#include "stringlib/count.h"
549#include "stringlib/find.h"
550#include "stringlib/find_max_char.h"
551#include "stringlib/localeutil.h"
552#include "stringlib/undef.h"
553
554#include "stringlib/ucs1lib.h"
555#include "stringlib/fastsearch.h"
556#include "stringlib/partition.h"
557#include "stringlib/split.h"
558#include "stringlib/count.h"
559#include "stringlib/find.h"
560#include "stringlib/find_max_char.h"
561#include "stringlib/localeutil.h"
562#include "stringlib/undef.h"
563
564#include "stringlib/ucs2lib.h"
565#include "stringlib/fastsearch.h"
566#include "stringlib/partition.h"
567#include "stringlib/split.h"
568#include "stringlib/count.h"
569#include "stringlib/find.h"
570#include "stringlib/find_max_char.h"
571#include "stringlib/localeutil.h"
572#include "stringlib/undef.h"
573
574#include "stringlib/ucs4lib.h"
575#include "stringlib/fastsearch.h"
576#include "stringlib/partition.h"
577#include "stringlib/split.h"
578#include "stringlib/count.h"
579#include "stringlib/find.h"
580#include "stringlib/find_max_char.h"
581#include "stringlib/localeutil.h"
582#include "stringlib/undef.h"
583
584#include "stringlib/unicodedefs.h"
585#include "stringlib/fastsearch.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
588#include "stringlib/undef.h"
589
590/* --- Unicode Object ----------------------------------------------------- */
591
592static PyObject *
593fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
594
595Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
596                                     Py_ssize_t size, Py_UCS4 ch,
597                                     int direction)
598{
599    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
600
601    switch (kind) {
602    case PyUnicode_1BYTE_KIND:
603        {
604            Py_UCS1 ch1 = (Py_UCS1) ch;
605            if (ch1 == ch)
606                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
607            else
608                return -1;
609        }
610    case PyUnicode_2BYTE_KIND:
611        {
612            Py_UCS2 ch2 = (Py_UCS2) ch;
613            if (ch2 == ch)
614                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
615            else
616                return -1;
617        }
618    case PyUnicode_4BYTE_KIND:
619        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
620    default:
621        assert(0);
622        return -1;
623    }
624}
625
626static PyObject*
627resize_compact(PyObject *unicode, Py_ssize_t length)
628{
629    Py_ssize_t char_size;
630    Py_ssize_t struct_size;
631    Py_ssize_t new_size;
632    int share_wstr;
633    PyObject *new_unicode;
634
635    assert(PyUnicode_IS_READY(unicode));
636    char_size = PyUnicode_KIND(unicode);
637    if (PyUnicode_IS_COMPACT_ASCII(unicode))
638        struct_size = sizeof(PyASCIIObject);
639    else
640        struct_size = sizeof(PyCompactUnicodeObject);
641    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
642
643    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
644        Py_DECREF(unicode);
645        PyErr_NoMemory();
646        return NULL;
647    }
648    new_size = (struct_size + (length + 1) * char_size);
649
650    _Py_DEC_REFTOTAL;
651    _Py_ForgetReference(unicode);
652
653    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
654    if (new_unicode == NULL) {
655        PyObject_Del(unicode);
656        PyErr_NoMemory();
657        return NULL;
658    }
659    unicode = new_unicode;
660    _Py_NewReference(unicode);
661
662    _PyUnicode_LENGTH(unicode) = length;
663    if (share_wstr) {
664        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
665        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
666            _PyUnicode_WSTR_LENGTH(unicode) = length;
667    }
668    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
669                    length, 0);
670    return unicode;
671}
672
673static int
674resize_inplace(PyObject *unicode, Py_ssize_t length)
675{
676    wchar_t *wstr;
677    assert(!PyUnicode_IS_COMPACT(unicode));
678    assert(Py_REFCNT(unicode) == 1);
679
680    _PyUnicode_DIRTY(unicode);
681
682    if (PyUnicode_IS_READY(unicode)) {
683        Py_ssize_t char_size;
684        Py_ssize_t new_size;
685        int share_wstr, share_utf8;
686        void *data;
687
688        data = _PyUnicode_DATA_ANY(unicode);
689        assert(data != NULL);
690        char_size = PyUnicode_KIND(unicode);
691        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
692        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
693        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
694        {
695            PyObject_DEL(_PyUnicode_UTF8(unicode));
696            _PyUnicode_UTF8(unicode) = NULL;
697            _PyUnicode_UTF8_LENGTH(unicode) = 0;
698        }
699
700        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
701            PyErr_NoMemory();
702            return -1;
703        }
704        new_size = (length + 1) * char_size;
705
706        data = (PyObject *)PyObject_REALLOC(data, new_size);
707        if (data == NULL) {
708            PyErr_NoMemory();
709            return -1;
710        }
711        _PyUnicode_DATA_ANY(unicode) = data;
712        if (share_wstr) {
713            _PyUnicode_WSTR(unicode) = data;
714            _PyUnicode_WSTR_LENGTH(unicode) = length;
715        }
716        if (share_utf8) {
717            _PyUnicode_UTF8(unicode) = data;
718            _PyUnicode_UTF8_LENGTH(unicode) = length;
719        }
720        _PyUnicode_LENGTH(unicode) = length;
721        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
722        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
723            assert(_PyUnicode_CheckConsistency(unicode, 0));
724            return 0;
725        }
726    }
727    assert(_PyUnicode_WSTR(unicode) != NULL);
728
729    /* check for integer overflow */
730    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
731        PyErr_NoMemory();
732        return -1;
733    }
734    wstr =  _PyUnicode_WSTR(unicode);
735    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
736    if (!wstr) {
737        PyErr_NoMemory();
738        return -1;
739    }
740    _PyUnicode_WSTR(unicode) = wstr;
741    _PyUnicode_WSTR(unicode)[length] = 0;
742    _PyUnicode_WSTR_LENGTH(unicode) = length;
743    assert(_PyUnicode_CheckConsistency(unicode, 0));
744    return 0;
745}
746
747static PyObject*
748resize_copy(PyObject *unicode, Py_ssize_t length)
749{
750    Py_ssize_t copy_length;
751    if (PyUnicode_IS_COMPACT(unicode)) {
752        PyObject *copy;
753        assert(PyUnicode_IS_READY(unicode));
754
755        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
756        if (copy == NULL)
757            return NULL;
758
759        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
760        copy_characters(copy, 0, unicode, 0, copy_length);
761        return copy;
762    }
763    else {
764        PyObject *w;
765        assert(_PyUnicode_WSTR(unicode) != NULL);
766        assert(_PyUnicode_DATA_ANY(unicode) == NULL);
767        w = (PyObject*)_PyUnicode_New(length);
768        if (w == NULL)
769            return NULL;
770        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
771        copy_length = Py_MIN(copy_length, length);
772        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
773                        copy_length);
774        return w;
775    }
776}
777
778/* We allocate one more byte to make sure the string is
779   Ux0000 terminated; some code (e.g. new_identifier)
780   relies on that.
781
782   XXX This allocator could further be enhanced by assuring that the
783   free list never reduces its size below 1.
784
785*/
786
787#ifdef Py_DEBUG
788static int unicode_old_new_calls = 0;
789#endif
790
791static PyUnicodeObject *
792_PyUnicode_New(Py_ssize_t length)
793{
794    register PyUnicodeObject *unicode;
795    size_t new_size;
796
797    /* Optimization for empty strings */
798    if (length == 0 && unicode_empty != NULL) {
799        Py_INCREF(unicode_empty);
800        return (PyUnicodeObject*)unicode_empty;
801    }
802
803    /* Ensure we won't overflow the size. */
804    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
805        return (PyUnicodeObject *)PyErr_NoMemory();
806    }
807    if (length < 0) {
808        PyErr_SetString(PyExc_SystemError,
809                        "Negative size passed to _PyUnicode_New");
810        return NULL;
811    }
812
813#ifdef Py_DEBUG
814    ++unicode_old_new_calls;
815#endif
816
817    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
818    if (unicode == NULL)
819        return NULL;
820    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
821    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
822    if (!_PyUnicode_WSTR(unicode)) {
823        PyErr_NoMemory();
824        goto onError;
825    }
826
827    /* Initialize the first element to guard against cases where
828     * the caller fails before initializing str -- unicode_resize()
829     * reads str[0], and the Keep-Alive optimization can keep memory
830     * allocated for str alive across a call to unicode_dealloc(unicode).
831     * We don't want unicode_resize to read uninitialized memory in
832     * that case.
833     */
834    _PyUnicode_WSTR(unicode)[0] = 0;
835    _PyUnicode_WSTR(unicode)[length] = 0;
836    _PyUnicode_WSTR_LENGTH(unicode) = length;
837    _PyUnicode_HASH(unicode) = -1;
838    _PyUnicode_STATE(unicode).interned = 0;
839    _PyUnicode_STATE(unicode).kind = 0;
840    _PyUnicode_STATE(unicode).compact = 0;
841    _PyUnicode_STATE(unicode).ready = 0;
842    _PyUnicode_STATE(unicode).ascii = 0;
843    _PyUnicode_DATA_ANY(unicode) = NULL;
844    _PyUnicode_LENGTH(unicode) = 0;
845    _PyUnicode_UTF8(unicode) = NULL;
846    _PyUnicode_UTF8_LENGTH(unicode) = 0;
847    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
848    return unicode;
849
850  onError:
851    /* XXX UNREF/NEWREF interface should be more symmetrical */
852    _Py_DEC_REFTOTAL;
853    _Py_ForgetReference((PyObject *)unicode);
854    PyObject_Del(unicode);
855    return NULL;
856}
857
858static const char*
859unicode_kind_name(PyObject *unicode)
860{
861    /* don't check consistency: unicode_kind_name() is called from
862       _PyUnicode_Dump() */
863    if (!PyUnicode_IS_COMPACT(unicode))
864    {
865        if (!PyUnicode_IS_READY(unicode))
866            return "wstr";
867        switch(PyUnicode_KIND(unicode))
868        {
869        case PyUnicode_1BYTE_KIND:
870            if (PyUnicode_IS_ASCII(unicode))
871                return "legacy ascii";
872            else
873                return "legacy latin1";
874        case PyUnicode_2BYTE_KIND:
875            return "legacy UCS2";
876        case PyUnicode_4BYTE_KIND:
877            return "legacy UCS4";
878        default:
879            return "<legacy invalid kind>";
880        }
881    }
882    assert(PyUnicode_IS_READY(unicode));
883    switch(PyUnicode_KIND(unicode))
884    {
885    case PyUnicode_1BYTE_KIND:
886        if (PyUnicode_IS_ASCII(unicode))
887            return "ascii";
888        else
889            return "latin1";
890    case PyUnicode_2BYTE_KIND:
891        return "UCS2";
892    case PyUnicode_4BYTE_KIND:
893        return "UCS4";
894    default:
895        return "<invalid compact kind>";
896    }
897}
898
899#ifdef Py_DEBUG
900static int unicode_new_new_calls = 0;
901
902/* Functions wrapping macros for use in debugger */
903char *_PyUnicode_utf8(void *unicode){
904    return PyUnicode_UTF8(unicode);
905}
906
907void *_PyUnicode_compact_data(void *unicode) {
908    return _PyUnicode_COMPACT_DATA(unicode);
909}
910void *_PyUnicode_data(void *unicode){
911    printf("obj %p\n", unicode);
912    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
913    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
914    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
915    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
916    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
917    return PyUnicode_DATA(unicode);
918}
919
920void
921_PyUnicode_Dump(PyObject *op)
922{
923    PyASCIIObject *ascii = (PyASCIIObject *)op;
924    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
925    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
926    void *data;
927
928    if (ascii->state.compact)
929    {
930        if (ascii->state.ascii)
931            data = (ascii + 1);
932        else
933            data = (compact + 1);
934    }
935    else
936        data = unicode->data.any;
937    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
938
939    if (ascii->wstr == data)
940        printf("shared ");
941    printf("wstr=%p", ascii->wstr);
942
943    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
944        printf(" (%zu), ", compact->wstr_length);
945        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
946            printf("shared ");
947        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
948    }
949    printf(", data=%p\n", data);
950}
951#endif
952
953PyObject *
954PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
955{
956    PyObject *obj;
957    PyCompactUnicodeObject *unicode;
958    void *data;
959    int kind_state;
960    int is_sharing, is_ascii;
961    Py_ssize_t char_size;
962    Py_ssize_t struct_size;
963
964    /* Optimization for empty strings */
965    if (size == 0 && unicode_empty != NULL) {
966        Py_INCREF(unicode_empty);
967        return unicode_empty;
968    }
969
970#ifdef Py_DEBUG
971    ++unicode_new_new_calls;
972#endif
973
974    is_ascii = 0;
975    is_sharing = 0;
976    struct_size = sizeof(PyCompactUnicodeObject);
977    if (maxchar < 128) {
978        kind_state = PyUnicode_1BYTE_KIND;
979        char_size = 1;
980        is_ascii = 1;
981        struct_size = sizeof(PyASCIIObject);
982    }
983    else if (maxchar < 256) {
984        kind_state = PyUnicode_1BYTE_KIND;
985        char_size = 1;
986    }
987    else if (maxchar < 65536) {
988        kind_state = PyUnicode_2BYTE_KIND;
989        char_size = 2;
990        if (sizeof(wchar_t) == 2)
991            is_sharing = 1;
992    }
993    else {
994        kind_state = PyUnicode_4BYTE_KIND;
995        char_size = 4;
996        if (sizeof(wchar_t) == 4)
997            is_sharing = 1;
998    }
999
1000    /* Ensure we won't overflow the size. */
1001    if (size < 0) {
1002        PyErr_SetString(PyExc_SystemError,
1003                        "Negative size passed to PyUnicode_New");
1004        return NULL;
1005    }
1006    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1007        return PyErr_NoMemory();
1008
1009    /* Duplicated allocation code from _PyObject_New() instead of a call to
1010     * PyObject_New() so we are able to allocate space for the object and
1011     * it's data buffer.
1012     */
1013    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1014    if (obj == NULL)
1015        return PyErr_NoMemory();
1016    obj = PyObject_INIT(obj, &PyUnicode_Type);
1017    if (obj == NULL)
1018        return NULL;
1019
1020    unicode = (PyCompactUnicodeObject *)obj;
1021    if (is_ascii)
1022        data = ((PyASCIIObject*)obj) + 1;
1023    else
1024        data = unicode + 1;
1025    _PyUnicode_LENGTH(unicode) = size;
1026    _PyUnicode_HASH(unicode) = -1;
1027    _PyUnicode_STATE(unicode).interned = 0;
1028    _PyUnicode_STATE(unicode).kind = kind_state;
1029    _PyUnicode_STATE(unicode).compact = 1;
1030    _PyUnicode_STATE(unicode).ready = 1;
1031    _PyUnicode_STATE(unicode).ascii = is_ascii;
1032    if (is_ascii) {
1033        ((char*)data)[size] = 0;
1034        _PyUnicode_WSTR(unicode) = NULL;
1035    }
1036    else if (kind_state == PyUnicode_1BYTE_KIND) {
1037        ((char*)data)[size] = 0;
1038        _PyUnicode_WSTR(unicode) = NULL;
1039        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1040        unicode->utf8 = NULL;
1041        unicode->utf8_length = 0;
1042        }
1043    else {
1044        unicode->utf8 = NULL;
1045        unicode->utf8_length = 0;
1046        if (kind_state == PyUnicode_2BYTE_KIND)
1047            ((Py_UCS2*)data)[size] = 0;
1048        else /* kind_state == PyUnicode_4BYTE_KIND */
1049            ((Py_UCS4*)data)[size] = 0;
1050        if (is_sharing) {
1051            _PyUnicode_WSTR_LENGTH(unicode) = size;
1052            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1053        }
1054        else {
1055            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1056            _PyUnicode_WSTR(unicode) = NULL;
1057        }
1058    }
1059    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1060    return obj;
1061}
1062
1063#if SIZEOF_WCHAR_T == 2
1064/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1065   will decode surrogate pairs, the other conversions are implemented as macros
1066   for efficiency.
1067
1068   This function assumes that unicode can hold one more code point than wstr
1069   characters for a terminating null character. */
1070static void
1071unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1072                              PyObject *unicode)
1073{
1074    const wchar_t *iter;
1075    Py_UCS4 *ucs4_out;
1076
1077    assert(unicode != NULL);
1078    assert(_PyUnicode_CHECK(unicode));
1079    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1080    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1081
1082    for (iter = begin; iter < end; ) {
1083        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1084                           _PyUnicode_GET_LENGTH(unicode)));
1085        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1086            && (iter+1) < end
1087            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1088        {
1089            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1090            iter += 2;
1091        }
1092        else {
1093            *ucs4_out++ = *iter;
1094            iter++;
1095        }
1096    }
1097    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1098                        _PyUnicode_GET_LENGTH(unicode)));
1099
1100}
1101#endif
1102
1103static int
1104_PyUnicode_Dirty(PyObject *unicode)
1105{
1106    assert(_PyUnicode_CHECK(unicode));
1107    if (Py_REFCNT(unicode) != 1) {
1108        PyErr_SetString(PyExc_SystemError,
1109                        "Cannot modify a string having more than 1 reference");
1110        return -1;
1111    }
1112    _PyUnicode_DIRTY(unicode);
1113    return 0;
1114}
1115
1116static int
1117_copy_characters(PyObject *to, Py_ssize_t to_start,
1118                 PyObject *from, Py_ssize_t from_start,
1119                 Py_ssize_t how_many, int check_maxchar)
1120{
1121    unsigned int from_kind, to_kind;
1122    void *from_data, *to_data;
1123    int fast;
1124
1125    assert(PyUnicode_Check(from));
1126    assert(PyUnicode_Check(to));
1127    assert(PyUnicode_IS_READY(from));
1128    assert(PyUnicode_IS_READY(to));
1129
1130    assert(PyUnicode_GET_LENGTH(from) >= how_many);
1131    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1132    assert(0 <= how_many);
1133
1134    if (how_many == 0)
1135        return 0;
1136
1137    from_kind = PyUnicode_KIND(from);
1138    from_data = PyUnicode_DATA(from);
1139    to_kind = PyUnicode_KIND(to);
1140    to_data = PyUnicode_DATA(to);
1141
1142#ifdef Py_DEBUG
1143    if (!check_maxchar
1144        && (from_kind > to_kind
1145            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
1146    {
1147        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1148        Py_UCS4 ch;
1149        Py_ssize_t i;
1150        for (i=0; i < how_many; i++) {
1151            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1152            assert(ch <= to_maxchar);
1153        }
1154    }
1155#endif
1156    fast = (from_kind == to_kind);
1157    if (check_maxchar
1158        && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1159    {
1160        /* deny latin1 => ascii */
1161        fast = 0;
1162    }
1163
1164    if (fast) {
1165        Py_MEMCPY((char*)to_data + to_kind * to_start,
1166                  (char*)from_data + from_kind * from_start,
1167                  to_kind * how_many);
1168    }
1169    else if (from_kind == PyUnicode_1BYTE_KIND
1170             && to_kind == PyUnicode_2BYTE_KIND)
1171    {
1172        _PyUnicode_CONVERT_BYTES(
1173            Py_UCS1, Py_UCS2,
1174            PyUnicode_1BYTE_DATA(from) + from_start,
1175            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1176            PyUnicode_2BYTE_DATA(to) + to_start
1177            );
1178    }
1179    else if (from_kind == PyUnicode_1BYTE_KIND
1180             && to_kind == PyUnicode_4BYTE_KIND)
1181    {
1182        _PyUnicode_CONVERT_BYTES(
1183            Py_UCS1, Py_UCS4,
1184            PyUnicode_1BYTE_DATA(from) + from_start,
1185            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1186            PyUnicode_4BYTE_DATA(to) + to_start
1187            );
1188    }
1189    else if (from_kind == PyUnicode_2BYTE_KIND
1190             && to_kind == PyUnicode_4BYTE_KIND)
1191    {
1192        _PyUnicode_CONVERT_BYTES(
1193            Py_UCS2, Py_UCS4,
1194            PyUnicode_2BYTE_DATA(from) + from_start,
1195            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1196            PyUnicode_4BYTE_DATA(to) + to_start
1197            );
1198    }
1199    else {
1200        /* check if max_char(from substring) <= max_char(to) */
1201        if (from_kind > to_kind
1202                /* latin1 => ascii */
1203            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1204        {
1205            /* slow path to check for character overflow */
1206            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1207            Py_UCS4 ch;
1208            Py_ssize_t i;
1209
1210#ifdef Py_DEBUG
1211            for (i=0; i < how_many; i++) {
1212                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1213                assert(ch <= to_maxchar);
1214                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1215            }
1216#else
1217            if (!check_maxchar) {
1218                for (i=0; i < how_many; i++) {
1219                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1220                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1221                }
1222            }
1223            else {
1224                for (i=0; i < how_many; i++) {
1225                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226                    if (ch > to_maxchar)
1227                        return 1;
1228                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1229                }
1230            }
1231#endif
1232        }
1233        else {
1234            assert(0 && "inconsistent state");
1235            return 1;
1236        }
1237    }
1238    return 0;
1239}
1240
1241static void
1242copy_characters(PyObject *to, Py_ssize_t to_start,
1243                       PyObject *from, Py_ssize_t from_start,
1244                       Py_ssize_t how_many)
1245{
1246    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1247}
1248
1249Py_ssize_t
1250PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1251                         PyObject *from, Py_ssize_t from_start,
1252                         Py_ssize_t how_many)
1253{
1254    int err;
1255
1256    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1257        PyErr_BadInternalCall();
1258        return -1;
1259    }
1260
1261    if (PyUnicode_READY(from))
1262        return -1;
1263    if (PyUnicode_READY(to))
1264        return -1;
1265
1266    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1267    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1268        PyErr_Format(PyExc_SystemError,
1269                     "Cannot write %zi characters at %zi "
1270                     "in a string of %zi characters",
1271                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1272        return -1;
1273    }
1274
1275    if (how_many == 0)
1276        return 0;
1277
1278    if (_PyUnicode_Dirty(to))
1279        return -1;
1280
1281    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1282    if (err) {
1283        PyErr_Format(PyExc_SystemError,
1284                     "Cannot copy %s characters "
1285                     "into a string of %s characters",
1286                     unicode_kind_name(from),
1287                     unicode_kind_name(to));
1288        return -1;
1289    }
1290    return how_many;
1291}
1292
1293/* Find the maximum code point and count the number of surrogate pairs so a
1294   correct string length can be computed before converting a string to UCS4.
1295   This function counts single surrogates as a character and not as a pair.
1296
1297   Return 0 on success, or -1 on error. */
1298static int
1299find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1300                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1301{
1302    const wchar_t *iter;
1303    Py_UCS4 ch;
1304
1305    assert(num_surrogates != NULL && maxchar != NULL);
1306    *num_surrogates = 0;
1307    *maxchar = 0;
1308
1309    for (iter = begin; iter < end; ) {
1310#if SIZEOF_WCHAR_T == 2
1311        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1312            && (iter+1) < end
1313            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1314        {
1315            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1316            ++(*num_surrogates);
1317            iter += 2;
1318        }
1319        else
1320#endif
1321        {
1322            ch = *iter;
1323            iter++;
1324        }
1325        if (ch > *maxchar) {
1326            *maxchar = ch;
1327            if (*maxchar > MAX_UNICODE) {
1328                PyErr_Format(PyExc_ValueError,
1329                             "character U+%x is not in range [U+0000; U+10ffff]",
1330                             ch);
1331                return -1;
1332            }
1333        }
1334    }
1335    return 0;
1336}
1337
1338#ifdef Py_DEBUG
1339static int unicode_ready_calls = 0;
1340#endif
1341
1342int
1343_PyUnicode_Ready(PyObject *unicode)
1344{
1345    wchar_t *end;
1346    Py_UCS4 maxchar = 0;
1347    Py_ssize_t num_surrogates;
1348#if SIZEOF_WCHAR_T == 2
1349    Py_ssize_t length_wo_surrogates;
1350#endif
1351
1352    /* _PyUnicode_Ready() is only intended for old-style API usage where
1353       strings were created using _PyObject_New() and where no canonical
1354       representation (the str field) has been set yet aka strings
1355       which are not yet ready. */
1356    assert(_PyUnicode_CHECK(unicode));
1357    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1358    assert(_PyUnicode_WSTR(unicode) != NULL);
1359    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1360    assert(_PyUnicode_UTF8(unicode) == NULL);
1361    /* Actually, it should neither be interned nor be anything else: */
1362    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1363
1364#ifdef Py_DEBUG
1365    ++unicode_ready_calls;
1366#endif
1367
1368    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1369    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1370                                &maxchar, &num_surrogates) == -1)
1371        return -1;
1372
1373    if (maxchar < 256) {
1374        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1375        if (!_PyUnicode_DATA_ANY(unicode)) {
1376            PyErr_NoMemory();
1377            return -1;
1378        }
1379        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1380                                _PyUnicode_WSTR(unicode), end,
1381                                PyUnicode_1BYTE_DATA(unicode));
1382        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1383        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1384        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1385        if (maxchar < 128) {
1386            _PyUnicode_STATE(unicode).ascii = 1;
1387            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1388            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389        }
1390        else {
1391            _PyUnicode_STATE(unicode).ascii = 0;
1392            _PyUnicode_UTF8(unicode) = NULL;
1393            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1394        }
1395        PyObject_FREE(_PyUnicode_WSTR(unicode));
1396        _PyUnicode_WSTR(unicode) = NULL;
1397        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1398    }
1399    /* In this case we might have to convert down from 4-byte native
1400       wchar_t to 2-byte unicode. */
1401    else if (maxchar < 65536) {
1402        assert(num_surrogates == 0 &&
1403               "FindMaxCharAndNumSurrogatePairs() messed up");
1404
1405#if SIZEOF_WCHAR_T == 2
1406        /* We can share representations and are done. */
1407        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1408        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1409        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1410        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1411        _PyUnicode_UTF8(unicode) = NULL;
1412        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1413#else
1414        /* sizeof(wchar_t) == 4 */
1415        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1416            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1417        if (!_PyUnicode_DATA_ANY(unicode)) {
1418            PyErr_NoMemory();
1419            return -1;
1420        }
1421        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1422                                _PyUnicode_WSTR(unicode), end,
1423                                PyUnicode_2BYTE_DATA(unicode));
1424        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1425        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1426        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1427        _PyUnicode_UTF8(unicode) = NULL;
1428        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1429        PyObject_FREE(_PyUnicode_WSTR(unicode));
1430        _PyUnicode_WSTR(unicode) = NULL;
1431        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1432#endif
1433    }
1434    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1435    else {
1436#if SIZEOF_WCHAR_T == 2
1437        /* in case the native representation is 2-bytes, we need to allocate a
1438           new normalized 4-byte version. */
1439        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1440        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1441        if (!_PyUnicode_DATA_ANY(unicode)) {
1442            PyErr_NoMemory();
1443            return -1;
1444        }
1445        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1446        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1447        _PyUnicode_UTF8(unicode) = NULL;
1448        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1449        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1450        _PyUnicode_STATE(unicode).ready = 1;
1451        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1452        PyObject_FREE(_PyUnicode_WSTR(unicode));
1453        _PyUnicode_WSTR(unicode) = NULL;
1454        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1455#else
1456        assert(num_surrogates == 0);
1457
1458        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1459        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1460        _PyUnicode_UTF8(unicode) = NULL;
1461        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1462        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1463#endif
1464        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1465    }
1466    _PyUnicode_STATE(unicode).ready = 1;
1467    assert(_PyUnicode_CheckConsistency(unicode, 1));
1468    return 0;
1469}
1470
1471static void
1472unicode_dealloc(register PyObject *unicode)
1473{
1474    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1475    case SSTATE_NOT_INTERNED:
1476        break;
1477
1478    case SSTATE_INTERNED_MORTAL:
1479        /* revive dead object temporarily for DelItem */
1480        Py_REFCNT(unicode) = 3;
1481        if (PyDict_DelItem(interned, unicode) != 0)
1482            Py_FatalError(
1483                "deletion of interned string failed");
1484        break;
1485
1486    case SSTATE_INTERNED_IMMORTAL:
1487        Py_FatalError("Immortal interned string died.");
1488
1489    default:
1490        Py_FatalError("Inconsistent interned string state.");
1491    }
1492
1493    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1494        PyObject_DEL(_PyUnicode_WSTR(unicode));
1495    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1496        PyObject_DEL(_PyUnicode_UTF8(unicode));
1497
1498    if (PyUnicode_IS_COMPACT(unicode)) {
1499        Py_TYPE(unicode)->tp_free(unicode);
1500    }
1501    else {
1502        if (_PyUnicode_DATA_ANY(unicode))
1503            PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1504        Py_TYPE(unicode)->tp_free(unicode);
1505    }
1506}
1507
1508#ifdef Py_DEBUG
1509static int
1510unicode_is_singleton(PyObject *unicode)
1511{
1512    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1513    if (unicode == unicode_empty)
1514        return 1;
1515    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1516    {
1517        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1518        if (ch < 256 && unicode_latin1[ch] == unicode)
1519            return 1;
1520    }
1521    return 0;
1522}
1523#endif
1524
1525static int
1526unicode_resizable(PyObject *unicode)
1527{
1528    if (Py_REFCNT(unicode) != 1)
1529        return 0;
1530    if (PyUnicode_CHECK_INTERNED(unicode))
1531        return 0;
1532#ifdef Py_DEBUG
1533    /* singleton refcount is greater than 1 */
1534    assert(!unicode_is_singleton(unicode));
1535#endif
1536    return 1;
1537}
1538
1539static int
1540unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1541{
1542    PyObject *unicode;
1543    Py_ssize_t old_length;
1544
1545    assert(p_unicode != NULL);
1546    unicode = *p_unicode;
1547
1548    assert(unicode != NULL);
1549    assert(PyUnicode_Check(unicode));
1550    assert(0 <= length);
1551
1552    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1553        old_length = PyUnicode_WSTR_LENGTH(unicode);
1554    else
1555        old_length = PyUnicode_GET_LENGTH(unicode);
1556    if (old_length == length)
1557        return 0;
1558
1559    if (length == 0) {
1560        Py_DECREF(*p_unicode);
1561        *p_unicode = unicode_empty;
1562        Py_INCREF(*p_unicode);
1563        return 0;
1564    }
1565
1566    if (!unicode_resizable(unicode)) {
1567        PyObject *copy = resize_copy(unicode, length);
1568        if (copy == NULL)
1569            return -1;
1570        Py_DECREF(*p_unicode);
1571        *p_unicode = copy;
1572        return 0;
1573    }
1574
1575    if (PyUnicode_IS_COMPACT(unicode)) {
1576        *p_unicode = resize_compact(unicode, length);
1577        if (*p_unicode == NULL)
1578            return -1;
1579        assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
1580        return 0;
1581    }
1582    return resize_inplace(unicode, length);
1583}
1584
1585int
1586PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1587{
1588    PyObject *unicode;
1589    if (p_unicode == NULL) {
1590        PyErr_BadInternalCall();
1591        return -1;
1592    }
1593    unicode = *p_unicode;
1594    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1595    {
1596        PyErr_BadInternalCall();
1597        return -1;
1598    }
1599    return unicode_resize(p_unicode, length);
1600}
1601
1602static int
1603unicode_widen(PyObject **p_unicode, unsigned int maxchar)
1604{
1605    PyObject *result;
1606    assert(PyUnicode_IS_READY(*p_unicode));
1607    if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1608        return 0;
1609    result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1610                           maxchar);
1611    if (result == NULL)
1612        return -1;
1613    PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1614                             PyUnicode_GET_LENGTH(*p_unicode));
1615    Py_DECREF(*p_unicode);
1616    *p_unicode = result;
1617    return 0;
1618}
1619
1620static int
1621unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1622                Py_UCS4 ch)
1623{
1624    if (unicode_widen(p_unicode, ch) < 0)
1625        return -1;
1626    PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1627                    PyUnicode_DATA(*p_unicode),
1628                    (*pos)++, ch);
1629    return 0;
1630}
1631
1632static PyObject*
1633get_latin1_char(unsigned char ch)
1634{
1635    PyObject *unicode = unicode_latin1[ch];
1636    if (!unicode) {
1637        unicode = PyUnicode_New(1, ch);
1638        if (!unicode)
1639            return NULL;
1640        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1641        assert(_PyUnicode_CheckConsistency(unicode, 1));
1642        unicode_latin1[ch] = unicode;
1643    }
1644    Py_INCREF(unicode);
1645    return unicode;
1646}
1647
1648PyObject *
1649PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1650{
1651    PyObject *unicode;
1652    Py_UCS4 maxchar = 0;
1653    Py_ssize_t num_surrogates;
1654
1655    if (u == NULL)
1656        return (PyObject*)_PyUnicode_New(size);
1657
1658    /* If the Unicode data is known at construction time, we can apply
1659       some optimizations which share commonly used objects. */
1660
1661    /* Optimization for empty strings */
1662    if (size == 0 && unicode_empty != NULL) {
1663        Py_INCREF(unicode_empty);
1664        return unicode_empty;
1665    }
1666
1667    /* Single character Unicode objects in the Latin-1 range are
1668       shared when using this constructor */
1669    if (size == 1 && *u < 256)
1670        return get_latin1_char((unsigned char)*u);
1671
1672    /* If not empty and not single character, copy the Unicode data
1673       into the new object */
1674    if (find_maxchar_surrogates(u, u + size,
1675                                &maxchar, &num_surrogates) == -1)
1676        return NULL;
1677
1678    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1679    if (!unicode)
1680        return NULL;
1681
1682    switch (PyUnicode_KIND(unicode)) {
1683    case PyUnicode_1BYTE_KIND:
1684        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1685                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1686        break;
1687    case PyUnicode_2BYTE_KIND:
1688#if Py_UNICODE_SIZE == 2
1689        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1690#else
1691        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1692                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1693#endif
1694        break;
1695    case PyUnicode_4BYTE_KIND:
1696#if SIZEOF_WCHAR_T == 2
1697        /* This is the only case which has to process surrogates, thus
1698           a simple copy loop is not enough and we need a function. */
1699        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1700#else
1701        assert(num_surrogates == 0);
1702        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1703#endif
1704        break;
1705    default:
1706        assert(0 && "Impossible state");
1707    }
1708
1709    return unicode_result(unicode);
1710}
1711
1712PyObject *
1713PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1714{
1715    if (size < 0) {
1716        PyErr_SetString(PyExc_SystemError,
1717                        "Negative size passed to PyUnicode_FromStringAndSize");
1718        return NULL;
1719    }
1720
1721    /* If the Unicode data is known at construction time, we can apply
1722       some optimizations which share commonly used objects.
1723       Also, this means the input must be UTF-8, so fall back to the
1724       UTF-8 decoder at the end. */
1725    if (u != NULL) {
1726
1727        /* Optimization for empty strings */
1728        if (size == 0 && unicode_empty != NULL) {
1729            Py_INCREF(unicode_empty);
1730            return unicode_empty;
1731        }
1732
1733        /* Single characters are shared when using this constructor.
1734           Restrict to ASCII, since the input must be UTF-8. */
1735        if (size == 1 && (unsigned char)*u < 128)
1736            return get_latin1_char((unsigned char)*u);
1737
1738        return PyUnicode_DecodeUTF8(u, size, NULL);
1739    }
1740
1741    return (PyObject *)_PyUnicode_New(size);
1742}
1743
1744PyObject *
1745PyUnicode_FromString(const char *u)
1746{
1747    size_t size = strlen(u);
1748    if (size > PY_SSIZE_T_MAX) {
1749        PyErr_SetString(PyExc_OverflowError, "input too long");
1750        return NULL;
1751    }
1752
1753    return PyUnicode_FromStringAndSize(u, size);
1754}
1755
1756PyObject *
1757_PyUnicode_FromId(_Py_Identifier *id)
1758{
1759    if (!id->object) {
1760        id->object = PyUnicode_FromString(id->string);
1761        if (!id->object)
1762            return NULL;
1763        PyUnicode_InternInPlace(&id->object);
1764        assert(!id->next);
1765        id->next = static_strings;
1766        static_strings = id;
1767    }
1768    return id->object;
1769}
1770
1771void
1772_PyUnicode_ClearStaticStrings()
1773{
1774    _Py_Identifier *i;
1775    for (i = static_strings; i; i = i->next) {
1776        Py_DECREF(i->object);
1777        i->object = NULL;
1778        i->next = NULL;
1779    }
1780}
1781
1782/* Internal function, don't check maximum character */
1783
1784static PyObject*
1785unicode_fromascii(const unsigned char* s, Py_ssize_t size)
1786{
1787    PyObject *res;
1788#ifdef Py_DEBUG
1789    const unsigned char *p;
1790    const unsigned char *end = s + size;
1791    for (p=s; p < end; p++) {
1792        assert(*p < 128);
1793    }
1794#endif
1795    if (size == 1)
1796        return get_latin1_char(s[0]);
1797    res = PyUnicode_New(size, 127);
1798    if (!res)
1799        return NULL;
1800    memcpy(PyUnicode_1BYTE_DATA(res), s, size);
1801    return res;
1802}
1803
1804static Py_UCS4
1805kind_maxchar_limit(unsigned int kind)
1806{
1807    switch(kind) {
1808    case PyUnicode_1BYTE_KIND:
1809        return 0x80;
1810    case PyUnicode_2BYTE_KIND:
1811        return 0x100;
1812    case PyUnicode_4BYTE_KIND:
1813        return 0x10000;
1814    default:
1815        assert(0 && "invalid kind");
1816        return MAX_UNICODE;
1817    }
1818}
1819
1820static PyObject*
1821_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1822{
1823    PyObject *res;
1824    unsigned char max_char;
1825
1826    if (size == 0) {
1827        Py_INCREF(unicode_empty);
1828        return unicode_empty;
1829    }
1830    assert(size > 0);
1831    if (size == 1)
1832        return get_latin1_char(u[0]);
1833
1834    max_char = ucs1lib_find_max_char(u, u + size);
1835    res = PyUnicode_New(size, max_char);
1836    if (!res)
1837        return NULL;
1838    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1839    assert(_PyUnicode_CheckConsistency(res, 1));
1840    return res;
1841}
1842
1843static PyObject*
1844_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1845{
1846    PyObject *res;
1847    Py_UCS2 max_char;
1848
1849    if (size == 0) {
1850        Py_INCREF(unicode_empty);
1851        return unicode_empty;
1852    }
1853    assert(size > 0);
1854    if (size == 1 && u[0] < 256)
1855        return get_latin1_char((unsigned char)u[0]);
1856
1857    max_char = ucs2lib_find_max_char(u, u + size);
1858    res = PyUnicode_New(size, max_char);
1859    if (!res)
1860        return NULL;
1861    if (max_char >= 256)
1862        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1863    else {
1864        _PyUnicode_CONVERT_BYTES(
1865            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1866    }
1867    assert(_PyUnicode_CheckConsistency(res, 1));
1868    return res;
1869}
1870
1871static PyObject*
1872_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1873{
1874    PyObject *res;
1875    Py_UCS4 max_char;
1876
1877    if (size == 0) {
1878        Py_INCREF(unicode_empty);
1879        return unicode_empty;
1880    }
1881    assert(size > 0);
1882    if (size == 1 && u[0] < 256)
1883        return get_latin1_char((unsigned char)u[0]);
1884
1885    max_char = ucs4lib_find_max_char(u, u + size);
1886    res = PyUnicode_New(size, max_char);
1887    if (!res)
1888        return NULL;
1889    if (max_char < 256)
1890        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1891                                 PyUnicode_1BYTE_DATA(res));
1892    else if (max_char < 0x10000)
1893        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1894                                 PyUnicode_2BYTE_DATA(res));
1895    else
1896        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1897    assert(_PyUnicode_CheckConsistency(res, 1));
1898    return res;
1899}
1900
1901PyObject*
1902PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1903{
1904    if (size < 0) {
1905        PyErr_SetString(PyExc_ValueError, "size must be positive");
1906        return NULL;
1907    }
1908    switch(kind) {
1909    case PyUnicode_1BYTE_KIND:
1910        return _PyUnicode_FromUCS1(buffer, size);
1911    case PyUnicode_2BYTE_KIND:
1912        return _PyUnicode_FromUCS2(buffer, size);
1913    case PyUnicode_4BYTE_KIND:
1914        return _PyUnicode_FromUCS4(buffer, size);
1915    default:
1916        PyErr_SetString(PyExc_SystemError, "invalid kind");
1917        return NULL;
1918    }
1919}
1920
1921/* Ensure that a string uses the most efficient storage, if it is not the
1922   case: create a new string with of the right kind. Write NULL into *p_unicode
1923   on error. */
1924static void
1925unicode_adjust_maxchar(PyObject **p_unicode)
1926{
1927    PyObject *unicode, *copy;
1928    Py_UCS4 max_char;
1929    Py_ssize_t len;
1930    unsigned int kind;
1931
1932    assert(p_unicode != NULL);
1933    unicode = *p_unicode;
1934    assert(PyUnicode_IS_READY(unicode));
1935    if (PyUnicode_IS_ASCII(unicode))
1936        return;
1937
1938    len = PyUnicode_GET_LENGTH(unicode);
1939    kind = PyUnicode_KIND(unicode);
1940    if (kind == PyUnicode_1BYTE_KIND) {
1941        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
1942        max_char = ucs1lib_find_max_char(u, u + len);
1943        if (max_char >= 128)
1944            return;
1945    }
1946    else if (kind == PyUnicode_2BYTE_KIND) {
1947        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
1948        max_char = ucs2lib_find_max_char(u, u + len);
1949        if (max_char >= 256)
1950            return;
1951    }
1952    else {
1953        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
1954        assert(kind == PyUnicode_4BYTE_KIND);
1955        max_char = ucs4lib_find_max_char(u, u + len);
1956        if (max_char >= 0x10000)
1957            return;
1958    }
1959    copy = PyUnicode_New(len, max_char);
1960    copy_characters(copy, 0, unicode, 0, len);
1961    Py_DECREF(unicode);
1962    *p_unicode = copy;
1963}
1964
1965PyObject*
1966PyUnicode_Copy(PyObject *unicode)
1967{
1968    Py_ssize_t length;
1969    PyObject *copy;
1970
1971    if (!PyUnicode_Check(unicode)) {
1972        PyErr_BadInternalCall();
1973        return NULL;
1974    }
1975    if (PyUnicode_READY(unicode))
1976        return NULL;
1977
1978    length = PyUnicode_GET_LENGTH(unicode);
1979    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1980    if (!copy)
1981        return NULL;
1982    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1983
1984    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1985              length * PyUnicode_KIND(unicode));
1986    assert(_PyUnicode_CheckConsistency(copy, 1));
1987    return copy;
1988}
1989
1990
1991/* Widen Unicode objects to larger buffers. Don't write terminating null
1992   character. Return NULL on error. */
1993
1994void*
1995_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1996{
1997    Py_ssize_t len;
1998    void *result;
1999    unsigned int skind;
2000
2001    if (PyUnicode_READY(s))
2002        return NULL;
2003
2004    len = PyUnicode_GET_LENGTH(s);
2005    skind = PyUnicode_KIND(s);
2006    if (skind >= kind) {
2007        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2008        return NULL;
2009    }
2010    switch(kind) {
2011    case PyUnicode_2BYTE_KIND:
2012        result = PyMem_Malloc(len * sizeof(Py_UCS2));
2013        if (!result)
2014            return PyErr_NoMemory();
2015        assert(skind == PyUnicode_1BYTE_KIND);
2016        _PyUnicode_CONVERT_BYTES(
2017            Py_UCS1, Py_UCS2,
2018            PyUnicode_1BYTE_DATA(s),
2019            PyUnicode_1BYTE_DATA(s) + len,
2020            result);
2021        return result;
2022    case PyUnicode_4BYTE_KIND:
2023        result = PyMem_Malloc(len * sizeof(Py_UCS4));
2024        if (!result)
2025            return PyErr_NoMemory();
2026        if (skind == PyUnicode_2BYTE_KIND) {
2027            _PyUnicode_CONVERT_BYTES(
2028                Py_UCS2, Py_UCS4,
2029                PyUnicode_2BYTE_DATA(s),
2030                PyUnicode_2BYTE_DATA(s) + len,
2031                result);
2032        }
2033        else {
2034            assert(skind == PyUnicode_1BYTE_KIND);
2035            _PyUnicode_CONVERT_BYTES(
2036                Py_UCS1, Py_UCS4,
2037                PyUnicode_1BYTE_DATA(s),
2038                PyUnicode_1BYTE_DATA(s) + len,
2039                result);
2040        }
2041        return result;
2042    default:
2043        break;
2044    }
2045    PyErr_SetString(PyExc_SystemError, "invalid kind");
2046    return NULL;
2047}
2048
2049static Py_UCS4*
2050as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2051        int copy_null)
2052{
2053    int kind;
2054    void *data;
2055    Py_ssize_t len, targetlen;
2056    if (PyUnicode_READY(string) == -1)
2057        return NULL;
2058    kind = PyUnicode_KIND(string);
2059    data = PyUnicode_DATA(string);
2060    len = PyUnicode_GET_LENGTH(string);
2061    targetlen = len;
2062    if (copy_null)
2063        targetlen++;
2064    if (!target) {
2065        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2066            PyErr_NoMemory();
2067            return NULL;
2068        }
2069        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2070        if (!target) {
2071            PyErr_NoMemory();
2072            return NULL;
2073        }
2074    }
2075    else {
2076        if (targetsize < targetlen) {
2077            PyErr_Format(PyExc_SystemError,
2078                         "string is longer than the buffer");
2079            if (copy_null && 0 < targetsize)
2080                target[0] = 0;
2081            return NULL;
2082        }
2083    }
2084    if (kind == PyUnicode_1BYTE_KIND) {
2085        Py_UCS1 *start = (Py_UCS1 *) data;
2086        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2087    }
2088    else if (kind == PyUnicode_2BYTE_KIND) {
2089        Py_UCS2 *start = (Py_UCS2 *) data;
2090        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2091    }
2092    else {
2093        assert(kind == PyUnicode_4BYTE_KIND);
2094        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2095    }
2096    if (copy_null)
2097        target[len] = 0;
2098    return target;
2099}
2100
2101Py_UCS4*
2102PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2103                 int copy_null)
2104{
2105    if (target == NULL || targetsize < 0) {
2106        PyErr_BadInternalCall();
2107        return NULL;
2108    }
2109    return as_ucs4(string, target, targetsize, copy_null);
2110}
2111
2112Py_UCS4*
2113PyUnicode_AsUCS4Copy(PyObject *string)
2114{
2115    return as_ucs4(string, NULL, 0, 1);
2116}
2117
2118#ifdef HAVE_WCHAR_H
2119
2120PyObject *
2121PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2122{
2123    if (w == NULL) {
2124        if (size == 0)
2125            return PyUnicode_New(0, 0);
2126        PyErr_BadInternalCall();
2127        return NULL;
2128    }
2129
2130    if (size == -1) {
2131        size = wcslen(w);
2132    }
2133
2134    return PyUnicode_FromUnicode(w, size);
2135}
2136
2137#endif /* HAVE_WCHAR_H */
2138
2139static void
2140makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2141        int zeropad, int width, int precision, char c)
2142{
2143    *fmt++ = '%';
2144    if (width) {
2145        if (zeropad)
2146            *fmt++ = '0';
2147        fmt += sprintf(fmt, "%d", width);
2148    }
2149    if (precision)
2150        fmt += sprintf(fmt, ".%d", precision);
2151    if (longflag)
2152        *fmt++ = 'l';
2153    else if (longlongflag) {
2154        /* longlongflag should only ever be nonzero on machines with
2155           HAVE_LONG_LONG defined */
2156#ifdef HAVE_LONG_LONG
2157        char *f = PY_FORMAT_LONG_LONG;
2158        while (*f)
2159            *fmt++ = *f++;
2160#else
2161        /* we shouldn't ever get here */
2162        assert(0);
2163        *fmt++ = 'l';
2164#endif
2165    }
2166    else if (size_tflag) {
2167        char *f = PY_FORMAT_SIZE_T;
2168        while (*f)
2169            *fmt++ = *f++;
2170    }
2171    *fmt++ = c;
2172    *fmt = '\0';
2173}
2174
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that sits between '%' and the conversion character.

   `f` must point at the '%'. The optional width, the optional ".precision"
   and the optional length modifier ("l", "ll" when HAVE_LONG_LONG is
   defined, or "z" -- each recognised only when directly followed by 'd',
   'u' or 'i') are consumed. Each result is stored through the matching
   output pointer unless that pointer is NULL; values that are absent are
   reported as 0.

   Return a pointer to the conversion character (or, for a truncated
   format, to the last character before the terminating '\0'). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' */
    f++;

    /* width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => step back so that f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format such as "%.1" => step back so that f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        /* size_t flag */
        size_tflag = 1;
        f += 1;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2239
2240/* maximum number of characters required for output of %ld.  21 characters
2241   allows for 64-bit integers (in decimal) and an optional sign. */
2242#define MAX_LONG_CHARS 21
2243/* maximum number of characters required for output of %lld.
2244   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2245   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2246#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2247
2248PyObject *
2249PyUnicode_FromFormatV(const char *format, va_list vargs)
2250{
2251    va_list count;
2252    Py_ssize_t callcount = 0;
2253    PyObject **callresults = NULL;
2254    PyObject **callresult = NULL;
2255    Py_ssize_t n = 0;
2256    int width = 0;
2257    int precision = 0;
2258    int zeropad;
2259    const char* f;
2260    PyObject *string;
2261    /* used by sprintf */
2262    char fmt[61]; /* should be enough for %0width.precisionlld */
2263    Py_UCS4 maxchar = 127; /* result is ASCII by default */
2264    Py_UCS4 argmaxchar;
2265    Py_ssize_t numbersize = 0;
2266    char *numberresults = NULL;
2267    char *numberresult = NULL;
2268    Py_ssize_t i;
2269    int kind;
2270    void *data;
2271
2272    Py_VA_COPY(count, vargs);
2273    /* step 1: count the number of %S/%R/%A/%s format specifications
2274     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2275     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2276     * result in an array)
2277     * also estimate a upper bound for all the number formats in the string,
2278     * numbers will be formatted in step 3 and be kept in a '\0'-separated
2279     * buffer before putting everything together. */
2280    for (f = format; *f; f++) {
2281        if (*f == '%') {
2282            int longlongflag;
2283            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2284            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2285            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2286                ++callcount;
2287
2288            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
2289#ifdef HAVE_LONG_LONG
2290                if (longlongflag) {
2291                    if (width < MAX_LONG_LONG_CHARS)
2292                        width = MAX_LONG_LONG_CHARS;
2293                }
2294                else
2295#endif
2296                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2297                       including sign.  Decimal takes the most space.  This
2298                       isn't enough for octal.  If a width is specified we
2299                       need more (which we allocate later). */
2300                    if (width < MAX_LONG_CHARS)
2301                        width = MAX_LONG_CHARS;
2302
2303                /* account for the size + '\0' to separate numbers
2304                   inside of the numberresults buffer */
2305                numbersize += (width + 1);
2306            }
2307        }
2308        else if ((unsigned char)*f > 127) {
2309            PyErr_Format(PyExc_ValueError,
2310                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2311                "string, got a non-ASCII byte: 0x%02x",
2312                (unsigned char)*f);
2313            return NULL;
2314        }
2315    }
2316    /* step 2: allocate memory for the results of
2317     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2318    if (callcount) {
2319        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2320        if (!callresults) {
2321            PyErr_NoMemory();
2322            return NULL;
2323        }
2324        callresult = callresults;
2325    }
2326    /* step 2.5: allocate memory for the results of formating numbers */
2327    if (numbersize) {
2328        numberresults = PyObject_Malloc(numbersize);
2329        if (!numberresults) {
2330            PyErr_NoMemory();
2331            goto fail;
2332        }
2333        numberresult = numberresults;
2334    }
2335
2336    /* step 3: format numbers and figure out how large a buffer we need */
2337    for (f = format; *f; f++) {
2338        if (*f == '%') {
2339            const char* p;
2340            int longflag;
2341            int longlongflag;
2342            int size_tflag;
2343            int numprinted;
2344
2345            p = f;
2346            zeropad = (f[1] == '0');
2347            f = parse_format_flags(f, &width, &precision,
2348                                   &longflag, &longlongflag, &size_tflag);
2349            switch (*f) {
2350            case 'c':
2351            {
2352                Py_UCS4 ordinal = va_arg(count, int);
2353                maxchar = Py_MAX(maxchar, ordinal);
2354                n++;
2355                break;
2356            }
2357            case '%':
2358                n++;
2359                break;
2360            case 'i':
2361            case 'd':
2362                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2363                        width, precision, *f);
2364                if (longflag)
2365                    numprinted = sprintf(numberresult, fmt,
2366                                         va_arg(count, long));
2367#ifdef HAVE_LONG_LONG
2368                else if (longlongflag)
2369                    numprinted = sprintf(numberresult, fmt,
2370                                         va_arg(count, PY_LONG_LONG));
2371#endif
2372                else if (size_tflag)
2373                    numprinted = sprintf(numberresult, fmt,
2374                                         va_arg(count, Py_ssize_t));
2375                else
2376                    numprinted = sprintf(numberresult, fmt,
2377                                         va_arg(count, int));
2378                n += numprinted;
2379                /* advance by +1 to skip over the '\0' */
2380                numberresult += (numprinted + 1);
2381                assert(*(numberresult - 1) == '\0');
2382                assert(*(numberresult - 2) != '\0');
2383                assert(numprinted >= 0);
2384                assert(numberresult <= numberresults + numbersize);
2385                break;
2386            case 'u':
2387                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2388                        width, precision, 'u');
2389                if (longflag)
2390                    numprinted = sprintf(numberresult, fmt,
2391                                         va_arg(count, unsigned long));
2392#ifdef HAVE_LONG_LONG
2393                else if (longlongflag)
2394                    numprinted = sprintf(numberresult, fmt,
2395                                         va_arg(count, unsigned PY_LONG_LONG));
2396#endif
2397                else if (size_tflag)
2398                    numprinted = sprintf(numberresult, fmt,
2399                                         va_arg(count, size_t));
2400                else
2401                    numprinted = sprintf(numberresult, fmt,
2402                                         va_arg(count, unsigned int));
2403                n += numprinted;
2404                numberresult += (numprinted + 1);
2405                assert(*(numberresult - 1) == '\0');
2406                assert(*(numberresult - 2) != '\0');
2407                assert(numprinted >= 0);
2408                assert(numberresult <= numberresults + numbersize);
2409                break;
2410            case 'x':
2411                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2412                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2413                n += numprinted;
2414                numberresult += (numprinted + 1);
2415                assert(*(numberresult - 1) == '\0');
2416                assert(*(numberresult - 2) != '\0');
2417                assert(numprinted >= 0);
2418                assert(numberresult <= numberresults + numbersize);
2419                break;
2420            case 'p':
2421                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2422                /* %p is ill-defined:  ensure leading 0x. */
2423                if (numberresult[1] == 'X')
2424                    numberresult[1] = 'x';
2425                else if (numberresult[1] != 'x') {
2426                    memmove(numberresult + 2, numberresult,
2427                            strlen(numberresult) + 1);
2428                    numberresult[0] = '0';
2429                    numberresult[1] = 'x';
2430                    numprinted += 2;
2431                }
2432                n += numprinted;
2433                numberresult += (numprinted + 1);
2434                assert(*(numberresult - 1) == '\0');
2435                assert(*(numberresult - 2) != '\0');
2436                assert(numprinted >= 0);
2437                assert(numberresult <= numberresults + numbersize);
2438                break;
2439            case 's':
2440            {
2441                /* UTF-8 */
2442                const char *s = va_arg(count, const char*);
2443                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2444                if (!str)
2445                    goto fail;
2446                /* since PyUnicode_DecodeUTF8 returns already flexible
2447                   unicode objects, there is no need to call ready on them */
2448                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2449                maxchar = Py_MAX(maxchar, argmaxchar);
2450                n += PyUnicode_GET_LENGTH(str);
2451                /* Remember the str and switch to the next slot */
2452                *callresult++ = str;
2453                break;
2454            }
2455            case 'U':
2456            {
2457                PyObject *obj = va_arg(count, PyObject *);
2458                assert(obj && _PyUnicode_CHECK(obj));
2459                if (PyUnicode_READY(obj) == -1)
2460                    goto fail;
2461                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2462                maxchar = Py_MAX(maxchar, argmaxchar);
2463                n += PyUnicode_GET_LENGTH(obj);
2464                break;
2465            }
2466            case 'V':
2467            {
2468                PyObject *obj = va_arg(count, PyObject *);
2469                const char *str = va_arg(count, const char *);
2470                PyObject *str_obj;
2471                assert(obj || str);
2472                assert(!obj || _PyUnicode_CHECK(obj));
2473                if (obj) {
2474                    if (PyUnicode_READY(obj) == -1)
2475                        goto fail;
2476                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2477                    maxchar = Py_MAX(maxchar, argmaxchar);
2478                    n += PyUnicode_GET_LENGTH(obj);
2479                    *callresult++ = NULL;
2480                }
2481                else {
2482                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2483                    if (!str_obj)
2484                        goto fail;
2485                    if (PyUnicode_READY(str_obj)) {
2486                        Py_DECREF(str_obj);
2487                        goto fail;
2488                    }
2489                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2490                    maxchar = Py_MAX(maxchar, argmaxchar);
2491                    n += PyUnicode_GET_LENGTH(str_obj);
2492                    *callresult++ = str_obj;
2493                }
2494                break;
2495            }
2496            case 'S':
2497            {
2498                PyObject *obj = va_arg(count, PyObject *);
2499                PyObject *str;
2500                assert(obj);
2501                str = PyObject_Str(obj);
2502                if (!str || PyUnicode_READY(str) == -1)
2503                    goto fail;
2504                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2505                maxchar = Py_MAX(maxchar, argmaxchar);
2506                n += PyUnicode_GET_LENGTH(str);
2507                /* Remember the str and switch to the next slot */
2508                *callresult++ = str;
2509                break;
2510            }
2511            case 'R':
2512            {
2513                PyObject *obj = va_arg(count, PyObject *);
2514                PyObject *repr;
2515                assert(obj);
2516                repr = PyObject_Repr(obj);
2517                if (!repr || PyUnicode_READY(repr) == -1)
2518                    goto fail;
2519                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2520                maxchar = Py_MAX(maxchar, argmaxchar);
2521                n += PyUnicode_GET_LENGTH(repr);
2522                /* Remember the repr and switch to the next slot */
2523                *callresult++ = repr;
2524                break;
2525            }
2526            case 'A':
2527            {
2528                PyObject *obj = va_arg(count, PyObject *);
2529                PyObject *ascii;
2530                assert(obj);
2531                ascii = PyObject_ASCII(obj);
2532                if (!ascii || PyUnicode_READY(ascii) == -1)
2533                    goto fail;
2534                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2535                maxchar = Py_MAX(maxchar, argmaxchar);
2536                n += PyUnicode_GET_LENGTH(ascii);
2537                /* Remember the repr and switch to the next slot */
2538                *callresult++ = ascii;
2539                break;
2540            }
2541            default:
2542                /* if we stumble upon an unknown
2543                   formatting code, copy the rest of
2544                   the format string to the output
2545                   string. (we cannot just skip the
2546                   code, since there's no way to know
2547                   what's in the argument list) */
2548                n += strlen(p);
2549                goto expand;
2550            }
2551        } else
2552            n++;
2553    }
2554  expand:
2555    /* step 4: fill the buffer */
2556    /* Since we've analyzed how much space we need,
2557       we don't have to resize the string.
2558       There can be no errors beyond this point. */
2559    string = PyUnicode_New(n, maxchar);
2560    if (!string)
2561        goto fail;
2562    kind = PyUnicode_KIND(string);
2563    data = PyUnicode_DATA(string);
2564    callresult = callresults;
2565    numberresult = numberresults;
2566
2567    for (i = 0, f = format; *f; f++) {
2568        if (*f == '%') {
2569            const char* p;
2570
2571            p = f;
2572            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2573            /* checking for == because the last argument could be a empty
2574               string, which causes i to point to end, the assert at the end of
2575               the loop */
2576            assert(i <= PyUnicode_GET_LENGTH(string));
2577
2578            switch (*f) {
2579            case 'c':
2580            {
2581                const int ordinal = va_arg(vargs, int);
2582                PyUnicode_WRITE(kind, data, i++, ordinal);
2583                break;
2584            }
2585            case 'i':
2586            case 'd':
2587            case 'u':
2588            case 'x':
2589            case 'p':
2590                /* unused, since we already have the result */
2591                if (*f == 'p')
2592                    (void) va_arg(vargs, void *);
2593                else
2594                    (void) va_arg(vargs, int);
2595                /* extract the result from numberresults and append. */
2596                for (; *numberresult; ++i, ++numberresult)
2597                    PyUnicode_WRITE(kind, data, i, *numberresult);
2598                /* skip over the separating '\0' */
2599                assert(*numberresult == '\0');
2600                numberresult++;
2601                assert(numberresult <= numberresults + numbersize);
2602                break;
2603            case 's':
2604            {
2605                /* unused, since we already have the result */
2606                Py_ssize_t size;
2607                (void) va_arg(vargs, char *);
2608                size = PyUnicode_GET_LENGTH(*callresult);
2609                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2610                copy_characters(string, i, *callresult, 0, size);
2611                i += size;
2612                /* We're done with the unicode()/repr() => forget it */
2613                Py_DECREF(*callresult);
2614                /* switch to next unicode()/repr() result */
2615                ++callresult;
2616                break;
2617            }
2618            case 'U':
2619            {
2620                PyObject *obj = va_arg(vargs, PyObject *);
2621                Py_ssize_t size;
2622                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2623                size = PyUnicode_GET_LENGTH(obj);
2624                copy_characters(string, i, obj, 0, size);
2625                i += size;
2626                break;
2627            }
2628            case 'V':
2629            {
2630                Py_ssize_t size;
2631                PyObject *obj = va_arg(vargs, PyObject *);
2632                va_arg(vargs, const char *);
2633                if (obj) {
2634                    size = PyUnicode_GET_LENGTH(obj);
2635                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2636                    copy_characters(string, i, obj, 0, size);
2637                    i += size;
2638                } else {
2639                    size = PyUnicode_GET_LENGTH(*callresult);
2640                    assert(PyUnicode_KIND(*callresult) <=
2641                           PyUnicode_KIND(string));
2642                    copy_characters(string, i, *callresult, 0, size);
2643                    i += size;
2644                    Py_DECREF(*callresult);
2645                }
2646                ++callresult;
2647                break;
2648            }
2649            case 'S':
2650            case 'R':
2651            case 'A':
2652            {
2653                Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2654                /* unused, since we already have the result */
2655                (void) va_arg(vargs, PyObject *);
2656                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2657                copy_characters(string, i, *callresult, 0,  size);
2658                i += size;
2659                /* We're done with the unicode()/repr() => forget it */
2660                Py_DECREF(*callresult);
2661                /* switch to next unicode()/repr() result */
2662                ++callresult;
2663                break;
2664            }
2665            case '%':
2666                PyUnicode_WRITE(kind, data, i++, '%');
2667                break;
2668            default:
2669                for (; *p; ++p, ++i)
2670                    PyUnicode_WRITE(kind, data, i, *p);
2671                assert(i == PyUnicode_GET_LENGTH(string));
2672                goto end;
2673            }
2674        }
2675        else {
2676            assert(i < PyUnicode_GET_LENGTH(string));
2677            PyUnicode_WRITE(kind, data, i++, *f);
2678        }
2679    }
2680    assert(i == PyUnicode_GET_LENGTH(string));
2681
2682  end:
2683    if (callresults)
2684        PyObject_Free(callresults);
2685    if (numberresults)
2686        PyObject_Free(numberresults);
2687    return unicode_result(string);
2688  fail:
2689    if (callresults) {
2690        PyObject **callresult2 = callresults;
2691        while (callresult2 < callresult) {
2692            Py_XDECREF(*callresult2);
2693            ++callresult2;
2694        }
2695        PyObject_Free(callresults);
2696    }
2697    if (numberresults)
2698        PyObject_Free(numberresults);
2699    return NULL;
2700}
2701
2702PyObject *
2703PyUnicode_FromFormat(const char *format, ...)
2704{
2705    PyObject* ret;
2706    va_list vargs;
2707
2708#ifdef HAVE_STDARG_PROTOTYPES
2709    va_start(vargs, format);
2710#else
2711    va_start(vargs);
2712#endif
2713    ret = PyUnicode_FromFormatV(format, vargs);
2714    va_end(vargs);
2715    return ret;
2716}
2717
2718#ifdef HAVE_WCHAR_H
2719
2720/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2721   convert a Unicode object to a wide character string.
2722
2723   - If w is NULL: return the number of wide characters (including the null
2724     character) required to convert the unicode object. Ignore size argument.
2725
2726   - Otherwise: return the number of wide characters (excluding the null
2727     character) written into w. Write at most size wide characters (including
2728     the null character). */
2729static Py_ssize_t
2730unicode_aswidechar(PyObject *unicode,
2731                   wchar_t *w,
2732                   Py_ssize_t size)
2733{
2734    Py_ssize_t res;
2735    const wchar_t *wstr;
2736
2737    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2738    if (wstr == NULL)
2739        return -1;
2740
2741    if (w != NULL) {
2742        if (size > res)
2743            size = res + 1;
2744        else
2745            res = size;
2746        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2747        return res;
2748    }
2749    else
2750        return res + 1;
2751}
2752
2753Py_ssize_t
2754PyUnicode_AsWideChar(PyObject *unicode,
2755                     wchar_t *w,
2756                     Py_ssize_t size)
2757{
2758    if (unicode == NULL) {
2759        PyErr_BadInternalCall();
2760        return -1;
2761    }
2762    return unicode_aswidechar(unicode, w, size);
2763}
2764
2765wchar_t*
2766PyUnicode_AsWideCharString(PyObject *unicode,
2767                           Py_ssize_t *size)
2768{
2769    wchar_t* buffer;
2770    Py_ssize_t buflen;
2771
2772    if (unicode == NULL) {
2773        PyErr_BadInternalCall();
2774        return NULL;
2775    }
2776
2777    buflen = unicode_aswidechar(unicode, NULL, 0);
2778    if (buflen == -1)
2779        return NULL;
2780    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2781        PyErr_NoMemory();
2782        return NULL;
2783    }
2784
2785    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2786    if (buffer == NULL) {
2787        PyErr_NoMemory();
2788        return NULL;
2789    }
2790    buflen = unicode_aswidechar(unicode, buffer, buflen);
2791    if (buflen == -1)
2792        return NULL;
2793    if (size != NULL)
2794        *size = buflen;
2795    return buffer;
2796}
2797
2798#endif /* HAVE_WCHAR_H */
2799
2800PyObject *
2801PyUnicode_FromOrdinal(int ordinal)
2802{
2803    PyObject *v;
2804    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2805        PyErr_SetString(PyExc_ValueError,
2806                        "chr() arg not in range(0x110000)");
2807        return NULL;
2808    }
2809
2810    if (ordinal < 256)
2811        return get_latin1_char(ordinal);
2812
2813    v = PyUnicode_New(1, ordinal);
2814    if (v == NULL)
2815        return NULL;
2816    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2817    assert(_PyUnicode_CheckConsistency(v, 1));
2818    return v;
2819}
2820
2821PyObject *
2822PyUnicode_FromObject(register PyObject *obj)
2823{
2824    /* XXX Perhaps we should make this API an alias of
2825       PyObject_Str() instead ?! */
2826    if (PyUnicode_CheckExact(obj)) {
2827        if (PyUnicode_READY(obj))
2828            return NULL;
2829        Py_INCREF(obj);
2830        return obj;
2831    }
2832    if (PyUnicode_Check(obj)) {
2833        /* For a Unicode subtype that's not a Unicode object,
2834           return a true Unicode object with the same data. */
2835        return PyUnicode_Copy(obj);
2836    }
2837    PyErr_Format(PyExc_TypeError,
2838                 "Can't convert '%.100s' object to str implicitly",
2839                 Py_TYPE(obj)->tp_name);
2840    return NULL;
2841}
2842
2843PyObject *
2844PyUnicode_FromEncodedObject(register PyObject *obj,
2845                            const char *encoding,
2846                            const char *errors)
2847{
2848    Py_buffer buffer;
2849    PyObject *v;
2850
2851    if (obj == NULL) {
2852        PyErr_BadInternalCall();
2853        return NULL;
2854    }
2855
2856    /* Decoding bytes objects is the most common case and should be fast */
2857    if (PyBytes_Check(obj)) {
2858        if (PyBytes_GET_SIZE(obj) == 0) {
2859            Py_INCREF(unicode_empty);
2860            v = unicode_empty;
2861        }
2862        else {
2863            v = PyUnicode_Decode(
2864                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2865                    encoding, errors);
2866        }
2867        return v;
2868    }
2869
2870    if (PyUnicode_Check(obj)) {
2871        PyErr_SetString(PyExc_TypeError,
2872                        "decoding str is not supported");
2873        return NULL;
2874    }
2875
2876    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2877    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2878        PyErr_Format(PyExc_TypeError,
2879                     "coercing to str: need bytes, bytearray "
2880                     "or buffer-like object, %.80s found",
2881                     Py_TYPE(obj)->tp_name);
2882        return NULL;
2883    }
2884
2885    if (buffer.len == 0) {
2886        Py_INCREF(unicode_empty);
2887        v = unicode_empty;
2888    }
2889    else
2890        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2891
2892    PyBuffer_Release(&buffer);
2893    return v;
2894}
2895
/* Normalize an encoding name so that spellings such as "UTF_8" can be
   matched: upper-case letters are converted to lower case and '_' is
   replaced with '-'. A NULL encoding normalizes to "utf-8".

   encoding:  name to normalize, or NULL for the default.
   lower:     output buffer receiving the null-terminated result.
   lower_len: size of `lower` in bytes, terminating null included.

   Return 1 on success, 0 if the normalized name (including its null
   terminator) does not fit into `lower`. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* Never write the default name past the end of a short buffer. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    /* A zero-sized buffer cannot even hold the terminator; bail out before
       computing lower_len - 1, which would wrap around. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2932
2933PyObject *
2934PyUnicode_Decode(const char *s,
2935                 Py_ssize_t size,
2936                 const char *encoding,
2937                 const char *errors)
2938{
2939    PyObject *buffer = NULL, *unicode;
2940    Py_buffer info;
2941    char lower[11];  /* Enough for any encoding shortcut */
2942
2943    /* Shortcuts for common default encodings */
2944    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2945        if ((strcmp(lower, "utf-8") == 0) ||
2946            (strcmp(lower, "utf8") == 0))
2947            return PyUnicode_DecodeUTF8(s, size, errors);
2948        else if ((strcmp(lower, "latin-1") == 0) ||
2949                 (strcmp(lower, "latin1") == 0) ||
2950                 (strcmp(lower, "iso-8859-1") == 0))
2951            return PyUnicode_DecodeLatin1(s, size, errors);
2952#ifdef HAVE_MBCS
2953        else if (strcmp(lower, "mbcs") == 0)
2954            return PyUnicode_DecodeMBCS(s, size, errors);
2955#endif
2956        else if (strcmp(lower, "ascii") == 0)
2957            return PyUnicode_DecodeASCII(s, size, errors);
2958        else if (strcmp(lower, "utf-16") == 0)
2959            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2960        else if (strcmp(lower, "utf-32") == 0)
2961            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2962    }
2963
2964    /* Decode via the codec registry */
2965    buffer = NULL;
2966    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2967        goto onError;
2968    buffer = PyMemoryView_FromBuffer(&info);
2969    if (buffer == NULL)
2970        goto onError;
2971    unicode = PyCodec_Decode(buffer, encoding, errors);
2972    if (unicode == NULL)
2973        goto onError;
2974    if (!PyUnicode_Check(unicode)) {
2975        PyErr_Format(PyExc_TypeError,
2976                     "decoder did not return a str object (type=%.400s)",
2977                     Py_TYPE(unicode)->tp_name);
2978        Py_DECREF(unicode);
2979        goto onError;
2980    }
2981    Py_DECREF(buffer);
2982    return unicode_result(unicode);
2983
2984  onError:
2985    Py_XDECREF(buffer);
2986    return NULL;
2987}
2988
2989PyObject *
2990PyUnicode_AsDecodedObject(PyObject *unicode,
2991                          const char *encoding,
2992                          const char *errors)
2993{
2994    PyObject *v;
2995
2996    if (!PyUnicode_Check(unicode)) {
2997        PyErr_BadArgument();
2998        goto onError;
2999    }
3000
3001    if (encoding == NULL)
3002        encoding = PyUnicode_GetDefaultEncoding();
3003
3004    /* Decode via the codec registry */
3005    v = PyCodec_Decode(unicode, encoding, errors);
3006    if (v == NULL)
3007        goto onError;
3008    return unicode_result(v);
3009
3010  onError:
3011    return NULL;
3012}
3013
3014PyObject *
3015PyUnicode_AsDecodedUnicode(PyObject *unicode,
3016                           const char *encoding,
3017                           const char *errors)
3018{
3019    PyObject *v;
3020
3021    if (!PyUnicode_Check(unicode)) {
3022        PyErr_BadArgument();
3023        goto onError;
3024    }
3025
3026    if (encoding == NULL)
3027        encoding = PyUnicode_GetDefaultEncoding();
3028
3029    /* Decode via the codec registry */
3030    v = PyCodec_Decode(unicode, encoding, errors);
3031    if (v == NULL)
3032        goto onError;
3033    if (!PyUnicode_Check(v)) {
3034        PyErr_Format(PyExc_TypeError,
3035                     "decoder did not return a str object (type=%.400s)",
3036                     Py_TYPE(v)->tp_name);
3037        Py_DECREF(v);
3038        goto onError;
3039    }
3040    return unicode_result(v);
3041
3042  onError:
3043    return NULL;
3044}
3045
3046PyObject *
3047PyUnicode_Encode(const Py_UNICODE *s,
3048                 Py_ssize_t size,
3049                 const char *encoding,
3050                 const char *errors)
3051{
3052    PyObject *v, *unicode;
3053
3054    unicode = PyUnicode_FromUnicode(s, size);
3055    if (unicode == NULL)
3056        return NULL;
3057    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3058    Py_DECREF(unicode);
3059    return v;
3060}
3061
3062PyObject *
3063PyUnicode_AsEncodedObject(PyObject *unicode,
3064                          const char *encoding,
3065                          const char *errors)
3066{
3067    PyObject *v;
3068
3069    if (!PyUnicode_Check(unicode)) {
3070        PyErr_BadArgument();
3071        goto onError;
3072    }
3073
3074    if (encoding == NULL)
3075        encoding = PyUnicode_GetDefaultEncoding();
3076
3077    /* Encode via the codec registry */
3078    v = PyCodec_Encode(unicode, encoding, errors);
3079    if (v == NULL)
3080        goto onError;
3081    return v;
3082
3083  onError:
3084    return NULL;
3085}
3086
3087PyObject *
3088PyUnicode_EncodeFSDefault(PyObject *unicode)
3089{
3090#ifdef HAVE_MBCS
3091    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3092#elif defined(__APPLE__)
3093    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3094#else
3095    PyInterpreterState *interp = PyThreadState_GET()->interp;
3096    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3097       cannot use it to encode and decode filenames before it is loaded. Load
3098       the Python codec requires to encode at least its own filename. Use the C
3099       version of the locale codec until the codec registry is initialized and
3100       the Python codec is loaded.
3101
3102       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3103       cannot only rely on it: check also interp->fscodec_initialized for
3104       subinterpreters. */
3105    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3106        return PyUnicode_AsEncodedString(unicode,
3107                                         Py_FileSystemDefaultEncoding,
3108                                         "surrogateescape");
3109    }
3110    else {
3111        /* locale encoding with surrogateescape */
3112        wchar_t *wchar;
3113        char *bytes;
3114        PyObject *bytes_obj;
3115        size_t error_pos;
3116
3117        wchar = PyUnicode_AsWideCharString(unicode, NULL);
3118        if (wchar == NULL)
3119            return NULL;
3120        bytes = _Py_wchar2char(wchar, &error_pos);
3121        if (bytes == NULL) {
3122            if (error_pos != (size_t)-1) {
3123                char *errmsg = strerror(errno);
3124                PyObject *exc = NULL;
3125                if (errmsg == NULL)
3126                    errmsg = "Py_wchar2char() failed";
3127                raise_encode_exception(&exc,
3128                    "filesystemencoding", unicode,
3129                    error_pos, error_pos+1,
3130                    errmsg);
3131                Py_XDECREF(exc);
3132            }
3133            else
3134                PyErr_NoMemory();
3135            PyMem_Free(wchar);
3136            return NULL;
3137        }
3138        PyMem_Free(wchar);
3139
3140        bytes_obj = PyBytes_FromString(bytes);
3141        PyMem_Free(bytes);
3142        return bytes_obj;
3143    }
3144#endif
3145}
3146
3147PyObject *
3148PyUnicode_AsEncodedString(PyObject *unicode,
3149                          const char *encoding,
3150                          const char *errors)
3151{
3152    PyObject *v;
3153    char lower[11];  /* Enough for any encoding shortcut */
3154
3155    if (!PyUnicode_Check(unicode)) {
3156        PyErr_BadArgument();
3157        return NULL;
3158    }
3159
3160    /* Shortcuts for common default encodings */
3161    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3162        if ((strcmp(lower, "utf-8") == 0) ||
3163            (strcmp(lower, "utf8") == 0))
3164        {
3165            if (errors == NULL || strcmp(errors, "strict") == 0)
3166                return _PyUnicode_AsUTF8String(unicode, NULL);
3167            else
3168                return _PyUnicode_AsUTF8String(unicode, errors);
3169        }
3170        else if ((strcmp(lower, "latin-1") == 0) ||
3171                 (strcmp(lower, "latin1") == 0) ||
3172                 (strcmp(lower, "iso-8859-1") == 0))
3173            return _PyUnicode_AsLatin1String(unicode, errors);
3174#ifdef HAVE_MBCS
3175        else if (strcmp(lower, "mbcs") == 0)
3176            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3177#endif
3178        else if (strcmp(lower, "ascii") == 0)
3179            return _PyUnicode_AsASCIIString(unicode, errors);
3180    }
3181
3182    /* Encode via the codec registry */
3183    v = PyCodec_Encode(unicode, encoding, errors);
3184    if (v == NULL)
3185        return NULL;
3186
3187    /* The normal path */
3188    if (PyBytes_Check(v))
3189        return v;
3190
3191    /* If the codec returns a buffer, raise a warning and convert to bytes */
3192    if (PyByteArray_Check(v)) {
3193        int error;
3194        PyObject *b;
3195
3196        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3197            "encoder %s returned bytearray instead of bytes",
3198            encoding);
3199        if (error) {
3200            Py_DECREF(v);
3201            return NULL;
3202        }
3203
3204        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3205        Py_DECREF(v);
3206        return b;
3207    }
3208
3209    PyErr_Format(PyExc_TypeError,
3210                 "encoder did not return a bytes object (type=%.400s)",
3211                 Py_TYPE(v)->tp_name);
3212    Py_DECREF(v);
3213    return NULL;
3214}
3215
3216PyObject *
3217PyUnicode_AsEncodedUnicode(PyObject *unicode,
3218                           const char *encoding,
3219                           const char *errors)
3220{
3221    PyObject *v;
3222
3223    if (!PyUnicode_Check(unicode)) {
3224        PyErr_BadArgument();
3225        goto onError;
3226    }
3227
3228    if (encoding == NULL)
3229        encoding = PyUnicode_GetDefaultEncoding();
3230
3231    /* Encode via the codec registry */
3232    v = PyCodec_Encode(unicode, encoding, errors);
3233    if (v == NULL)
3234        goto onError;
3235    if (!PyUnicode_Check(v)) {
3236        PyErr_Format(PyExc_TypeError,
3237                     "encoder did not return an str object (type=%.400s)",
3238                     Py_TYPE(v)->tp_name);
3239        Py_DECREF(v);
3240        goto onError;
3241    }
3242    return v;
3243
3244  onError:
3245    return NULL;
3246}
3247
3248PyObject*
3249PyUnicode_DecodeFSDefault(const char *s) {
3250    Py_ssize_t size = (Py_ssize_t)strlen(s);
3251    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3252}
3253
3254PyObject*
3255PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3256{
3257#ifdef HAVE_MBCS
3258    return PyUnicode_DecodeMBCS(s, size, NULL);
3259#elif defined(__APPLE__)
3260    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3261#else
3262    PyInterpreterState *interp = PyThreadState_GET()->interp;
3263    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3264       cannot use it to encode and decode filenames before it is loaded. Load
3265       the Python codec requires to encode at least its own filename. Use the C
3266       version of the locale codec until the codec registry is initialized and
3267       the Python codec is loaded.
3268
3269       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3270       cannot only rely on it: check also interp->fscodec_initialized for
3271       subinterpreters. */
3272    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3273        return PyUnicode_Decode(s, size,
3274                                Py_FileSystemDefaultEncoding,
3275                                "surrogateescape");
3276    }
3277    else {
3278        /* locale encoding with surrogateescape */
3279        wchar_t *wchar;
3280        PyObject *unicode;
3281        size_t len;
3282
3283        if (s[size] != '\0' || size != strlen(s)) {
3284            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3285            return NULL;
3286        }
3287
3288        wchar = _Py_char2wchar(s, &len);
3289        if (wchar == NULL)
3290            return PyErr_NoMemory();
3291
3292        unicode = PyUnicode_FromWideChar(wchar, len);
3293        PyMem_Free(wchar);
3294        return unicode;
3295    }
3296#endif
3297}
3298
3299
3300int
3301PyUnicode_FSConverter(PyObject* arg, void* addr)
3302{
3303    PyObject *output = NULL;
3304    Py_ssize_t size;
3305    void *data;
3306    if (arg == NULL) {
3307        Py_DECREF(*(PyObject**)addr);
3308        return 1;
3309    }
3310    if (PyBytes_Check(arg)) {
3311        output = arg;
3312        Py_INCREF(output);
3313    }
3314    else {
3315        arg = PyUnicode_FromObject(arg);
3316        if (!arg)
3317            return 0;
3318        output = PyUnicode_EncodeFSDefault(arg);
3319        Py_DECREF(arg);
3320        if (!output)
3321            return 0;
3322        if (!PyBytes_Check(output)) {
3323            Py_DECREF(output);
3324            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3325            return 0;
3326        }
3327    }
3328    size = PyBytes_GET_SIZE(output);
3329    data = PyBytes_AS_STRING(output);
3330    if (size != strlen(data)) {
3331        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3332        Py_DECREF(output);
3333        return 0;
3334    }
3335    *(PyObject**)addr = output;
3336    return Py_CLEANUP_SUPPORTED;
3337}
3338
3339
3340int
3341PyUnicode_FSDecoder(PyObject* arg, void* addr)
3342{
3343    PyObject *output = NULL;
3344    if (arg == NULL) {
3345        Py_DECREF(*(PyObject**)addr);
3346        return 1;
3347    }
3348    if (PyUnicode_Check(arg)) {
3349        if (PyUnicode_READY(arg))
3350            return 0;
3351        output = arg;
3352        Py_INCREF(output);
3353    }
3354    else {
3355        arg = PyBytes_FromObject(arg);
3356        if (!arg)
3357            return 0;
3358        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3359                                                  PyBytes_GET_SIZE(arg));
3360        Py_DECREF(arg);
3361        if (!output)
3362            return 0;
3363        if (!PyUnicode_Check(output)) {
3364            Py_DECREF(output);
3365            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3366            return 0;
3367        }
3368    }
3369    if (PyUnicode_READY(output) < 0) {
3370        Py_DECREF(output);
3371        return 0;
3372    }
3373    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3374                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3375        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3376        Py_DECREF(output);
3377        return 0;
3378    }
3379    *(PyObject**)addr = output;
3380    return Py_CLEANUP_SUPPORTED;
3381}
3382
3383
3384char*
3385PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3386{
3387    PyObject *bytes;
3388
3389    if (!PyUnicode_Check(unicode)) {
3390        PyErr_BadArgument();
3391        return NULL;
3392    }
3393    if (PyUnicode_READY(unicode) == -1)
3394        return NULL;
3395
3396    if (PyUnicode_UTF8(unicode) == NULL) {
3397        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3398        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3399        if (bytes == NULL)
3400            return NULL;
3401        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3402        if (_PyUnicode_UTF8(unicode) == NULL) {
3403            Py_DECREF(bytes);
3404            return NULL;
3405        }
3406        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3407        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3408                  PyBytes_AS_STRING(bytes),
3409                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3410        Py_DECREF(bytes);
3411    }
3412
3413    if (psize)
3414        *psize = PyUnicode_UTF8_LENGTH(unicode);
3415    return PyUnicode_UTF8(unicode);
3416}
3417
3418char*
3419PyUnicode_AsUTF8(PyObject *unicode)
3420{
3421    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3422}
3423
3424#ifdef Py_DEBUG
3425static int unicode_as_unicode_calls = 0;
3426#endif
3427
3428
3429Py_UNICODE *
3430PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3431{
3432    const unsigned char *one_byte;
3433#if SIZEOF_WCHAR_T == 4
3434    const Py_UCS2 *two_bytes;
3435#else
3436    const Py_UCS4 *four_bytes;
3437    const Py_UCS4 *ucs4_end;
3438    Py_ssize_t num_surrogates;
3439#endif
3440    wchar_t *w;
3441    wchar_t *wchar_end;
3442
3443    if (!PyUnicode_Check(unicode)) {
3444        PyErr_BadArgument();
3445        return NULL;
3446    }
3447    if (_PyUnicode_WSTR(unicode) == NULL) {
3448        /* Non-ASCII compact unicode object */
3449        assert(_PyUnicode_KIND(unicode) != 0);
3450        assert(PyUnicode_IS_READY(unicode));
3451
3452#ifdef Py_DEBUG
3453        ++unicode_as_unicode_calls;
3454#endif
3455
3456        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3457#if SIZEOF_WCHAR_T == 2
3458            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3459            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3460            num_surrogates = 0;
3461
3462            for (; four_bytes < ucs4_end; ++four_bytes) {
3463                if (*four_bytes > 0xFFFF)
3464                    ++num_surrogates;
3465            }
3466
3467            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3468                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3469            if (!_PyUnicode_WSTR(unicode)) {
3470                PyErr_NoMemory();
3471                return NULL;
3472            }
3473            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3474
3475            w = _PyUnicode_WSTR(unicode);
3476            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3477            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3478            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3479                if (*four_bytes > 0xFFFF) {
3480                    assert(*four_bytes <= MAX_UNICODE);
3481                    /* encode surrogate pair in this case */
3482                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3483                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3484                }
3485                else
3486                    *w = *four_bytes;
3487
3488                if (w > wchar_end) {
3489                    assert(0 && "Miscalculated string end");
3490                }
3491            }
3492            *w = 0;
3493#else
3494            /* sizeof(wchar_t) == 4 */
3495            Py_FatalError("Impossible unicode object state, wstr and str "
3496                          "should share memory already.");
3497            return NULL;
3498#endif
3499        }
3500        else {
3501            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3502                                                  (_PyUnicode_LENGTH(unicode) + 1));
3503            if (!_PyUnicode_WSTR(unicode)) {
3504                PyErr_NoMemory();
3505                return NULL;
3506            }
3507            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3508                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3509            w = _PyUnicode_WSTR(unicode);
3510            wchar_end = w + _PyUnicode_LENGTH(unicode);
3511
3512            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3513                one_byte = PyUnicode_1BYTE_DATA(unicode);
3514                for (; w < wchar_end; ++one_byte, ++w)
3515                    *w = *one_byte;
3516                /* null-terminate the wstr */
3517                *w = 0;
3518            }
3519            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3520#if SIZEOF_WCHAR_T == 4
3521                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3522                for (; w < wchar_end; ++two_bytes, ++w)
3523                    *w = *two_bytes;
3524                /* null-terminate the wstr */
3525                *w = 0;
3526#else
3527                /* sizeof(wchar_t) == 2 */
3528                PyObject_FREE(_PyUnicode_WSTR(unicode));
3529                _PyUnicode_WSTR(unicode) = NULL;
3530                Py_FatalError("Impossible unicode object state, wstr "
3531                              "and str should share memory already.");
3532                return NULL;
3533#endif
3534            }
3535            else {
3536                assert(0 && "This should never happen.");
3537            }
3538        }
3539    }
3540    if (size != NULL)
3541        *size = PyUnicode_WSTR_LENGTH(unicode);
3542    return _PyUnicode_WSTR(unicode);
3543}
3544
3545Py_UNICODE *
3546PyUnicode_AsUnicode(PyObject *unicode)
3547{
3548    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3549}
3550
3551
3552Py_ssize_t
3553PyUnicode_GetSize(PyObject *unicode)
3554{
3555    if (!PyUnicode_Check(unicode)) {
3556        PyErr_BadArgument();
3557        goto onError;
3558    }
3559    return PyUnicode_GET_SIZE(unicode);
3560
3561  onError:
3562    return -1;
3563}
3564
3565Py_ssize_t
3566PyUnicode_GetLength(PyObject *unicode)
3567{
3568    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3569        PyErr_BadArgument();
3570        return -1;
3571    }
3572
3573    return PyUnicode_GET_LENGTH(unicode);
3574}
3575
3576Py_UCS4
3577PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3578{
3579    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3580        PyErr_BadArgument();
3581        return (Py_UCS4)-1;
3582    }
3583    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3584        PyErr_SetString(PyExc_IndexError, "string index out of range");
3585        return (Py_UCS4)-1;
3586    }
3587    return PyUnicode_READ_CHAR(unicode, index);
3588}
3589
3590int
3591PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3592{
3593    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3594        PyErr_BadArgument();
3595        return -1;
3596    }
3597    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3598        PyErr_SetString(PyExc_IndexError, "string index out of range");
3599        return -1;
3600    }
3601    if (_PyUnicode_Dirty(unicode))
3602        return -1;
3603    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3604                    index, ch);
3605    return 0;
3606}
3607
/* Return the name of the default encoding, which is always "utf-8". The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3613
3614/* create or adjust a UnicodeDecodeError */
3615static void
3616make_decode_exception(PyObject **exceptionObject,
3617                      const char *encoding,
3618                      const char *input, Py_ssize_t length,
3619                      Py_ssize_t startpos, Py_ssize_t endpos,
3620                      const char *reason)
3621{
3622    if (*exceptionObject == NULL) {
3623        *exceptionObject = PyUnicodeDecodeError_Create(
3624            encoding, input, length, startpos, endpos, reason);
3625    }
3626    else {
3627        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3628            goto onError;
3629        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3630            goto onError;
3631        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3632            goto onError;
3633    }
3634    return;
3635
3636onError:
3637    Py_DECREF(*exceptionObject);
3638    *exceptionObject = NULL;
3639}
3640
3641/* error handling callback helper:
3642   build arguments, call the callback and check the arguments,
3643   if no exception occurred, copy the replacement to the output
3644   and adjust various state variables.
3645   return 0 on success, -1 on error
3646*/
3647
3648static int
3649unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3650                                 const char *encoding, const char *reason,
3651                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3652                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3653                                 PyObject **output, Py_ssize_t *outpos)
3654{
3655    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3656
3657    PyObject *restuple = NULL;
3658    PyObject *repunicode = NULL;
3659    Py_ssize_t outsize;
3660    Py_ssize_t insize;
3661    Py_ssize_t requiredsize;
3662    Py_ssize_t newpos;
3663    PyObject *inputobj = NULL;
3664    int res = -1;
3665
3666    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3667        outsize = PyUnicode_GET_LENGTH(*output);
3668    else
3669        outsize = _PyUnicode_WSTR_LENGTH(*output);
3670
3671    if (*errorHandler == NULL) {
3672        *errorHandler = PyCodec_LookupError(errors);
3673        if (*errorHandler == NULL)
3674            goto onError;
3675    }
3676
3677    make_decode_exception(exceptionObject,
3678        encoding,
3679        *input, *inend - *input,
3680        *startinpos, *endinpos,
3681        reason);
3682    if (*exceptionObject == NULL)
3683        goto onError;
3684
3685    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3686    if (restuple == NULL)
3687        goto onError;
3688    if (!PyTuple_Check(restuple)) {
3689        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3690        goto onError;
3691    }
3692    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3693        goto onError;
3694    if (PyUnicode_READY(repunicode) < 0)
3695        goto onError;
3696
3697    /* Copy back the bytes variables, which might have been modified by the
3698       callback */
3699    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3700    if (!inputobj)
3701        goto onError;
3702    if (!PyBytes_Check(inputobj)) {
3703        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3704    }
3705    *input = PyBytes_AS_STRING(inputobj);
3706    insize = PyBytes_GET_SIZE(inputobj);
3707    *inend = *input + insize;
3708    /* we can DECREF safely, as the exception has another reference,
3709       so the object won't go away. */
3710    Py_DECREF(inputobj);
3711
3712    if (newpos<0)
3713        newpos = insize+newpos;
3714    if (newpos<0 || newpos>insize) {
3715        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3716        goto onError;
3717    }
3718
3719    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3720        /* need more space? (at least enough for what we
3721           have+the replacement+the rest of the string (starting
3722           at the new input position), so we won't have to check space
3723           when there are no errors in the rest of the string) */
3724        Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3725        requiredsize = *outpos + replen + insize-newpos;
3726        if (requiredsize > outsize) {
3727            if (requiredsize<2*outsize)
3728                requiredsize = 2*outsize;
3729            if (unicode_resize(output, requiredsize) < 0)
3730                goto onError;
3731        }
3732        if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
3733            goto onError;
3734        copy_characters(*output, *outpos, repunicode, 0, replen);
3735        *outpos += replen;
3736    }
3737    else {
3738        wchar_t *repwstr;
3739        Py_ssize_t repwlen;
3740        repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3741        if (repwstr == NULL)
3742            goto onError;
3743        /* need more space? (at least enough for what we
3744           have+the replacement+the rest of the string (starting
3745           at the new input position), so we won't have to check space
3746           when there are no errors in the rest of the string) */
3747        requiredsize = *outpos + repwlen + insize-newpos;
3748        if (requiredsize > outsize) {
3749            if (requiredsize < 2*outsize)
3750                requiredsize = 2*outsize;
3751            if (unicode_resize(output, requiredsize) < 0)
3752                goto onError;
3753        }
3754        wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3755        *outpos += repwlen;
3756    }
3757    *endinpos = newpos;
3758    *inptr = *input + newpos;
3759
3760    /* we made it! */
3761    res = 0;
3762
3763  onError:
3764    Py_XDECREF(restuple);
3765    return res;
3766}
3767
3768/* --- UTF-7 Codec -------------------------------------------------------- */
3769
3770/* See RFC2152 for details.  We encode conservatively and decode liberally. */
3771
/* Base-64 helpers used by the UTF-7 codec: an alphabet test, a digit
   decoder and a digit encoder.  Each macro may evaluate its argument
   more than once. */

/* Non-zero when c is one of A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('A' <= (c) && (c) <= 'Z') ||              \
     ('a' <= (c) && (c) <= 'z'))

/* Numeric value (0..63) of the base-64 character c.  Anything that is
   not a letter, a digit or '+' maps to 63, the value of '/'. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
/* DECODE_DIRECT: true when a byte found in UTF-7 input stands for
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
3802
3803/* The UTF-7 encoder treats ASCII characters differently according to
3804 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3805 * the above).  See RFC2152.  This array identifies these different
3806 * sets:
3807 * 0 : "Set D"
3808 *     alphanumeric and '(),-./:?
3809 * 1 : "Set O"
3810 *     !"#$%&*;<=>@[]^_`{|}
3811 * 2 : "whitespace"
3812 *     ht nl cr sp
3813 * 3 : special (must be base64 encoded)
3814 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3815 */
3816
/* Read-only lookup table, indexed by ASCII code (0..127); each entry is
   the UTF-7 encoder category 0..3 of that character. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3836
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify; Set D characters (category 0) are
 * always direct, Set O (category 1) only when directO is true and
 * whitespace (category 2) only when directWS is true.  The flag
 * arguments are parenthesized so that compound expressions such as
 * `a || b` combine correctly with the category tests.  c may be
 * evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
3848
3849PyObject *
3850PyUnicode_DecodeUTF7(const char *s,
3851                     Py_ssize_t size,
3852                     const char *errors)
3853{
3854    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3855}
3856
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   the UTF-7 encoded input and its length in bytes.
 *   errors    error handler name, forwarded to
 *             unicode_decode_call_errorhandler() on malformed input.
 *   consumed  if non-NULL, receives the number of input bytes used.
 *             When the input ends inside a shift sequence this is the
 *             offset of the '+' that opened it, and the output written
 *             since that '+' is dropped.  If NULL, ending inside a shift
 *             sequence with pending state is reported through the error
 *             handler as "unterminated shift sequence".
 *
 * Returns the decoded string object, or NULL on failure. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyObject *unicode;
    const char *errmsg = "";
    int inShift = 0;
    /* output position at which the current shift sequence started */
    Py_ssize_t shiftOutStart;
    /* number of not yet consumed bits held in base64buffer */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* pending high surrogate, or 0 if none */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return unicode;
    }

    shiftOutStart = outpos = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* re-entry point used after the end-of-input error handler moved
           the read position back into the input */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* lone high surrogate: emit it unchanged and
                               handle outCh below like any other unit */
                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    /* NOTE(review): ch is stored as-is here, without the
                       DECODE_DIRECT test applied outside shift sequences. */
                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts, both for error reports
               and for backing off in stateful mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (unicode_putchar(&unicode, &outpos, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = outpos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* common error exit of the loop body: report
           [startinpos, current position) with errmsg */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos))
                goto onError;
            /* the handler may have repositioned s inside the input;
               if so, resume decoding from there */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            outpos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* trim the output to the number of characters actually written */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
4047
4048
4049PyObject *
4050_PyUnicode_EncodeUTF7(PyObject *str,
4051                      int base64SetO,
4052                      int base64WhiteSpace,
4053                      const char *errors)
4054{
4055    int kind;
4056    void *data;
4057    Py_ssize_t len;
4058    PyObject *v;
4059    Py_ssize_t allocated;
4060    int inShift = 0;
4061    Py_ssize_t i;
4062    unsigned int base64bits = 0;
4063    unsigned long base64buffer = 0;
4064    char * out;
4065    char * start;
4066
4067    if (PyUnicode_READY(str) < 0)
4068        return NULL;
4069    kind = PyUnicode_KIND(str);
4070    data = PyUnicode_DATA(str);
4071    len = PyUnicode_GET_LENGTH(str);
4072
4073    if (len == 0)
4074        return PyBytes_FromStringAndSize(NULL, 0);
4075
4076    /* It might be possible to tighten this worst case */
4077    allocated = 8 * len;
4078    if (allocated / 8 != len)
4079        return PyErr_NoMemory();
4080
4081    v = PyBytes_FromStringAndSize(NULL, allocated);
4082    if (v == NULL)
4083        return NULL;
4084
4085    start = out = PyBytes_AS_STRING(v);
4086    for (i = 0; i < len; ++i) {
4087        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4088
4089        if (inShift) {
4090            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4091                /* shifting out */
4092                if (base64bits) { /* output remaining bits */
4093                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4094                    base64buffer = 0;
4095                    base64bits = 0;
4096                }
4097                inShift = 0;
4098                /* Characters not in the BASE64 set implicitly unshift the sequence
4099                   so no '-' is required, except if the character is itself a '-' */
4100                if (IS_BASE64(ch) || ch == '-') {
4101                    *out++ = '-';
4102                }
4103                *out++ = (char) ch;
4104            }
4105            else {
4106                goto encode_char;
4107            }
4108        }
4109        else { /* not in a shift sequence */
4110            if (ch == '+') {
4111                *out++ = '+';
4112                        *out++ = '-';
4113            }
4114            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4115                *out++ = (char) ch;
4116            }
4117            else {
4118                *out++ = '+';
4119                inShift = 1;
4120                goto encode_char;
4121            }
4122        }
4123        continue;
4124encode_char:
4125        if (ch >= 0x10000) {
4126            assert(ch <= MAX_UNICODE);
4127
4128            /* code first surrogate */
4129            base64bits += 16;
4130            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4131            while (base64bits >= 6) {
4132                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4133                base64bits -= 6;
4134            }
4135            /* prepare second surrogate */
4136            ch = Py_UNICODE_LOW_SURROGATE(ch);
4137        }
4138        base64bits += 16;
4139        base64buffer = (base64buffer << 16) | ch;
4140        while (base64bits >= 6) {
4141            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4142            base64bits -= 6;
4143        }
4144    }
4145    if (base64bits)
4146        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4147    if (inShift)
4148        *out++ = '-';
4149    if (_PyBytes_Resize(&v, out - start) < 0)
4150        return NULL;
4151    return v;
4152}
4153PyObject *
4154PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4155                     Py_ssize_t size,
4156                     int base64SetO,
4157                     int base64WhiteSpace,
4158                     const char *errors)
4159{
4160    PyObject *result;
4161    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4162    if (tmp == NULL)
4163        return NULL;
4164    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4165                                   base64WhiteSpace, errors);
4166    Py_DECREF(tmp);
4167    return result;
4168}
4169
4170#undef IS_BASE64
4171#undef FROM_BASE64
4172#undef TO_BASE64
4173#undef DECODE_DIRECT
4174#undef ENCODE_DIRECT
4175
4176/* --- UTF-8 Codec -------------------------------------------------------- */
4177
/* Read-only table mapping a UTF-8 encoded prefix byte to the length of
   the sequence it starts.  Zero means illegal prefix.  See RFC 3629 for
   details */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4199
4200PyObject *
4201PyUnicode_DecodeUTF8(const char *s,
4202                     Py_ssize_t size,
4203                     const char *errors)
4204{
4205    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4206}
4207
4208#include "stringlib/ucs1lib.h"
4209#include "stringlib/codecs.h"
4210#include "stringlib/undef.h"
4211
4212#include "stringlib/ucs2lib.h"
4213#include "stringlib/codecs.h"
4214#include "stringlib/undef.h"
4215
4216#include "stringlib/ucs4lib.h"
4217#include "stringlib/codecs.h"
4218#include "stringlib/undef.h"
4219
4220/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4221#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4222
4223/* Mask to quickly check whether a C 'long' contains a
4224   non-ASCII, UTF8-encoded char. */
4225#if (SIZEOF_LONG == 8)
4226# define ASCII_CHAR_MASK 0x8080808080808080L
4227#elif (SIZEOF_LONG == 4)
4228# define ASCII_CHAR_MASK 0x80808080L
4229#else
4230# error C 'long' size should be either 4 or 8!
4231#endif
4232
4233/* Scans a UTF-8 string and returns the maximum character to be expected
4234   and the size of the decoded unicode string.
4235
4236   This function doesn't check for errors, these checks are performed in
4237   PyUnicode_DecodeUTF8Stateful.
4238   */
4239static Py_UCS4
4240utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4241                                  Py_ssize_t *unicode_size)
4242{
4243    Py_ssize_t char_count = 0;
4244    const unsigned char *p = (const unsigned char *)s;
4245    const unsigned char *end = p + string_size;
4246    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4247
4248    assert(unicode_size != NULL);
4249
4250    /* By having a cascade of independent loops which fallback onto each
4251       other, we minimize the amount of work done in the average loop
4252       iteration, and we also maximize the CPU's ability to predict
4253       branches correctly (because a given condition will have always the
4254       same boolean outcome except perhaps in the last iteration of the
4255       corresponding loop).
4256       In the general case this brings us rather close to decoding
4257       performance pre-PEP 393, despite the two-pass decoding.
4258
4259       Note that the pure ASCII loop is not duplicated once a non-ASCII
4260       character has been encountered. It is actually a pessimization (by
4261       a significant factor) to use this loop on text with many non-ASCII
4262       characters, and it is important to avoid bad performance on valid
4263       utf-8 data (invalid utf-8 being a different can of worms).
4264    */
4265
4266    /* ASCII */
4267    for (; p < end; ++p) {
4268        /* Only check value if it's not a ASCII char... */
4269        if (*p < 0x80) {
4270            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4271               an explanation. */
4272            if (!((size_t) p & LONG_PTR_MASK)) {
4273                /* Help register allocation */
4274                register const unsigned char *_p = p;
4275                while (_p < aligned_end) {
4276                    unsigned long value = *(unsigned long *) _p;
4277                    if (value & ASCII_CHAR_MASK)
4278                        break;
4279                    _p += SIZEOF_LONG;
4280                    char_count += SIZEOF_LONG;
4281                }
4282                p = _p;
4283                if (p == end)
4284                    break;
4285            }
4286        }
4287        if (*p < 0x80)
4288            ++char_count;
4289        else
4290            goto _ucs1loop;
4291    }
4292    *unicode_size = char_count;
4293    return 127;
4294
4295_ucs1loop:
4296    for (; p < end; ++p) {
4297        if (*p < 0xc4)
4298            char_count += ((*p & 0xc0) != 0x80);
4299        else
4300            goto _ucs2loop;
4301    }
4302    *unicode_size = char_count;
4303    return 255;
4304
4305_ucs2loop:
4306    for (; p < end; ++p) {
4307        if (*p < 0xf0)
4308            char_count += ((*p & 0xc0) != 0x80);
4309        else
4310            goto _ucs4loop;
4311    }
4312    *unicode_size = char_count;
4313    return 65535;
4314
4315_ucs4loop:
4316    for (; p < end; ++p) {
4317        char_count += ((*p & 0xc0) != 0x80);
4318    }
4319    *unicode_size = char_count;
4320    return 65537;
4321}
4322
4323/* Called when we encountered some error that wasn't detected in the original
4324   scan, e.g. an encoded surrogate character. The original maxchar computation
4325   may have been incorrect, so redo it. */
4326static int
4327refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4328{
4329    PyObject *tmp;
4330    Py_ssize_t k;
4331    Py_UCS4 maxchar;
4332    for (k = 0, maxchar = 0; k < n; k++)
4333        maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4334    tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4335    if (tmp == NULL)
4336        return -1;
4337    PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4338    Py_DECREF(*unicode);
4339    *unicode = tmp;
4340    return 0;
4341}
4342
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
   onError. Potential resizing overallocates, so the result needs to shrink
   at the end.

   When has_errors is zero, the value is stored directly with
   PyUnicode_WRITE(kind, data, index, value).  Otherwise index is copied to
   a local position, the string is resized to pos + pos/8 if that position
   is greater than its current length, and the value is stored through
   unicode_putchar(); a failure of either step jumps to onError.

   index is evaluated exactly once on either path, so an expression with
   side effects (such as i++) may be passed.

   NOTE(review): the resize test is `pos > length`, so a write at
   pos == length is not preceded by a resize -- confirm that
   unicode_putchar() tolerates that index.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (has_errors) {                                               \
            Py_ssize_t pos = index;                                     \
            if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
                unicode_resize(&unicode, pos + pos/8) < 0)              \
                goto onError;                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
        else                                                            \
            PyUnicode_WRITE(kind, data, index, value);                  \
    } while (0)
4361
4362PyObject *
4363PyUnicode_DecodeUTF8Stateful(const char *s,
4364                             Py_ssize_t size,
4365                             const char *errors,
4366                             Py_ssize_t *consumed)
4367{
4368    const char *starts = s;
4369    int n;
4370    int k;
4371    Py_ssize_t startinpos;
4372    Py_ssize_t endinpos;
4373    const char *e, *aligned_end;
4374    PyObject *unicode;
4375    const char *errmsg = "";
4376    PyObject *errorHandler = NULL;
4377    PyObject *exc = NULL;
4378    Py_UCS4 maxchar = 0;
4379    Py_ssize_t unicode_size;
4380    Py_ssize_t i;
4381    int kind;
4382    void *data;
4383    int has_errors = 0;
4384
4385    if (size == 0) {
4386        if (consumed)
4387            *consumed = 0;
4388        return (PyObject *)PyUnicode_New(0, 0);
4389    }
4390    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
4391    /* When the string is ASCII only, just use memcpy and return.
4392       unicode_size may be != size if there is an incomplete UTF-8
4393       sequence at the end of the ASCII block.  */
4394    if (maxchar < 128 && size == unicode_size) {
4395        if (consumed)
4396            *consumed = size;
4397
4398        if (size == 1)
4399            return get_latin1_char((unsigned char)s[0]);
4400
4401        unicode = PyUnicode_New(unicode_size, maxchar);
4402        if (!unicode)
4403            return NULL;
4404        Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4405        assert(_PyUnicode_CheckConsistency(unicode, 1));
4406        return unicode;
4407    }
4408
4409    /* In case of errors, maxchar and size computation might be incorrect;
4410       code below refits and resizes as necessary. */
4411    unicode = PyUnicode_New(unicode_size, maxchar);
4412    if (!unicode)
4413        return NULL;
4414    kind = PyUnicode_KIND(unicode);
4415    data = PyUnicode_DATA(unicode);
4416
4417    /* Unpack UTF-8 encoded data */
4418    i = 0;
4419    e = s + size;
4420    switch (kind) {
4421    case PyUnicode_1BYTE_KIND:
4422        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4423        break;
4424    case PyUnicode_2BYTE_KIND:
4425        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4426        break;
4427    case PyUnicode_4BYTE_KIND:
4428        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4429        break;
4430    }
4431    if (!has_errors) {
4432        /* Ensure the unicode size calculation was correct */
4433        assert(i == unicode_size);
4434        assert(s == e);
4435        if (consumed)
4436            *consumed = s-starts;
4437        return unicode;
4438    }
4439    /* Fall through to the generic decoding loop for the rest of
4440       the string */
4441    if (refit_partial_string(&unicode, kind, data, i) < 0)
4442        goto onError;
4443
4444    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4445
4446    while (s < e) {
4447        Py_UCS4 ch = (unsigned char)*s;
4448
4449        if (ch < 0x80) {
4450            /* Fast path for runs of ASCII characters. Given that common UTF-8
4451               input will consist of an overwhelming majority of ASCII
4452               characters, we try to optimize for this case by checking
4453               as many characters as a C 'long' can contain.
4454               First, check if we can do an aligned read, as most CPUs have
4455               a penalty for unaligned reads.
4456            */
4457            if (!((size_t) s & LONG_PTR_MASK)) {
4458                /* Help register allocation */
4459                register const char *_s = s;
4460                register Py_ssize_t _i = i;
4461                while (_s < aligned_end) {
4462                    /* Read a whole long at a time (either 4 or 8 bytes),
4463                       and do a fast unrolled copy if it only contains ASCII
4464                       characters. */
4465                    unsigned long value = *(unsigned long *) _s;
4466                    if (value & ASCII_CHAR_MASK)
4467                        break;
4468                    WRITE_MAYBE_FAIL(_i+0, _s[0]);
4469                    WRITE_MAYBE_FAIL(_i+1, _s[1]);
4470                    WRITE_MAYBE_FAIL(_i+2, _s[2]);
4471                    WRITE_MAYBE_FAIL(_i+3, _s[3]);
4472#if (SIZEOF_LONG == 8)
4473                    WRITE_MAYBE_FAIL(_i+4, _s[4]);
4474                    WRITE_MAYBE_FAIL(_i+5, _s[5]);
4475                    WRITE_MAYBE_FAIL(_i+6, _s[6]);
4476                    WRITE_MAYBE_FAIL(_i+7, _s[7]);
4477#endif
4478                    _s += SIZEOF_LONG;
4479                    _i += SIZEOF_LONG;
4480                }
4481                s = _s;
4482                i = _i;
4483                if (s == e)
4484                    break;
4485                ch = (unsigned char)*s;
4486            }
4487        }
4488
4489        if (ch < 0x80) {
4490            WRITE_MAYBE_FAIL(i++, ch);
4491            s++;
4492            continue;
4493        }
4494
4495        n = utf8_code_length[ch];
4496
4497        if (s + n > e) {
4498            if (consumed)
4499                break;
4500            else {
4501                errmsg = "unexpected end of data";
4502                startinpos = s-starts;
4503                endinpos = startinpos+1;
4504                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4505                    endinpos++;
4506                goto utf8Error;
4507            }
4508        }
4509
4510        switch (n) {
4511
4512        case 0:
4513            errmsg = "invalid start byte";
4514            startinpos = s-starts;
4515            endinpos = startinpos+1;
4516            goto utf8Error;
4517
4518        case 1:
4519            errmsg = "internal error";
4520            startinpos = s-starts;
4521            endinpos = startinpos+1;
4522            goto utf8Error;
4523
4524        case 2:
4525            if ((s[1] & 0xc0) != 0x80) {
4526                errmsg = "invalid continuation byte";
4527                startinpos = s-starts;
4528                endinpos = startinpos + 1;
4529                goto utf8Error;
4530            }
4531            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4532            assert ((ch > 0x007F) && (ch <= 0x07FF));
4533            WRITE_MAYBE_FAIL(i++, ch);
4534            break;
4535
4536        case 3:
4537            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4538               will result in surrogates in range d800-dfff. Surrogates are
4539               not valid UTF-8 so they are rejected.
4540               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4541               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4542            if ((s[1] & 0xc0) != 0x80 ||
4543                (s[2] & 0xc0) != 0x80 ||
4544                ((unsigned char)s[0] == 0xE0 &&
4545                 (unsigned char)s[1] < 0xA0) ||
4546                ((unsigned char)s[0] == 0xED &&
4547                 (unsigned char)s[1] > 0x9F)) {
4548                errmsg = "invalid continuation byte";
4549                startinpos = s-starts;
4550                endinpos = startinpos + 1;
4551
4552                /* if s[1] first two bits are 1 and 0, then the invalid
4553                   continuation byte is s[2], so increment endinpos by 1,
4554                   if not, s[1] is invalid and endinpos doesn't need to
4555                   be incremented. */
4556                if ((s[1] & 0xC0) == 0x80)
4557                    endinpos++;
4558                goto utf8Error;
4559            }
4560            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4561            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4562            WRITE_MAYBE_FAIL(i++, ch);
4563            break;
4564
4565        case 4:
4566            if ((s[1] & 0xc0) != 0x80 ||
4567                (s[2] & 0xc0) != 0x80 ||
4568                (s[3] & 0xc0) != 0x80 ||
4569                ((unsigned char)s[0] == 0xF0 &&
4570                 (unsigned char)s[1] < 0x90) ||
4571                ((unsigned char)s[0] == 0xF4 &&
4572                 (unsigned char)s[1] > 0x8F)) {
4573                errmsg = "invalid continuation byte";
4574                startinpos = s-starts;
4575                endinpos = startinpos + 1;
4576                if ((s[1] & 0xC0) == 0x80) {
4577                    endinpos++;
4578                    if ((s[2] & 0xC0) == 0x80)
4579                        endinpos++;
4580                }
4581                goto utf8Error;
4582            }
4583            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4584                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4585            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
4586
4587            WRITE_MAYBE_FAIL(i++, ch);
4588            break;
4589        }
4590        s += n;
4591        continue;
4592
4593      utf8Error:
4594        if (!has_errors) {
4595            if (refit_partial_string(&unicode, kind, data, i) < 0)
4596                goto onError;
4597            has_errors = 1;
4598        }
4599        if (unicode_decode_call_errorhandler(
4600                errors, &errorHandler,
4601                "utf8", errmsg,
4602                &starts, &e, &startinpos, &endinpos, &exc, &s,
4603                &unicode, &i))
4604            goto onError;
4605        /* Update data because unicode_decode_call_errorhandler might have
4606           re-created or resized the unicode object. */
4607        data = PyUnicode_DATA(unicode);
4608        kind = PyUnicode_KIND(unicode);
4609        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4610    }
4611    /* Ensure the unicode_size calculation above was correct: */
4612    assert(has_errors || i == unicode_size);
4613
4614    if (consumed)
4615        *consumed = s-starts;
4616
4617    /* Adjust length and ready string when it contained errors and
4618       is of the old resizable kind. */
4619    if (has_errors) {
4620        if (PyUnicode_Resize(&unicode, i) < 0)
4621            goto onError;
4622    }
4623
4624    Py_XDECREF(errorHandler);
4625    Py_XDECREF(exc);
4626    assert(_PyUnicode_CheckConsistency(unicode, 1));
4627    return unicode;
4628
4629  onError:
4630    Py_XDECREF(errorHandler);
4631    Py_XDECREF(exc);
4632    Py_DECREF(unicode);
4633    return NULL;
4634}
4635
4636#undef WRITE_MAYBE_FAIL
4637
4638#ifdef __APPLE__
4639
4640/* Simplified UTF-8 decoder using surrogateescape error handler,
4641   used to decode the command line arguments on Mac OS X. */
4642
4643wchar_t*
4644_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4645{
4646    int n;
4647    const char *e;
4648    wchar_t *unicode, *p;
4649
4650    /* Note: size will always be longer than the resulting Unicode
4651       character count */
4652    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4653        PyErr_NoMemory();
4654        return NULL;
4655    }
4656    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4657    if (!unicode)
4658        return NULL;
4659
4660    /* Unpack UTF-8 encoded data */
4661    p = unicode;
4662    e = s + size;
4663    while (s < e) {
4664        Py_UCS4 ch = (unsigned char)*s;
4665
4666        if (ch < 0x80) {
4667            *p++ = (wchar_t)ch;
4668            s++;
4669            continue;
4670        }
4671
4672        n = utf8_code_length[ch];
4673        if (s + n > e) {
4674            goto surrogateescape;
4675        }
4676
4677        switch (n) {
4678        case 0:
4679        case 1:
4680            goto surrogateescape;
4681
4682        case 2:
4683            if ((s[1] & 0xc0) != 0x80)
4684                goto surrogateescape;
4685            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4686            assert ((ch > 0x007F) && (ch <= 0x07FF));
4687            *p++ = (wchar_t)ch;
4688            break;
4689
4690        case 3:
4691            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4692               will result in surrogates in range d800-dfff. Surrogates are
4693               not valid UTF-8 so they are rejected.
4694               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4695               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4696            if ((s[1] & 0xc0) != 0x80 ||
4697                (s[2] & 0xc0) != 0x80 ||
4698                ((unsigned char)s[0] == 0xE0 &&
4699                 (unsigned char)s[1] < 0xA0) ||
4700                ((unsigned char)s[0] == 0xED &&
4701                 (unsigned char)s[1] > 0x9F)) {
4702
4703                goto surrogateescape;
4704            }
4705            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4706            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4707            *p++ = (wchar_t)ch;
4708            break;
4709
4710        case 4:
4711            if ((s[1] & 0xc0) != 0x80 ||
4712                (s[2] & 0xc0) != 0x80 ||
4713                (s[3] & 0xc0) != 0x80 ||
4714                ((unsigned char)s[0] == 0xF0 &&
4715                 (unsigned char)s[1] < 0x90) ||
4716                ((unsigned char)s[0] == 0xF4 &&
4717                 (unsigned char)s[1] > 0x8F)) {
4718                goto surrogateescape;
4719            }
4720            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4721                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4722            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
4723
4724#if SIZEOF_WCHAR_T == 4
4725            *p++ = (wchar_t)ch;
4726#else
4727            /*  compute and append the two surrogates: */
4728            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4729            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4730#endif
4731            break;
4732        }
4733        s += n;
4734        continue;
4735
4736      surrogateescape:
4737        *p++ = 0xDC00 + ch;
4738        s++;
4739    }
4740    *p = L'\0';
4741    return unicode;
4742}
4743
4744#endif /* __APPLE__ */
4745
4746/* Primary internal function which creates utf8 encoded bytes objects.
4747
4748   Allocation strategy:  if the string is short, convert into a stack buffer
4749   and allocate exactly as much space needed at the end.  Else allocate the
4750   maximum possible needed (4 result bytes per Unicode character), and return
4751   the excess memory at the end.
4752*/
4753PyObject *
4754_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4755{
4756#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
4757
4758    Py_ssize_t i;                /* index into s of next input byte */
4759    PyObject *result;            /* result string object */
4760    char *p;                     /* next free byte in output buffer */
4761    Py_ssize_t nallocated;      /* number of result bytes allocated */
4762    Py_ssize_t nneeded;            /* number of result bytes needed */
4763    char stackbuf[MAX_SHORT_UNICHARS * 4];
4764    PyObject *errorHandler = NULL;
4765    PyObject *exc = NULL;
4766    int kind;
4767    void *data;
4768    Py_ssize_t size;
4769    PyObject *rep = NULL;
4770
4771    if (!PyUnicode_Check(unicode)) {
4772        PyErr_BadArgument();
4773        return NULL;
4774    }
4775
4776    if (PyUnicode_READY(unicode) == -1)
4777        return NULL;
4778
4779    if (PyUnicode_UTF8(unicode))
4780        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4781                                         PyUnicode_UTF8_LENGTH(unicode));
4782
4783    kind = PyUnicode_KIND(unicode);
4784    data = PyUnicode_DATA(unicode);
4785    size = PyUnicode_GET_LENGTH(unicode);
4786
4787    assert(size >= 0);
4788
4789    if (size <= MAX_SHORT_UNICHARS) {
4790        /* Write into the stack buffer; nallocated can't overflow.
4791         * At the end, we'll allocate exactly as much heap space as it
4792         * turns out we need.
4793         */
4794        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4795        result = NULL;   /* will allocate after we're done */
4796        p = stackbuf;
4797    }
4798    else {
4799        /* Overallocate on the heap, and give the excess back at the end. */
4800        nallocated = size * 4;
4801        if (nallocated / 4 != size)  /* overflow! */
4802            return PyErr_NoMemory();
4803        result = PyBytes_FromStringAndSize(NULL, nallocated);
4804        if (result == NULL)
4805            return NULL;
4806        p = PyBytes_AS_STRING(result);
4807    }
4808
4809    for (i = 0; i < size;) {
4810        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4811
4812        if (ch < 0x80)
4813            /* Encode ASCII */
4814            *p++ = (char) ch;
4815
4816        else if (ch < 0x0800) {
4817            /* Encode Latin-1 */
4818            *p++ = (char)(0xc0 | (ch >> 6));
4819            *p++ = (char)(0x80 | (ch & 0x3f));
4820        } else if (Py_UNICODE_IS_SURROGATE(ch)) {
4821            Py_ssize_t newpos;
4822            Py_ssize_t repsize, k, startpos;
4823            startpos = i-1;
4824            rep = unicode_encode_call_errorhandler(
4825                  errors, &errorHandler, "utf-8", "surrogates not allowed",
4826                  unicode, &exc, startpos, startpos+1, &newpos);
4827            if (!rep)
4828                goto error;
4829
4830            if (PyBytes_Check(rep))
4831                repsize = PyBytes_GET_SIZE(rep);
4832            else
4833                repsize = PyUnicode_GET_LENGTH(rep);
4834
4835            if (repsize > 4) {
4836                Py_ssize_t offset;
4837
4838                if (result == NULL)
4839                    offset = p - stackbuf;
4840                else
4841                    offset = p - PyBytes_AS_STRING(result);
4842
4843                if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4844                    /* integer overflow */
4845                    PyErr_NoMemory();
4846                    goto error;
4847                }
4848                nallocated += repsize - 4;
4849                if (result != NULL) {
4850                    if (_PyBytes_Resize(&result, nallocated) < 0)
4851                        goto error;
4852                } else {
4853                    result = PyBytes_FromStringAndSize(NULL, nallocated);
4854                    if (result == NULL)
4855                        goto error;
4856                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4857                }
4858                p = PyBytes_AS_STRING(result) + offset;
4859            }
4860
4861            if (PyBytes_Check(rep)) {
4862                char *prep = PyBytes_AS_STRING(rep);
4863                for(k = repsize; k > 0; k--)
4864                    *p++ = *prep++;
4865            } else /* rep is unicode */ {
4866                enum PyUnicode_Kind repkind;
4867                void *repdata;
4868
4869                if (PyUnicode_READY(rep) < 0)
4870                    goto error;
4871                repkind = PyUnicode_KIND(rep);
4872                repdata = PyUnicode_DATA(rep);
4873
4874                for(k=0; k<repsize; k++) {
4875                    Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
4876                    if (0x80 <= c) {
4877                        raise_encode_exception(&exc, "utf-8",
4878                                               unicode,
4879                                               i-1, i,
4880                                               "surrogates not allowed");
4881                        goto error;
4882                    }
4883                    *p++ = (char)c;
4884                }
4885            }
4886            Py_CLEAR(rep);
4887        } else if (ch < 0x10000) {
4888            *p++ = (char)(0xe0 | (ch >> 12));
4889            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4890            *p++ = (char)(0x80 | (ch & 0x3f));
4891        } else /* ch >= 0x10000 */ {
4892            assert(ch <= MAX_UNICODE);
4893            /* Encode UCS4 Unicode ordinals */
4894            *p++ = (char)(0xf0 | (ch >> 18));
4895            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4896            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4897            *p++ = (char)(0x80 | (ch & 0x3f));
4898        }
4899    }
4900
4901    if (result == NULL) {
4902        /* This was stack allocated. */
4903        nneeded = p - stackbuf;
4904        assert(nneeded <= nallocated);
4905        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4906    }
4907    else {
4908        /* Cut back to size actually needed. */
4909        nneeded = p - PyBytes_AS_STRING(result);
4910        assert(nneeded <= nallocated);
4911        _PyBytes_Resize(&result, nneeded);
4912    }
4913
4914    Py_XDECREF(errorHandler);
4915    Py_XDECREF(exc);
4916    return result;
4917 error:
4918    Py_XDECREF(rep);
4919    Py_XDECREF(errorHandler);
4920    Py_XDECREF(exc);
4921    Py_XDECREF(result);
4922    return NULL;
4923
4924#undef MAX_SHORT_UNICHARS
4925}
4926
4927PyObject *
4928PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4929                     Py_ssize_t size,
4930                     const char *errors)
4931{
4932    PyObject *v, *unicode;
4933
4934    unicode = PyUnicode_FromUnicode(s, size);
4935    if (unicode == NULL)
4936        return NULL;
4937    v = _PyUnicode_AsUTF8String(unicode, errors);
4938    Py_DECREF(unicode);
4939    return v;
4940}
4941
4942PyObject *
4943PyUnicode_AsUTF8String(PyObject *unicode)
4944{
4945    return _PyUnicode_AsUTF8String(unicode, NULL);
4946}
4947
4948/* --- UTF-32 Codec ------------------------------------------------------- */
4949
4950PyObject *
4951PyUnicode_DecodeUTF32(const char *s,
4952                      Py_ssize_t size,
4953                      const char *errors,
4954                      int *byteorder)
4955{
4956    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4957}
4958
4959PyObject *
4960PyUnicode_DecodeUTF32Stateful(const char *s,
4961                              Py_ssize_t size,
4962                              const char *errors,
4963                              int *byteorder,
4964                              Py_ssize_t *consumed)
4965{
4966    const char *starts = s;
4967    Py_ssize_t startinpos;
4968    Py_ssize_t endinpos;
4969    Py_ssize_t outpos;
4970    PyObject *unicode;
4971    const unsigned char *q, *e;
4972    int bo = 0;       /* assume native ordering by default */
4973    const char *errmsg = "";
4974    /* Offsets from q for retrieving bytes in the right order. */
4975#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4976    int iorder[] = {0, 1, 2, 3};
4977#else
4978    int iorder[] = {3, 2, 1, 0};
4979#endif
4980    PyObject *errorHandler = NULL;
4981    PyObject *exc = NULL;
4982
4983    q = (unsigned char *)s;
4984    e = q + size;
4985
4986    if (byteorder)
4987        bo = *byteorder;
4988
4989    /* Check for BOM marks (U+FEFF) in the input and adjust current
4990       byte order setting accordingly. In native mode, the leading BOM
4991       mark is skipped, in all other modes, it is copied to the output
4992       stream as-is (giving a ZWNBSP character). */
4993    if (bo == 0) {
4994        if (size >= 4) {
4995            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4996                (q[iorder[1]] << 8) | q[iorder[0]];
4997#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4998            if (bom == 0x0000FEFF) {
4999                q += 4;
5000                bo = -1;
5001            }
5002            else if (bom == 0xFFFE0000) {
5003                q += 4;
5004                bo = 1;
5005            }
5006#else
5007            if (bom == 0x0000FEFF) {
5008                q += 4;
5009                bo = 1;
5010            }
5011            else if (bom == 0xFFFE0000) {
5012                q += 4;
5013                bo = -1;
5014            }
5015#endif
5016        }
5017    }
5018
5019    if (bo == -1) {
5020        /* force LE */
5021        iorder[0] = 0;
5022        iorder[1] = 1;
5023        iorder[2] = 2;
5024        iorder[3] = 3;
5025    }
5026    else if (bo == 1) {
5027        /* force BE */
5028        iorder[0] = 3;
5029        iorder[1] = 2;
5030        iorder[2] = 1;
5031        iorder[3] = 0;
5032    }
5033
5034    /* This might be one to much, because of a BOM */
5035    unicode = PyUnicode_New((size+3)/4, 127);
5036    if (!unicode)
5037        return NULL;
5038    if (size == 0)
5039        return unicode;
5040    outpos = 0;
5041
5042    while (q < e) {
5043        Py_UCS4 ch;
5044        /* remaining bytes at the end? (size should be divisible by 4) */
5045        if (e-q<4) {
5046            if (consumed)
5047                break;
5048            errmsg = "truncated data";
5049            startinpos = ((const char *)q)-starts;
5050            endinpos = ((const char *)e)-starts;
5051            goto utf32Error;
5052            /* The remaining input chars are ignored if the callback
5053               chooses to skip the input */
5054        }
5055        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5056            (q[iorder[1]] << 8) | q[iorder[0]];
5057
5058        if (ch >= 0x110000)
5059        {
5060            errmsg = "codepoint not in range(0x110000)";
5061            startinpos = ((const char *)q)-starts;
5062            endinpos = startinpos+4;
5063            goto utf32Error;
5064        }
5065        if (unicode_putchar(&unicode, &outpos, ch) < 0)
5066            goto onError;
5067        q += 4;
5068        continue;
5069      utf32Error:
5070        if (unicode_decode_call_errorhandler(
5071                errors, &errorHandler,
5072                "utf32", errmsg,
5073                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5074                &unicode, &outpos))
5075            goto onError;
5076    }
5077
5078    if (byteorder)
5079        *byteorder = bo;
5080
5081    if (consumed)
5082        *consumed = (const char *)q-starts;
5083
5084    /* Adjust length */
5085    if (PyUnicode_Resize(&unicode, outpos) < 0)
5086        goto onError;
5087
5088    Py_XDECREF(errorHandler);
5089    Py_XDECREF(exc);
5090    return unicode_result(unicode);
5091
5092  onError:
5093    Py_DECREF(unicode);
5094    Py_XDECREF(errorHandler);
5095    Py_XDECREF(exc);
5096    return NULL;
5097}
5098
5099PyObject *
5100_PyUnicode_EncodeUTF32(PyObject *str,
5101                       const char *errors,
5102                       int byteorder)
5103{
5104    int kind;
5105    void *data;
5106    Py_ssize_t len;
5107    PyObject *v;
5108    unsigned char *p;
5109    Py_ssize_t nsize, bytesize, i;
5110    /* Offsets from p for storing byte pairs in the right order. */
5111#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5112    int iorder[] = {0, 1, 2, 3};
5113#else
5114    int iorder[] = {3, 2, 1, 0};
5115#endif
5116
5117#define STORECHAR(CH)                           \
5118    do {                                        \
5119        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5120        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5121        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5122        p[iorder[0]] = (CH) & 0xff;             \
5123        p += 4;                                 \
5124    } while(0)
5125
5126    if (!PyUnicode_Check(str)) {
5127        PyErr_BadArgument();
5128        return NULL;
5129    }
5130    if (PyUnicode_READY(str) < 0)
5131        return NULL;
5132    kind = PyUnicode_KIND(str);
5133    data = PyUnicode_DATA(str);
5134    len = PyUnicode_GET_LENGTH(str);
5135
5136    nsize = len + (byteorder == 0);
5137    bytesize = nsize * 4;
5138    if (bytesize / 4 != nsize)
5139        return PyErr_NoMemory();
5140    v = PyBytes_FromStringAndSize(NULL, bytesize);
5141    if (v == NULL)
5142        return NULL;
5143
5144    p = (unsigned char *)PyBytes_AS_STRING(v);
5145    if (byteorder == 0)
5146        STORECHAR(0xFEFF);
5147    if (len == 0)
5148        goto done;
5149
5150    if (byteorder == -1) {
5151        /* force LE */
5152        iorder[0] = 0;
5153        iorder[1] = 1;
5154        iorder[2] = 2;
5155        iorder[3] = 3;
5156    }
5157    else if (byteorder == 1) {
5158        /* force BE */
5159        iorder[0] = 3;
5160        iorder[1] = 2;
5161        iorder[2] = 1;
5162        iorder[3] = 0;
5163    }
5164
5165    for (i = 0; i < len; i++)
5166        STORECHAR(PyUnicode_READ(kind, data, i));
5167
5168  done:
5169    return v;
5170#undef STORECHAR
5171}
5172
5173PyObject *
5174PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5175                      Py_ssize_t size,
5176                      const char *errors,
5177                      int byteorder)
5178{
5179    PyObject *result;
5180    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5181    if (tmp == NULL)
5182        return NULL;
5183    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5184    Py_DECREF(tmp);
5185    return result;
5186}
5187
5188PyObject *
5189PyUnicode_AsUTF32String(PyObject *unicode)
5190{
5191    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5192}
5193
5194/* --- UTF-16 Codec ------------------------------------------------------- */
5195
5196PyObject *
5197PyUnicode_DecodeUTF16(const char *s,
5198                      Py_ssize_t size,
5199                      const char *errors,
5200                      int *byteorder)
5201{
5202    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5203}
5204
5205/* Two masks for fast checking of whether a C 'long' may contain
5206   UTF16-encoded surrogate characters. This is an efficient heuristic,
5207   assuming that non-surrogate characters with a code point >= 0x8000 are
5208   rare in most input.
5209   FAST_CHAR_MASK is used when the input is in native byte ordering,
5210   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
5211*/
5212#if (SIZEOF_LONG == 8)
5213# define FAST_CHAR_MASK         0x8000800080008000L
5214# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5215#elif (SIZEOF_LONG == 4)
5216# define FAST_CHAR_MASK         0x80008000L
5217# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5218#else
5219# error C 'long' size should be either 4 or 8!
5220#endif
5221
5222PyObject *
5223PyUnicode_DecodeUTF16Stateful(const char *s,
5224                              Py_ssize_t size,
5225                              const char *errors,
5226                              int *byteorder,
5227                              Py_ssize_t *consumed)
5228{
5229    const char *starts = s;
5230    Py_ssize_t startinpos;
5231    Py_ssize_t endinpos;
5232    Py_ssize_t outpos;
5233    PyObject *unicode;
5234    const unsigned char *q, *e, *aligned_end;
5235    int bo = 0;       /* assume native ordering by default */
5236    int native_ordering = 0;
5237    const char *errmsg = "";
5238    /* Offsets from q for retrieving byte pairs in the right order. */
5239#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5240    int ihi = 1, ilo = 0;
5241#else
5242    int ihi = 0, ilo = 1;
5243#endif
5244    PyObject *errorHandler = NULL;
5245    PyObject *exc = NULL;
5246
5247    /* Note: size will always be longer than the resulting Unicode
5248       character count */
5249    unicode = PyUnicode_New(size, 127);
5250    if (!unicode)
5251        return NULL;
5252    if (size == 0)
5253        return unicode;
5254    outpos = 0;
5255
5256    q = (unsigned char *)s;
5257    e = q + size - 1;
5258
5259    if (byteorder)
5260        bo = *byteorder;
5261
5262    /* Check for BOM marks (U+FEFF) in the input and adjust current
5263       byte order setting accordingly. In native mode, the leading BOM
5264       mark is skipped, in all other modes, it is copied to the output
5265       stream as-is (giving a ZWNBSP character). */
5266    if (bo == 0) {
5267        if (size >= 2) {
5268            const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
5269#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5270            if (bom == 0xFEFF) {
5271                q += 2;
5272                bo = -1;
5273            }
5274            else if (bom == 0xFFFE) {
5275                q += 2;
5276                bo = 1;
5277            }
5278#else
5279            if (bom == 0xFEFF) {
5280                q += 2;
5281                bo = 1;
5282            }
5283            else if (bom == 0xFFFE) {
5284                q += 2;
5285                bo = -1;
5286            }
5287#endif
5288        }
5289    }
5290
5291    if (bo == -1) {
5292        /* force LE */
5293        ihi = 1;
5294        ilo = 0;
5295    }
5296    else if (bo == 1) {
5297        /* force BE */
5298        ihi = 0;
5299        ilo = 1;
5300    }
5301#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5302    native_ordering = ilo < ihi;
5303#else
5304    native_ordering = ilo > ihi;
5305#endif
5306
5307    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
5308    while (q < e) {
5309        Py_UCS4 ch;
5310        /* First check for possible aligned read of a C 'long'. Unaligned
5311           reads are more expensive, better to defer to another iteration. */
5312        if (!((size_t) q & LONG_PTR_MASK)) {
5313            /* Fast path for runs of non-surrogate chars. */
5314            register const unsigned char *_q = q;
5315            int kind = PyUnicode_KIND(unicode);
5316            void *data = PyUnicode_DATA(unicode);
5317            while (_q < aligned_end) {
5318                unsigned long block = * (unsigned long *) _q;
5319                unsigned short *pblock = (unsigned short*)&block;
5320                Py_UCS4 maxch;
5321                if (native_ordering) {
5322                    /* Can use buffer directly */
5323                    if (block & FAST_CHAR_MASK)
5324                        break;
5325                }
5326                else {
5327                    /* Need to byte-swap */
5328                    unsigned char *_p = (unsigned char*)pblock;
5329                    if (block & SWAPPED_FAST_CHAR_MASK)
5330                        break;
5331                    _p[0] = _q[1];
5332                    _p[1] = _q[0];
5333                    _p[2] = _q[3];
5334                    _p[3] = _q[2];
5335#if (SIZEOF_LONG == 8)
5336                    _p[4] = _q[5];
5337                    _p[5] = _q[4];
5338                    _p[6] = _q[7];
5339                    _p[7] = _q[6];
5340#endif
5341                }
5342                maxch = Py_MAX(pblock[0], pblock[1]);
5343#if SIZEOF_LONG == 8
5344                maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5345#endif
5346                if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5347                    if (unicode_widen(&unicode, maxch) < 0)
5348                        goto onError;
5349                    kind = PyUnicode_KIND(unicode);
5350                    data = PyUnicode_DATA(unicode);
5351                }
5352                PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5353                PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5354#if SIZEOF_LONG == 8
5355                PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5356                PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5357#endif
5358                _q += SIZEOF_LONG;
5359            }
5360            q = _q;
5361            if (q >= e)
5362                break;
5363        }
5364        ch = (q[ihi] << 8) | q[ilo];
5365
5366        q += 2;
5367
5368        if (!Py_UNICODE_IS_SURROGATE(ch)) {
5369            if (unicode_putchar(&unicode, &outpos, ch) < 0)
5370                goto onError;
5371            continue;
5372        }
5373
5374        /* UTF-16 code pair: */
5375        if (q > e) {
5376            errmsg = "unexpected end of data";
5377            startinpos = (((const char *)q) - 2) - starts;
5378            endinpos = ((const char *)e) + 1 - starts;
5379            goto utf16Error;
5380        }
5381        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5382            Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
5383            q += 2;
5384            if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
5385                if (unicode_putchar(&unicode, &outpos,
5386                                    Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
5387                    goto onError;
5388                continue;
5389            }
5390            else {
5391                errmsg = "illegal UTF-16 surrogate";
5392                startinpos = (((const char *)q)-4)-starts;
5393                endinpos = startinpos+2;
5394                goto utf16Error;
5395            }
5396
5397        }
5398        errmsg = "illegal encoding";
5399        startinpos = (((const char *)q)-2)-starts;
5400        endinpos = startinpos+2;
5401        /* Fall through to report the error */
5402
5403      utf16Error:
5404        if (unicode_decode_call_errorhandler(
5405                errors,
5406                &errorHandler,
5407                "utf16", errmsg,
5408                &starts,
5409                (const char **)&e,
5410                &startinpos,
5411                &endinpos,
5412                &exc,
5413                (const char **)&q,
5414                &unicode,
5415                &outpos))
5416            goto onError;
5417    }
5418    /* remaining byte at the end? (size should be even) */
5419    if (e == q) {
5420        if (!consumed) {
5421            errmsg = "truncated data";
5422            startinpos = ((const char *)q) - starts;
5423            endinpos = ((const char *)e) + 1 - starts;
5424            if (unicode_decode_call_errorhandler(
5425                    errors,
5426                    &errorHandler,
5427                    "utf16", errmsg,
5428                    &starts,
5429                    (const char **)&e,
5430                    &startinpos,
5431                    &endinpos,
5432                    &exc,
5433                    (const char **)&q,
5434                    &unicode,
5435                    &outpos))
5436                goto onError;
5437            /* The remaining input chars are ignored if the callback
5438               chooses to skip the input */
5439        }
5440    }
5441
5442    if (byteorder)
5443        *byteorder = bo;
5444
5445    if (consumed)
5446        *consumed = (const char *)q-starts;
5447
5448    /* Adjust length */
5449    if (PyUnicode_Resize(&unicode, outpos) < 0)
5450        goto onError;
5451
5452    Py_XDECREF(errorHandler);
5453    Py_XDECREF(exc);
5454    return unicode_result(unicode);
5455
5456  onError:
5457    Py_DECREF(unicode);
5458    Py_XDECREF(errorHandler);
5459    Py_XDECREF(exc);
5460    return NULL;
5461}
5462
5463#undef FAST_CHAR_MASK
5464#undef SWAPPED_FAST_CHAR_MASK
5465
5466PyObject *
5467_PyUnicode_EncodeUTF16(PyObject *str,
5468                       const char *errors,
5469                       int byteorder)
5470{
5471    int kind;
5472    void *data;
5473    Py_ssize_t len;
5474    PyObject *v;
5475    unsigned char *p;
5476    Py_ssize_t nsize, bytesize;
5477    Py_ssize_t i, pairs;
5478    /* Offsets from p for storing byte pairs in the right order. */
5479#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5480    int ihi = 1, ilo = 0;
5481#else
5482    int ihi = 0, ilo = 1;
5483#endif
5484
5485#define STORECHAR(CH)                           \
5486    do {                                        \
5487        p[ihi] = ((CH) >> 8) & 0xff;            \
5488        p[ilo] = (CH) & 0xff;                   \
5489        p += 2;                                 \
5490    } while(0)
5491
5492    if (!PyUnicode_Check(str)) {
5493        PyErr_BadArgument();
5494        return NULL;
5495    }
5496    if (PyUnicode_READY(str) < 0)
5497        return NULL;
5498    kind = PyUnicode_KIND(str);
5499    data = PyUnicode_DATA(str);
5500    len = PyUnicode_GET_LENGTH(str);
5501
5502    pairs = 0;
5503    if (kind == PyUnicode_4BYTE_KIND)
5504        for (i = 0; i < len; i++)
5505            if (PyUnicode_READ(kind, data, i) >= 0x10000)
5506                pairs++;
5507    /* 2 * (len + pairs + (byteorder == 0)) */
5508    if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5509        return PyErr_NoMemory();
5510    nsize = len + pairs + (byteorder == 0);
5511    bytesize = nsize * 2;
5512    if (bytesize / 2 != nsize)
5513        return PyErr_NoMemory();
5514    v = PyBytes_FromStringAndSize(NULL, bytesize);
5515    if (v == NULL)
5516        return NULL;
5517
5518    p = (unsigned char *)PyBytes_AS_STRING(v);
5519    if (byteorder == 0)
5520        STORECHAR(0xFEFF);
5521    if (len == 0)
5522        goto done;
5523
5524    if (byteorder == -1) {
5525        /* force LE */
5526        ihi = 1;
5527        ilo = 0;
5528    }
5529    else if (byteorder == 1) {
5530        /* force BE */
5531        ihi = 0;
5532        ilo = 1;
5533    }
5534
5535    for (i = 0; i < len; i++) {
5536        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5537        Py_UCS4 ch2 = 0;
5538        if (ch >= 0x10000) {
5539            ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5540            ch  = Py_UNICODE_HIGH_SURROGATE(ch);
5541        }
5542        STORECHAR(ch);
5543        if (ch2)
5544            STORECHAR(ch2);
5545    }
5546
5547  done:
5548    return v;
5549#undef STORECHAR
5550}
5551
5552PyObject *
5553PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5554                      Py_ssize_t size,
5555                      const char *errors,
5556                      int byteorder)
5557{
5558    PyObject *result;
5559    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5560    if (tmp == NULL)
5561        return NULL;
5562    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5563    Py_DECREF(tmp);
5564    return result;
5565}
5566
5567PyObject *
5568PyUnicode_AsUTF16String(PyObject *unicode)
5569{
5570    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5571}
5572
5573/* --- Unicode Escape Codec ----------------------------------------------- */
5574
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped byte string `s` of `size` bytes decodes to a pure ASCII
   string and, if so, how many characters the result has.

   Returns the number of characters of the decoded string, or -1 when
   that cannot be guaranteed to be ASCII-only:
     - size is negative,
     - a byte above 127 is present,
     - a backslash is the last byte,
     - an octal, \x, \u, \U or \N escape is present.

   Counting rules for the remaining escapes:
     - backslash + newline produces no character,
     - the single-character escapes (\\ \' \" \b \f \t \n \r \v \a)
       produce one character,
     - any other escape is kept verbatim and counts as two characters. */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic with it. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5629
5630static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5631
5632PyObject *
5633PyUnicode_DecodeUnicodeEscape(const char *s,
5634                              Py_ssize_t size,
5635                              const char *errors)
5636{
5637    const char *starts = s;
5638    Py_ssize_t startinpos;
5639    Py_ssize_t endinpos;
5640    int j;
5641    PyObject *v;
5642    const char *end;
5643    char* message;
5644    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5645    PyObject *errorHandler = NULL;
5646    PyObject *exc = NULL;
5647    Py_ssize_t len;
5648    Py_ssize_t i;
5649
5650    len = length_of_escaped_ascii_string(s, size);
5651
5652    /* After length_of_escaped_ascii_string() there are two alternatives,
5653       either the string is pure ASCII with named escapes like \n, etc.
5654       and we determined it's exact size (common case)
5655       or it contains \x, \u, ... escape sequences.  then we create a
5656       legacy wchar string and resize it at the end of this function. */
5657    if (len >= 0) {
5658        v = PyUnicode_New(len, 127);
5659        if (!v)
5660            goto onError;
5661        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5662    }
5663    else {
5664        /* Escaped strings will always be longer than the resulting
5665           Unicode string, so we start with size here and then reduce the
5666           length after conversion to the true value.
5667           (but if the error callback returns a long replacement string
5668           we'll have to allocate more space) */
5669        v = PyUnicode_New(size, 127);
5670        if (!v)
5671            goto onError;
5672        len = size;
5673    }
5674
5675    if (size == 0)
5676        return v;
5677    i = 0;
5678    end = s + size;
5679
5680    while (s < end) {
5681        unsigned char c;
5682        Py_UCS4 x;
5683        int digits;
5684
5685        /* The only case in which i == ascii_length is a backslash
5686           followed by a newline. */
5687        assert(i <= len);
5688
5689        /* Non-escape characters are interpreted as Unicode ordinals */
5690        if (*s != '\\') {
5691            if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5692                goto onError;
5693            continue;
5694        }
5695
5696        startinpos = s-starts;
5697        /* \ - Escapes */
5698        s++;
5699        c = *s++;
5700        if (s > end)
5701            c = '\0'; /* Invalid after \ */
5702
5703        /* The only case in which i == ascii_length is a backslash
5704           followed by a newline. */
5705        assert(i < len || (i == len && c == '\n'));
5706
5707        switch (c) {
5708
5709            /* \x escapes */
5710#define WRITECHAR(ch)                                   \
5711            do {                                        \
5712                if (unicode_putchar(&v, &i, ch) < 0)    \
5713                    goto onError;                       \
5714            }while(0)
5715
5716        case '\n': break;
5717        case '\\': WRITECHAR('\\'); break;
5718        case '\'': WRITECHAR('\''); break;
5719        case '\"': WRITECHAR('\"'); break;
5720        case 'b': WRITECHAR('\b'); break;
5721        /* FF */
5722        case 'f': WRITECHAR('\014'); break;
5723        case 't': WRITECHAR('\t'); break;
5724        case 'n': WRITECHAR('\n'); break;
5725        case 'r': WRITECHAR('\r'); break;
5726        /* VT */
5727        case 'v': WRITECHAR('\013'); break;
5728        /* BEL, not classic C */
5729        case 'a': WRITECHAR('\007'); break;
5730
5731            /* \OOO (octal) escapes */
5732        case '0': case '1': case '2': case '3':
5733        case '4': case '5': case '6': case '7':
5734            x = s[-1] - '0';
5735            if (s < end && '0' <= *s && *s <= '7') {
5736                x = (x<<3) + *s++ - '0';
5737                if (s < end && '0' <= *s && *s <= '7')
5738                    x = (x<<3) + *s++ - '0';
5739            }
5740            WRITECHAR(x);
5741            break;
5742
5743            /* hex escapes */
5744            /* \xXX */
5745        case 'x':
5746            digits = 2;
5747            message = "truncated \\xXX escape";
5748            goto hexescape;
5749
5750            /* \uXXXX */
5751        case 'u':
5752            digits = 4;
5753            message = "truncated \\uXXXX escape";
5754            goto hexescape;
5755
5756            /* \UXXXXXXXX */
5757        case 'U':
5758            digits = 8;
5759            message = "truncated \\UXXXXXXXX escape";
5760        hexescape:
5761            chr = 0;
5762            if (s+digits>end) {
5763                endinpos = size;
5764                if (unicode_decode_call_errorhandler(
5765                        errors, &errorHandler,
5766                        "unicodeescape", "end of string in escape sequence",
5767                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5768                        &v, &i))
5769                    goto onError;
5770                goto nextByte;
5771            }
5772            for (j = 0; j < digits; ++j) {
5773                c = (unsigned char) s[j];
5774                if (!Py_ISXDIGIT(c)) {
5775                    endinpos = (s+j+1)-starts;
5776                    if (unicode_decode_call_errorhandler(
5777                            errors, &errorHandler,
5778                            "unicodeescape", message,
5779                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5780                            &v, &i))
5781                        goto onError;
5782                    len = PyUnicode_GET_LENGTH(v);
5783                    goto nextByte;
5784                }
5785                chr = (chr<<4) & ~0xF;
5786                if (c >= '0' && c <= '9')
5787                    chr += c - '0';
5788                else if (c >= 'a' && c <= 'f')
5789                    chr += 10 + c - 'a';
5790                else
5791                    chr += 10 + c - 'A';
5792            }
5793            s += j;
5794            if (chr == 0xffffffff && PyErr_Occurred())
5795                /* _decoding_error will have already written into the
5796                   target buffer. */
5797                break;
5798        store:
5799            /* when we get here, chr is a 32-bit unicode character */
5800            if (chr <= MAX_UNICODE) {
5801                WRITECHAR(chr);
5802            } else {
5803                endinpos = s-starts;
5804                if (unicode_decode_call_errorhandler(
5805                        errors, &errorHandler,
5806                        "unicodeescape", "illegal Unicode character",
5807                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5808                        &v, &i))
5809                    goto onError;
5810            }
5811            break;
5812
5813            /* \N{name} */
5814        case 'N':
5815            message = "malformed \\N character escape";
5816            if (ucnhash_CAPI == NULL) {
5817                /* load the unicode data module */
5818                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5819                                                PyUnicodeData_CAPSULE_NAME, 1);
5820                if (ucnhash_CAPI == NULL)
5821                    goto ucnhashError;
5822            }
5823            if (*s == '{') {
5824                const char *start = s+1;
5825                /* look for the closing brace */
5826                while (*s != '}' && s < end)
5827                    s++;
5828                if (s > start && s < end && *s == '}') {
5829                    /* found a name.  look it up in the unicode database */
5830                    message = "unknown Unicode character name";
5831                    s++;
5832                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5833                                              &chr, 0))
5834                        goto store;
5835                }
5836            }
5837            endinpos = s-starts;
5838            if (unicode_decode_call_errorhandler(
5839                    errors, &errorHandler,
5840                    "unicodeescape", message,
5841                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5842                    &v, &i))
5843                goto onError;
5844            break;
5845
5846        default:
5847            if (s > end) {
5848                message = "\\ at end of string";
5849                s--;
5850                endinpos = s-starts;
5851                if (unicode_decode_call_errorhandler(
5852                        errors, &errorHandler,
5853                        "unicodeescape", message,
5854                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5855                        &v, &i))
5856                    goto onError;
5857            }
5858            else {
5859                WRITECHAR('\\');
5860                WRITECHAR(s[-1]);
5861            }
5862            break;
5863        }
5864      nextByte:
5865        ;
5866    }
5867#undef WRITECHAR
5868
5869    if (PyUnicode_Resize(&v, i) < 0)
5870        goto onError;
5871    Py_XDECREF(errorHandler);
5872    Py_XDECREF(exc);
5873    return unicode_result(v);
5874
5875  ucnhashError:
5876    PyErr_SetString(
5877        PyExc_UnicodeError,
5878        "\\N escapes not supported (can't load unicodedata module)"
5879        );
5880    Py_XDECREF(v);
5881    Py_XDECREF(errorHandler);
5882    Py_XDECREF(exc);
5883    return NULL;
5884
5885  onError:
5886    Py_XDECREF(v);
5887    Py_XDECREF(errorHandler);
5888    Py_XDECREF(exc);
5889    return NULL;
5890}
5891
5892/* Return a Unicode-Escape string version of the Unicode object.
5893
5894   If quotes is true, the string is enclosed in u"" or u'' quotes as
5895   appropriate.
5896
5897*/
5898
5899PyObject *
5900PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5901{
5902    Py_ssize_t i, len;
5903    PyObject *repr;
5904    char *p;
5905    int kind;
5906    void *data;
5907    Py_ssize_t expandsize = 0;
5908
5909    /* Initial allocation is based on the longest-possible unichr
5910       escape.
5911
5912       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5913       unichr, so in this case it's the longest unichr escape. In
5914       narrow (UTF-16) builds this is five chars per source unichr
5915       since there are two unichrs in the surrogate pair, so in narrow
5916       (UTF-16) builds it's not the longest unichr escape.
5917
5918       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5919       so in the narrow (UTF-16) build case it's the longest unichr
5920       escape.
5921    */
5922
5923    if (!PyUnicode_Check(unicode)) {
5924        PyErr_BadArgument();
5925        return NULL;
5926    }
5927    if (PyUnicode_READY(unicode) < 0)
5928        return NULL;
5929    len = PyUnicode_GET_LENGTH(unicode);
5930    kind = PyUnicode_KIND(unicode);
5931    data = PyUnicode_DATA(unicode);
5932    switch(kind) {
5933    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5934    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5935    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5936    }
5937
5938    if (len == 0)
5939        return PyBytes_FromStringAndSize(NULL, 0);
5940
5941    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5942        return PyErr_NoMemory();
5943
5944    repr = PyBytes_FromStringAndSize(NULL,
5945                                     2
5946                                     + expandsize*len
5947                                     + 1);
5948    if (repr == NULL)
5949        return NULL;
5950
5951    p = PyBytes_AS_STRING(repr);
5952
5953    for (i = 0; i < len; i++) {
5954        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5955
5956        /* Escape backslashes */
5957        if (ch == '\\') {
5958            *p++ = '\\';
5959            *p++ = (char) ch;
5960            continue;
5961        }
5962
5963        /* Map 21-bit characters to '\U00xxxxxx' */
5964        else if (ch >= 0x10000) {
5965            assert(ch <= MAX_UNICODE);
5966            *p++ = '\\';
5967            *p++ = 'U';
5968            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5969            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5970            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5971            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5972            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5973            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5974            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5975            *p++ = Py_hexdigits[ch & 0x0000000F];
5976            continue;
5977        }
5978
5979        /* Map 16-bit characters to '\uxxxx' */
5980        if (ch >= 256) {
5981            *p++ = '\\';
5982            *p++ = 'u';
5983            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5984            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5985            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5986            *p++ = Py_hexdigits[ch & 0x000F];
5987        }
5988
5989        /* Map special whitespace to '\t', \n', '\r' */
5990        else if (ch == '\t') {
5991            *p++ = '\\';
5992            *p++ = 't';
5993        }
5994        else if (ch == '\n') {
5995            *p++ = '\\';
5996            *p++ = 'n';
5997        }
5998        else if (ch == '\r') {
5999            *p++ = '\\';
6000            *p++ = 'r';
6001        }
6002
6003        /* Map non-printable US ASCII to '\xhh' */
6004        else if (ch < ' ' || ch >= 0x7F) {
6005            *p++ = '\\';
6006            *p++ = 'x';
6007            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6008            *p++ = Py_hexdigits[ch & 0x000F];
6009        }
6010
6011        /* Copy everything else as-is */
6012        else
6013            *p++ = (char) ch;
6014    }
6015
6016    assert(p - PyBytes_AS_STRING(repr) > 0);
6017    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6018        return NULL;
6019    return repr;
6020}
6021
6022PyObject *
6023PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6024                              Py_ssize_t size)
6025{
6026    PyObject *result;
6027    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6028    if (tmp == NULL)
6029        return NULL;
6030    result = PyUnicode_AsUnicodeEscapeString(tmp);
6031    Py_DECREF(tmp);
6032    return result;
6033}
6034
6035/* --- Raw Unicode Escape Codec ------------------------------------------- */
6036
6037PyObject *
6038PyUnicode_DecodeRawUnicodeEscape(const char *s,
6039                                 Py_ssize_t size,
6040                                 const char *errors)
6041{
6042    const char *starts = s;
6043    Py_ssize_t startinpos;
6044    Py_ssize_t endinpos;
6045    Py_ssize_t outpos;
6046    PyObject *v;
6047    const char *end;
6048    const char *bs;
6049    PyObject *errorHandler = NULL;
6050    PyObject *exc = NULL;
6051
6052    /* Escaped strings will always be longer than the resulting
6053       Unicode string, so we start with size here and then reduce the
6054       length after conversion to the true value. (But decoding error
6055       handler might have to resize the string) */
6056    v = PyUnicode_New(size, 127);
6057    if (v == NULL)
6058        goto onError;
6059    if (size == 0)
6060        return v;
6061    outpos = 0;
6062    end = s + size;
6063    while (s < end) {
6064        unsigned char c;
6065        Py_UCS4 x;
6066        int i;
6067        int count;
6068
6069        /* Non-escape characters are interpreted as Unicode ordinals */
6070        if (*s != '\\') {
6071            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6072                goto onError;
6073            continue;
6074        }
6075        startinpos = s-starts;
6076
6077        /* \u-escapes are only interpreted iff the number of leading
6078           backslashes if odd */
6079        bs = s;
6080        for (;s < end;) {
6081            if (*s != '\\')
6082                break;
6083            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6084                goto onError;
6085        }
6086        if (((s - bs) & 1) == 0 ||
6087            s >= end ||
6088            (*s != 'u' && *s != 'U')) {
6089            continue;
6090        }
6091        outpos--;
6092        count = *s=='u' ? 4 : 8;
6093        s++;
6094
6095        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6096        for (x = 0, i = 0; i < count; ++i, ++s) {
6097            c = (unsigned char)*s;
6098            if (!Py_ISXDIGIT(c)) {
6099                endinpos = s-starts;
6100                if (unicode_decode_call_errorhandler(
6101                        errors, &errorHandler,
6102                        "rawunicodeescape", "truncated \\uXXXX",
6103                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6104                        &v, &outpos))
6105                    goto onError;
6106                goto nextByte;
6107            }
6108            x = (x<<4) & ~0xF;
6109            if (c >= '0' && c <= '9')
6110                x += c - '0';
6111            else if (c >= 'a' && c <= 'f')
6112                x += 10 + c - 'a';
6113            else
6114                x += 10 + c - 'A';
6115        }
6116        if (x <= MAX_UNICODE) {
6117            if (unicode_putchar(&v, &outpos, x) < 0)
6118                goto onError;
6119        } else {
6120            endinpos = s-starts;
6121            if (unicode_decode_call_errorhandler(
6122                    errors, &errorHandler,
6123                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6124                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6125                    &v, &outpos))
6126                goto onError;
6127        }
6128      nextByte:
6129        ;
6130    }
6131    if (PyUnicode_Resize(&v, outpos) < 0)
6132        goto onError;
6133    Py_XDECREF(errorHandler);
6134    Py_XDECREF(exc);
6135    return unicode_result(v);
6136
6137  onError:
6138    Py_XDECREF(v);
6139    Py_XDECREF(errorHandler);
6140    Py_XDECREF(exc);
6141    return NULL;
6142}
6143
6144
6145PyObject *
6146PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6147{
6148    PyObject *repr;
6149    char *p;
6150    char *q;
6151    Py_ssize_t expandsize, pos;
6152    int kind;
6153    void *data;
6154    Py_ssize_t len;
6155
6156    if (!PyUnicode_Check(unicode)) {
6157        PyErr_BadArgument();
6158        return NULL;
6159    }
6160    if (PyUnicode_READY(unicode) < 0)
6161        return NULL;
6162    kind = PyUnicode_KIND(unicode);
6163    data = PyUnicode_DATA(unicode);
6164    len = PyUnicode_GET_LENGTH(unicode);
6165    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6166       bytes, and 1 byte characters 4. */
6167    expandsize = kind * 2 + 2;
6168
6169    if (len > PY_SSIZE_T_MAX / expandsize)
6170        return PyErr_NoMemory();
6171
6172    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6173    if (repr == NULL)
6174        return NULL;
6175    if (len == 0)
6176        return repr;
6177
6178    p = q = PyBytes_AS_STRING(repr);
6179    for (pos = 0; pos < len; pos++) {
6180        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6181        /* Map 32-bit characters to '\Uxxxxxxxx' */
6182        if (ch >= 0x10000) {
6183            assert(ch <= MAX_UNICODE);
6184            *p++ = '\\';
6185            *p++ = 'U';
6186            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6187            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6188            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6189            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6190            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6191            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6192            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6193            *p++ = Py_hexdigits[ch & 15];
6194        }
6195        /* Map 16-bit characters to '\uxxxx' */
6196        else if (ch >= 256) {
6197            *p++ = '\\';
6198            *p++ = 'u';
6199            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6200            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6201            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6202            *p++ = Py_hexdigits[ch & 15];
6203        }
6204        /* Copy everything else as-is */
6205        else
6206            *p++ = (char) ch;
6207    }
6208
6209    assert(p > q);
6210    if (_PyBytes_Resize(&repr, p - q) < 0)
6211        return NULL;
6212    return repr;
6213}
6214
6215PyObject *
6216PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6217                                 Py_ssize_t size)
6218{
6219    PyObject *result;
6220    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6221    if (tmp == NULL)
6222        return NULL;
6223    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6224    Py_DECREF(tmp);
6225    return result;
6226}
6227
6228/* --- Unicode Internal Codec ------------------------------------------- */
6229
6230PyObject *
6231_PyUnicode_DecodeUnicodeInternal(const char *s,
6232                                 Py_ssize_t size,
6233                                 const char *errors)
6234{
6235    const char *starts = s;
6236    Py_ssize_t startinpos;
6237    Py_ssize_t endinpos;
6238    Py_ssize_t outpos;
6239    PyObject *v;
6240    const char *end;
6241    const char *reason;
6242    PyObject *errorHandler = NULL;
6243    PyObject *exc = NULL;
6244
6245    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6246                     "unicode_internal codec has been deprecated",
6247                     1))
6248        return NULL;
6249
6250    /* XXX overflow detection missing */
6251    v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
6252    if (v == NULL)
6253        goto onError;
6254    if (PyUnicode_GET_LENGTH(v) == 0)
6255        return v;
6256    outpos = 0;
6257    end = s + size;
6258
6259    while (s < end) {
6260        Py_UNICODE uch;
6261        Py_UCS4 ch;
6262        /* We copy the raw representation one byte at a time because the
6263           pointer may be unaligned (see test_codeccallbacks). */
6264        ((char *) &uch)[0] = s[0];
6265        ((char *) &uch)[1] = s[1];
6266#ifdef Py_UNICODE_WIDE
6267        ((char *) &uch)[2] = s[2];
6268        ((char *) &uch)[3] = s[3];
6269#endif
6270        ch = uch;
6271
6272        /* We have to sanity check the raw data, otherwise doom looms for
6273           some malformed UCS-4 data. */
6274        if (
6275#ifdef Py_UNICODE_WIDE
6276            ch > 0x10ffff ||
6277#endif
6278            end-s < Py_UNICODE_SIZE
6279            )
6280        {
6281            startinpos = s - starts;
6282            if (end-s < Py_UNICODE_SIZE) {
6283                endinpos = end-starts;
6284                reason = "truncated input";
6285            }
6286            else {
6287                endinpos = s - starts + Py_UNICODE_SIZE;
6288                reason = "illegal code point (> 0x10FFFF)";
6289            }
6290            if (unicode_decode_call_errorhandler(
6291                    errors, &errorHandler,
6292                    "unicode_internal", reason,
6293                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6294                    &v, &outpos))
6295                goto onError;
6296            continue;
6297        }
6298
6299        s += Py_UNICODE_SIZE;
6300#ifndef Py_UNICODE_WIDE
6301        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
6302        {
6303            Py_UNICODE uch2;
6304            ((char *) &uch2)[0] = s[0];
6305            ((char *) &uch2)[1] = s[1];
6306            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6307            {
6308                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6309                s += Py_UNICODE_SIZE;
6310            }
6311        }
6312#endif
6313
6314        if (unicode_putchar(&v, &outpos, ch) < 0)
6315            goto onError;
6316    }
6317
6318    if (PyUnicode_Resize(&v, outpos) < 0)
6319        goto onError;
6320    Py_XDECREF(errorHandler);
6321    Py_XDECREF(exc);
6322    return unicode_result(v);
6323
6324  onError:
6325    Py_XDECREF(v);
6326    Py_XDECREF(errorHandler);
6327    Py_XDECREF(exc);
6328    return NULL;
6329}
6330
6331/* --- Latin-1 Codec ------------------------------------------------------ */
6332
6333PyObject *
6334PyUnicode_DecodeLatin1(const char *s,
6335                       Py_ssize_t size,
6336                       const char *errors)
6337{
6338    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6339    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6340}
6341
6342/* create or adjust a UnicodeEncodeError */
6343static void
6344make_encode_exception(PyObject **exceptionObject,
6345                      const char *encoding,
6346                      PyObject *unicode,
6347                      Py_ssize_t startpos, Py_ssize_t endpos,
6348                      const char *reason)
6349{
6350    if (*exceptionObject == NULL) {
6351        *exceptionObject = PyObject_CallFunction(
6352            PyExc_UnicodeEncodeError, "sOnns",
6353            encoding, unicode, startpos, endpos, reason);
6354    }
6355    else {
6356        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6357            goto onError;
6358        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6359            goto onError;
6360        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6361            goto onError;
6362        return;
6363      onError:
6364        Py_DECREF(*exceptionObject);
6365        *exceptionObject = NULL;
6366    }
6367}
6368
6369/* raises a UnicodeEncodeError */
6370static void
6371raise_encode_exception(PyObject **exceptionObject,
6372                       const char *encoding,
6373                       PyObject *unicode,
6374                       Py_ssize_t startpos, Py_ssize_t endpos,
6375                       const char *reason)
6376{
6377    make_encode_exception(exceptionObject,
6378                          encoding, unicode, startpos, endpos, reason);
6379    if (*exceptionObject != NULL)
6380        PyCodec_StrictErrors(*exceptionObject);
6381}
6382
6383/* error handling callback helper:
6384   build arguments, call the callback and check the arguments,
6385   put the result into newpos and return the replacement string, which
6386   has to be freed by the caller */
6387static PyObject *
6388unicode_encode_call_errorhandler(const char *errors,
6389                                 PyObject **errorHandler,
6390                                 const char *encoding, const char *reason,
6391                                 PyObject *unicode, PyObject **exceptionObject,
6392                                 Py_ssize_t startpos, Py_ssize_t endpos,
6393                                 Py_ssize_t *newpos)
6394{
6395    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6396    Py_ssize_t len;
6397    PyObject *restuple;
6398    PyObject *resunicode;
6399
6400    if (*errorHandler == NULL) {
6401        *errorHandler = PyCodec_LookupError(errors);
6402        if (*errorHandler == NULL)
6403            return NULL;
6404    }
6405
6406    if (PyUnicode_READY(unicode) < 0)
6407        return NULL;
6408    len = PyUnicode_GET_LENGTH(unicode);
6409
6410    make_encode_exception(exceptionObject,
6411                          encoding, unicode, startpos, endpos, reason);
6412    if (*exceptionObject == NULL)
6413        return NULL;
6414
6415    restuple = PyObject_CallFunctionObjArgs(
6416        *errorHandler, *exceptionObject, NULL);
6417    if (restuple == NULL)
6418        return NULL;
6419    if (!PyTuple_Check(restuple)) {
6420        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6421        Py_DECREF(restuple);
6422        return NULL;
6423    }
6424    if (!PyArg_ParseTuple(restuple, argparse,
6425                          &resunicode, newpos)) {
6426        Py_DECREF(restuple);
6427        return NULL;
6428    }
6429    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6430        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6431        Py_DECREF(restuple);
6432        return NULL;
6433    }
6434    if (*newpos<0)
6435        *newpos = len + *newpos;
6436    if (*newpos<0 || *newpos>len) {
6437        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6438        Py_DECREF(restuple);
6439        return NULL;
6440    }
6441    Py_INCREF(resunicode);
6442    Py_DECREF(restuple);
6443    return resunicode;
6444}
6445
6446static PyObject *
6447unicode_encode_ucs1(PyObject *unicode,
6448                    const char *errors,
6449                    unsigned int limit)
6450{
6451    /* input state */
6452    Py_ssize_t pos=0, size;
6453    int kind;
6454    void *data;
6455    /* output object */
6456    PyObject *res;
6457    /* pointer into the output */
6458    char *str;
6459    /* current output position */
6460    Py_ssize_t ressize;
6461    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6462    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6463    PyObject *errorHandler = NULL;
6464    PyObject *exc = NULL;
6465    /* the following variable is used for caching string comparisons
6466     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6467    int known_errorHandler = -1;
6468
6469    if (PyUnicode_READY(unicode) < 0)
6470        return NULL;
6471    size = PyUnicode_GET_LENGTH(unicode);
6472    kind = PyUnicode_KIND(unicode);
6473    data = PyUnicode_DATA(unicode);
6474    /* allocate enough for a simple encoding without
6475       replacements, if we need more, we'll resize */
6476    if (size == 0)
6477        return PyBytes_FromStringAndSize(NULL, 0);
6478    res = PyBytes_FromStringAndSize(NULL, size);
6479    if (res == NULL)
6480        return NULL;
6481    str = PyBytes_AS_STRING(res);
6482    ressize = size;
6483
6484    while (pos < size) {
6485        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6486
6487        /* can we encode this? */
6488        if (c<limit) {
6489            /* no overflow check, because we know that the space is enough */
6490            *str++ = (char)c;
6491            ++pos;
6492        }
6493        else {
6494            Py_ssize_t requiredsize;
6495            PyObject *repunicode;
6496            Py_ssize_t repsize, newpos, respos, i;
6497            /* startpos for collecting unencodable chars */
6498            Py_ssize_t collstart = pos;
6499            Py_ssize_t collend = pos;
6500            /* find all unecodable characters */
6501            while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6502                ++collend;
6503            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6504            if (known_errorHandler==-1) {
6505                if ((errors==NULL) || (!strcmp(errors, "strict")))
6506                    known_errorHandler = 1;
6507                else if (!strcmp(errors, "replace"))
6508                    known_errorHandler = 2;
6509                else if (!strcmp(errors, "ignore"))
6510                    known_errorHandler = 3;
6511                else if (!strcmp(errors, "xmlcharrefreplace"))
6512                    known_errorHandler = 4;
6513                else
6514                    known_errorHandler = 0;
6515            }
6516            switch (known_errorHandler) {
6517            case 1: /* strict */
6518                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6519                goto onError;
6520            case 2: /* replace */
6521                while (collstart++<collend)
6522                    *str++ = '?'; /* fall through */
6523            case 3: /* ignore */
6524                pos = collend;
6525                break;
6526            case 4: /* xmlcharrefreplace */
6527                respos = str - PyBytes_AS_STRING(res);
6528                /* determine replacement size */
6529                for (i = collstart, repsize = 0; i < collend; ++i) {
6530                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6531                    if (ch < 10)
6532                        repsize += 2+1+1;
6533                    else if (ch < 100)
6534                        repsize += 2+2+1;
6535                    else if (ch < 1000)
6536                        repsize += 2+3+1;
6537                    else if (ch < 10000)
6538                        repsize += 2+4+1;
6539                    else if (ch < 100000)
6540                        repsize += 2+5+1;
6541                    else if (ch < 1000000)
6542                        repsize += 2+6+1;
6543                    else {
6544                        assert(ch <= MAX_UNICODE);
6545                        repsize += 2+7+1;
6546                    }
6547                }
6548                requiredsize = respos+repsize+(size-collend);
6549                if (requiredsize > ressize) {
6550                    if (requiredsize<2*ressize)
6551                        requiredsize = 2*ressize;
6552                    if (_PyBytes_Resize(&res, requiredsize))
6553                        goto onError;
6554                    str = PyBytes_AS_STRING(res) + respos;
6555                    ressize = requiredsize;
6556                }
6557                /* generate replacement */
6558                for (i = collstart; i < collend; ++i) {
6559                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6560                }
6561                pos = collend;
6562                break;
6563            default:
6564                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6565                                                              encoding, reason, unicode, &exc,
6566                                                              collstart, collend, &newpos);
6567                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6568                                           PyUnicode_READY(repunicode) < 0))
6569                    goto onError;
6570                if (PyBytes_Check(repunicode)) {
6571                    /* Directly copy bytes result to output. */
6572                    repsize = PyBytes_Size(repunicode);
6573                    if (repsize > 1) {
6574                        /* Make room for all additional bytes. */
6575                        respos = str - PyBytes_AS_STRING(res);
6576                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6577                            Py_DECREF(repunicode);
6578                            goto onError;
6579                        }
6580                        str = PyBytes_AS_STRING(res) + respos;
6581                        ressize += repsize-1;
6582                    }
6583                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6584                    str += repsize;
6585                    pos = newpos;
6586                    Py_DECREF(repunicode);
6587                    break;
6588                }
6589                /* need more space? (at least enough for what we
6590                   have+the replacement+the rest of the string, so
6591                   we won't have to check space for encodable characters) */
6592                respos = str - PyBytes_AS_STRING(res);
6593                repsize = PyUnicode_GET_LENGTH(repunicode);
6594                requiredsize = respos+repsize+(size-collend);
6595                if (requiredsize > ressize) {
6596                    if (requiredsize<2*ressize)
6597                        requiredsize = 2*ressize;
6598                    if (_PyBytes_Resize(&res, requiredsize)) {
6599                        Py_DECREF(repunicode);
6600                        goto onError;
6601                    }
6602                    str = PyBytes_AS_STRING(res) + respos;
6603                    ressize = requiredsize;
6604                }
6605                /* check if there is anything unencodable in the replacement
6606                   and copy it to the output */
6607                for (i = 0; repsize-->0; ++i, ++str) {
6608                    c = PyUnicode_READ_CHAR(repunicode, i);
6609                    if (c >= limit) {
6610                        raise_encode_exception(&exc, encoding, unicode,
6611                                               pos, pos+1, reason);
6612                        Py_DECREF(repunicode);
6613                        goto onError;
6614                    }
6615                    *str = (char)c;
6616                }
6617                pos = newpos;
6618                Py_DECREF(repunicode);
6619            }
6620        }
6621    }
6622    /* Resize if we allocated to much */
6623    size = str - PyBytes_AS_STRING(res);
6624    if (size < ressize) { /* If this falls res will be NULL */
6625        assert(size >= 0);
6626        if (_PyBytes_Resize(&res, size) < 0)
6627            goto onError;
6628    }
6629
6630    Py_XDECREF(errorHandler);
6631    Py_XDECREF(exc);
6632    return res;
6633
6634  onError:
6635    Py_XDECREF(res);
6636    Py_XDECREF(errorHandler);
6637    Py_XDECREF(exc);
6638    return NULL;
6639}
6640
6641/* Deprecated */
6642PyObject *
6643PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6644                       Py_ssize_t size,
6645                       const char *errors)
6646{
6647    PyObject *result;
6648    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6649    if (unicode == NULL)
6650        return NULL;
6651    result = unicode_encode_ucs1(unicode, errors, 256);
6652    Py_DECREF(unicode);
6653    return result;
6654}
6655
6656PyObject *
6657_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6658{
6659    if (!PyUnicode_Check(unicode)) {
6660        PyErr_BadArgument();
6661        return NULL;
6662    }
6663    if (PyUnicode_READY(unicode) == -1)
6664        return NULL;
6665    /* Fast path: if it is a one-byte string, construct
6666       bytes object directly. */
6667    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6668        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6669                                         PyUnicode_GET_LENGTH(unicode));
6670    /* Non-Latin-1 characters present. Defer to above function to
6671       raise the exception. */
6672    return unicode_encode_ucs1(unicode, errors, 256);
6673}
6674
6675PyObject*
6676PyUnicode_AsLatin1String(PyObject *unicode)
6677{
6678    return _PyUnicode_AsLatin1String(unicode, NULL);
6679}
6680
6681/* --- 7-bit ASCII Codec -------------------------------------------------- */
6682
6683PyObject *
6684PyUnicode_DecodeASCII(const char *s,
6685                      Py_ssize_t size,
6686                      const char *errors)
6687{
6688    const char *starts = s;
6689    PyObject *v;
6690    int kind;
6691    void *data;
6692    Py_ssize_t startinpos;
6693    Py_ssize_t endinpos;
6694    Py_ssize_t outpos;
6695    const char *e;
6696    int has_error;
6697    const unsigned char *p = (const unsigned char *)s;
6698    const unsigned char *end = p + size;
6699    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
6700    PyObject *errorHandler = NULL;
6701    PyObject *exc = NULL;
6702
6703    if (size == 0) {
6704        Py_INCREF(unicode_empty);
6705        return unicode_empty;
6706    }
6707
6708    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6709    if (size == 1 && (unsigned char)s[0] < 128)
6710        return get_latin1_char((unsigned char)s[0]);
6711
6712    has_error = 0;
6713    while (p < end && !has_error) {
6714        /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6715           an explanation. */
6716        if (!((size_t) p & LONG_PTR_MASK)) {
6717            /* Help register allocation */
6718            register const unsigned char *_p = p;
6719            while (_p < aligned_end) {
6720                unsigned long value = *(unsigned long *) _p;
6721                if (value & ASCII_CHAR_MASK) {
6722                    has_error = 1;
6723                    break;
6724                }
6725                _p += SIZEOF_LONG;
6726            }
6727            if (_p == end)
6728                break;
6729            if (has_error)
6730                break;
6731            p = _p;
6732        }
6733        if (*p & 0x80) {
6734            has_error = 1;
6735            break;
6736        }
6737        else {
6738            ++p;
6739        }
6740    }
6741    if (!has_error)
6742        return unicode_fromascii((const unsigned char *)s, size);
6743
6744    v = PyUnicode_New(size, 127);
6745    if (v == NULL)
6746        goto onError;
6747    if (size == 0)
6748        return v;
6749    kind = PyUnicode_KIND(v);
6750    data = PyUnicode_DATA(v);
6751    outpos = 0;
6752    e = s + size;
6753    while (s < e) {
6754        register unsigned char c = (unsigned char)*s;
6755        if (c < 128) {
6756            PyUnicode_WRITE(kind, data, outpos++, c);
6757            ++s;
6758        }
6759        else {
6760            startinpos = s-starts;
6761            endinpos = startinpos + 1;
6762            if (unicode_decode_call_errorhandler(
6763                    errors, &errorHandler,
6764                    "ascii", "ordinal not in range(128)",
6765                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6766                    &v, &outpos))
6767                goto onError;
6768            kind = PyUnicode_KIND(v);
6769            data = PyUnicode_DATA(v);
6770        }
6771    }
6772    if (PyUnicode_Resize(&v, outpos) < 0)
6773        goto onError;
6774    Py_XDECREF(errorHandler);
6775    Py_XDECREF(exc);
6776    assert(_PyUnicode_CheckConsistency(v, 1));
6777    return v;
6778
6779  onError:
6780    Py_XDECREF(v);
6781    Py_XDECREF(errorHandler);
6782    Py_XDECREF(exc);
6783    return NULL;
6784}
6785
6786/* Deprecated */
6787PyObject *
6788PyUnicode_EncodeASCII(const Py_UNICODE *p,
6789                      Py_ssize_t size,
6790                      const char *errors)
6791{
6792    PyObject *result;
6793    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6794    if (unicode == NULL)
6795        return NULL;
6796    result = unicode_encode_ucs1(unicode, errors, 128);
6797    Py_DECREF(unicode);
6798    return result;
6799}
6800
6801PyObject *
6802_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6803{
6804    if (!PyUnicode_Check(unicode)) {
6805        PyErr_BadArgument();
6806        return NULL;
6807    }
6808    if (PyUnicode_READY(unicode) == -1)
6809        return NULL;
6810    /* Fast path: if it is an ASCII-only string, construct bytes object
6811       directly. Else defer to above function to raise the exception. */
6812    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6813        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6814                                         PyUnicode_GET_LENGTH(unicode));
6815    return unicode_encode_ucs1(unicode, errors, 128);
6816}
6817
6818PyObject *
6819PyUnicode_AsASCIIString(PyObject *unicode)
6820{
6821    return _PyUnicode_AsASCIIString(unicode, NULL);
6822}
6823
6824#ifdef HAVE_MBCS
6825
6826/* --- MBCS codecs for Windows -------------------------------------------- */
6827
/* NEED_RETRY is defined when int is narrower than size_t.  The code page
   decoder below then processes inputs longer than INT_MAX in several
   chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback value, used only when the included headers do not already define
   WC_ERR_INVALID_CHARS (presumably older Windows SDK headers). */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6835
6836static char*
6837code_page_name(UINT code_page, PyObject **obj)
6838{
6839    *obj = NULL;
6840    if (code_page == CP_ACP)
6841        return "mbcs";
6842    if (code_page == CP_UTF7)
6843        return "CP_UTF7";
6844    if (code_page == CP_UTF8)
6845        return "CP_UTF8";
6846
6847    *obj = PyBytes_FromFormat("cp%u", code_page);
6848    if (*obj == NULL)
6849        return NULL;
6850    return PyBytes_AS_STRING(*obj);
6851}
6852
6853static int
6854is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6855{
6856    const char *curr = s + offset;
6857    const char *prev;
6858
6859    if (!IsDBCSLeadByteEx(code_page, *curr))
6860        return 0;
6861
6862    prev = CharPrevExA(code_page, s, curr, 0);
6863    if (prev == curr)
6864        return 1;
6865    /* FIXME: This code is limited to "true" double-byte encodings,
6866       as it assumes an incomplete character consists of a single
6867       byte. */
6868    if (curr - prev == 2)
6869        return 1;
6870    if (!IsDBCSLeadByteEx(code_page, *prev))
6871        return 1;
6872    return 0;
6873}
6874
6875static DWORD
6876decode_code_page_flags(UINT code_page)
6877{
6878    if (code_page == CP_UTF7) {
6879        /* The CP_UTF7 decoder only supports flags=0 */
6880        return 0;
6881    }
6882    else
6883        return MB_ERR_INVALID_CHARS;
6884}
6885
6886/*
6887 * Decode a byte string from a Windows code page into unicode object in strict
6888 * mode.
6889 *
6890 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6891 * WindowsError and returns -1 on other error.
6892 */
6893static int
6894decode_code_page_strict(UINT code_page,
6895                        PyObject **v,
6896                        const char *in,
6897                        int insize)
6898{
6899    const DWORD flags = decode_code_page_flags(code_page);
6900    wchar_t *out;
6901    DWORD outsize;
6902
6903    /* First get the size of the result */
6904    assert(insize > 0);
6905    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6906    if (outsize <= 0)
6907        goto error;
6908
6909    if (*v == NULL) {
6910        /* Create unicode object */
6911        *v = (PyObject*)_PyUnicode_New(outsize);
6912        if (*v == NULL)
6913            return -1;
6914        out = PyUnicode_AS_UNICODE(*v);
6915    }
6916    else {
6917        /* Extend unicode object */
6918        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6919        if (PyUnicode_Resize(v, n + outsize) < 0)
6920            return -1;
6921        out = PyUnicode_AS_UNICODE(*v) + n;
6922    }
6923
6924    /* Do the conversion */
6925    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6926    if (outsize <= 0)
6927        goto error;
6928    return insize;
6929
6930error:
6931    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6932        return -2;
6933    PyErr_SetFromWindowsErr(0);
6934    return -1;
6935}
6936
6937/*
6938 * Decode a byte string from a code page into unicode object with an error
6939 * handler.
6940 *
6941 * Returns consumed size if succeed, or raise a WindowsError or
6942 * UnicodeDecodeError exception and returns -1 on error.
6943 */
6944static int
6945decode_code_page_errors(UINT code_page,
6946                        PyObject **v,
6947                        const char *in, const int size,
6948                        const char *errors)
6949{
6950    const char *startin = in;
6951    const char *endin = in + size;
6952    const DWORD flags = decode_code_page_flags(code_page);
6953    /* Ideally, we should get reason from FormatMessage. This is the Windows
6954       2000 English version of the message. */
6955    const char *reason = "No mapping for the Unicode character exists "
6956                         "in the target code page.";
6957    /* each step cannot decode more than 1 character, but a character can be
6958       represented as a surrogate pair */
6959    wchar_t buffer[2], *startout, *out;
6960    int insize, outsize;
6961    PyObject *errorHandler = NULL;
6962    PyObject *exc = NULL;
6963    PyObject *encoding_obj = NULL;
6964    char *encoding;
6965    DWORD err;
6966    int ret = -1;
6967
6968    assert(size > 0);
6969
6970    encoding = code_page_name(code_page, &encoding_obj);
6971    if (encoding == NULL)
6972        return -1;
6973
6974    if (errors == NULL || strcmp(errors, "strict") == 0) {
6975        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6976           UnicodeDecodeError. */
6977        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6978        if (exc != NULL) {
6979            PyCodec_StrictErrors(exc);
6980            Py_CLEAR(exc);
6981        }
6982        goto error;
6983    }
6984
6985    if (*v == NULL) {
6986        /* Create unicode object */
6987        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6988            PyErr_NoMemory();
6989            goto error;
6990        }
6991        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6992        if (*v == NULL)
6993            goto error;
6994        startout = PyUnicode_AS_UNICODE(*v);
6995    }
6996    else {
6997        /* Extend unicode object */
6998        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6999        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7000            PyErr_NoMemory();
7001            goto error;
7002        }
7003        if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7004            goto error;
7005        startout = PyUnicode_AS_UNICODE(*v) + n;
7006    }
7007
7008    /* Decode the byte string character per character */
7009    out = startout;
7010    while (in < endin)
7011    {
7012        /* Decode a character */
7013        insize = 1;
7014        do
7015        {
7016            outsize = MultiByteToWideChar(code_page, flags,
7017                                          in, insize,
7018                                          buffer, Py_ARRAY_LENGTH(buffer));
7019            if (outsize > 0)
7020                break;
7021            err = GetLastError();
7022            if (err != ERROR_NO_UNICODE_TRANSLATION
7023                && err != ERROR_INSUFFICIENT_BUFFER)
7024            {
7025                PyErr_SetFromWindowsErr(0);
7026                goto error;
7027            }
7028            insize++;
7029        }
7030        /* 4=maximum length of a UTF-8 sequence */
7031        while (insize <= 4 && (in + insize) <= endin);
7032
7033        if (outsize <= 0) {
7034            Py_ssize_t startinpos, endinpos, outpos;
7035
7036            startinpos = in - startin;
7037            endinpos = startinpos + 1;
7038            outpos = out - PyUnicode_AS_UNICODE(*v);
7039            if (unicode_decode_call_errorhandler(
7040                    errors, &errorHandler,
7041                    encoding, reason,
7042                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7043                    v, &outpos))
7044            {
7045                goto error;
7046            }
7047            out = PyUnicode_AS_UNICODE(*v) + outpos;
7048        }
7049        else {
7050            in += insize;
7051            memcpy(out, buffer, outsize * sizeof(wchar_t));
7052            out += outsize;
7053        }
7054    }
7055
7056    /* write a NUL character at the end */
7057    *out = 0;
7058
7059    /* Extend unicode object */
7060    outsize = out - startout;
7061    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7062    if (PyUnicode_Resize(v, outsize) < 0)
7063        goto error;
7064    ret = size;
7065
7066error:
7067    Py_XDECREF(encoding_obj);
7068    Py_XDECREF(errorHandler);
7069    Py_XDECREF(exc);
7070    return ret;
7071}
7072
7073static PyObject *
7074decode_code_page_stateful(int code_page,
7075                          const char *s, Py_ssize_t size,
7076                          const char *errors, Py_ssize_t *consumed)
7077{
7078    PyObject *v = NULL;
7079    int chunk_size, final, converted, done;
7080
7081    if (code_page < 0) {
7082        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7083        return NULL;
7084    }
7085
7086    if (consumed)
7087        *consumed = 0;
7088
7089    do
7090    {
7091#ifdef NEED_RETRY
7092        if (size > INT_MAX) {
7093            chunk_size = INT_MAX;
7094            final = 0;
7095            done = 0;
7096        }
7097        else
7098#endif
7099        {
7100            chunk_size = (int)size;
7101            final = (consumed == NULL);
7102            done = 1;
7103        }
7104
7105        /* Skip trailing lead-byte unless 'final' is set */
7106        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7107            --chunk_size;
7108
7109        if (chunk_size == 0 && done) {
7110            if (v != NULL)
7111                break;
7112            Py_INCREF(unicode_empty);
7113            return unicode_empty;
7114        }
7115
7116
7117        converted = decode_code_page_strict(code_page, &v,
7118                                            s, chunk_size);
7119        if (converted == -2)
7120            converted = decode_code_page_errors(code_page, &v,
7121                                                s, chunk_size,
7122                                                errors);
7123        assert(converted != 0);
7124
7125        if (converted < 0) {
7126            Py_XDECREF(v);
7127            return NULL;
7128        }
7129
7130        if (consumed)
7131            *consumed += converted;
7132
7133        s += converted;
7134        size -= converted;
7135    } while (!done);
7136
7137    return unicode_result(v);
7138}
7139
7140PyObject *
7141PyUnicode_DecodeCodePageStateful(int code_page,
7142                                 const char *s,
7143                                 Py_ssize_t size,
7144                                 const char *errors,
7145                                 Py_ssize_t *consumed)
7146{
7147    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7148}
7149
7150PyObject *
7151PyUnicode_DecodeMBCSStateful(const char *s,
7152                             Py_ssize_t size,
7153                             const char *errors,
7154                             Py_ssize_t *consumed)
7155{
7156    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7157}
7158
7159PyObject *
7160PyUnicode_DecodeMBCS(const char *s,
7161                     Py_ssize_t size,
7162                     const char *errors)
7163{
7164    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7165}
7166
7167static DWORD
7168encode_code_page_flags(UINT code_page, const char *errors)
7169{
7170    if (code_page == CP_UTF8) {
7171        if (winver.dwMajorVersion >= 6)
7172            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7173               and later */
7174            return WC_ERR_INVALID_CHARS;
7175        else
7176            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7177            return 0;
7178    }
7179    else if (code_page == CP_UTF7) {
7180        /* CP_UTF7 only supports flags=0 */
7181        return 0;
7182    }
7183    else {
7184        if (errors != NULL && strcmp(errors, "replace") == 0)
7185            return 0;
7186        else
7187            return WC_NO_BEST_FIT_CHARS;
7188    }
7189}
7190
7191/*
7192 * Encode a Unicode string to a Windows code page into a byte string in strict
7193 * mode.
7194 *
7195 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7196 * a WindowsError and returns -1 on other error.
7197 */
7198static int
7199encode_code_page_strict(UINT code_page, PyObject **outbytes,
7200                        PyObject *unicode, Py_ssize_t offset, int len,
7201                        const char* errors)
7202{
7203    BOOL usedDefaultChar = FALSE;
7204    BOOL *pusedDefaultChar = &usedDefaultChar;
7205    int outsize;
7206    PyObject *exc = NULL;
7207    wchar_t *p;
7208    Py_ssize_t size;
7209    const DWORD flags = encode_code_page_flags(code_page, NULL);
7210    char *out;
7211    /* Create a substring so that we can get the UTF-16 representation
7212       of just the slice under consideration. */
7213    PyObject *substring;
7214
7215    assert(len > 0);
7216
7217    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7218        pusedDefaultChar = &usedDefaultChar;
7219    else
7220        pusedDefaultChar = NULL;
7221
7222    substring = PyUnicode_Substring(unicode, offset, offset+len);
7223    if (substring == NULL)
7224        return -1;
7225    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7226    if (p == NULL) {
7227        Py_DECREF(substring);
7228        return -1;
7229    }
7230
7231    /* First get the size of the result */
7232    outsize = WideCharToMultiByte(code_page, flags,
7233                                  p, size,
7234                                  NULL, 0,
7235                                  NULL, pusedDefaultChar);
7236    if (outsize <= 0)
7237        goto error;
7238    /* If we used a default char, then we failed! */
7239    if (pusedDefaultChar && *pusedDefaultChar) {
7240        Py_DECREF(substring);
7241        return -2;
7242    }
7243
7244    if (*outbytes == NULL) {
7245        /* Create string object */
7246        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7247        if (*outbytes == NULL) {
7248            Py_DECREF(substring);
7249            return -1;
7250        }
7251        out = PyBytes_AS_STRING(*outbytes);
7252    }
7253    else {
7254        /* Extend string object */
7255        const Py_ssize_t n = PyBytes_Size(*outbytes);
7256        if (outsize > PY_SSIZE_T_MAX - n) {
7257            PyErr_NoMemory();
7258            Py_DECREF(substring);
7259            return -1;
7260        }
7261        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7262            Py_DECREF(substring);
7263            return -1;
7264        }
7265        out = PyBytes_AS_STRING(*outbytes) + n;
7266    }
7267
7268    /* Do the conversion */
7269    outsize = WideCharToMultiByte(code_page, flags,
7270                                  p, size,
7271                                  out, outsize,
7272                                  NULL, pusedDefaultChar);
7273    Py_CLEAR(substring);
7274    if (outsize <= 0)
7275        goto error;
7276    if (pusedDefaultChar && *pusedDefaultChar)
7277        return -2;
7278    return 0;
7279
7280error:
7281    Py_XDECREF(substring);
7282    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7283        return -2;
7284    PyErr_SetFromWindowsErr(0);
7285    return -1;
7286}
7287
7288/*
7289 * Encode a Unicode string to a Windows code page into a byte string using a
7290 * error handler.
7291 *
7292 * Returns consumed characters if succeed, or raise a WindowsError and returns
7293 * -1 on other error.
7294 */
7295static int
7296encode_code_page_errors(UINT code_page, PyObject **outbytes,
7297                        PyObject *unicode, Py_ssize_t unicode_offset,
7298                        Py_ssize_t insize, const char* errors)
7299{
7300    const DWORD flags = encode_code_page_flags(code_page, errors);
7301    Py_ssize_t pos = unicode_offset;
7302    Py_ssize_t endin = unicode_offset + insize;
7303    /* Ideally, we should get reason from FormatMessage. This is the Windows
7304       2000 English version of the message. */
7305    const char *reason = "invalid character";
7306    /* 4=maximum length of a UTF-8 sequence */
7307    char buffer[4];
7308    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7309    Py_ssize_t outsize;
7310    char *out;
7311    PyObject *errorHandler = NULL;
7312    PyObject *exc = NULL;
7313    PyObject *encoding_obj = NULL;
7314    char *encoding;
7315    Py_ssize_t newpos, newoutsize;
7316    PyObject *rep;
7317    int ret = -1;
7318
7319    assert(insize > 0);
7320
7321    encoding = code_page_name(code_page, &encoding_obj);
7322    if (encoding == NULL)
7323        return -1;
7324
7325    if (errors == NULL || strcmp(errors, "strict") == 0) {
7326        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7327           then we raise a UnicodeEncodeError. */
7328        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7329        if (exc != NULL) {
7330            PyCodec_StrictErrors(exc);
7331            Py_DECREF(exc);
7332        }
7333        Py_XDECREF(encoding_obj);
7334        return -1;
7335    }
7336
7337    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7338        pusedDefaultChar = &usedDefaultChar;
7339    else
7340        pusedDefaultChar = NULL;
7341
7342    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7343        PyErr_NoMemory();
7344        goto error;
7345    }
7346    outsize = insize * Py_ARRAY_LENGTH(buffer);
7347
7348    if (*outbytes == NULL) {
7349        /* Create string object */
7350        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7351        if (*outbytes == NULL)
7352            goto error;
7353        out = PyBytes_AS_STRING(*outbytes);
7354    }
7355    else {
7356        /* Extend string object */
7357        Py_ssize_t n = PyBytes_Size(*outbytes);
7358        if (n > PY_SSIZE_T_MAX - outsize) {
7359            PyErr_NoMemory();
7360            goto error;
7361        }
7362        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7363            goto error;
7364        out = PyBytes_AS_STRING(*outbytes) + n;
7365    }
7366
7367    /* Encode the string character per character */
7368    while (pos < endin)
7369    {
7370        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7371        wchar_t chars[2];
7372        int charsize;
7373        if (ch < 0x10000) {
7374            chars[0] = (wchar_t)ch;
7375            charsize = 1;
7376        }
7377        else {
7378            ch -= 0x10000;
7379            chars[0] = 0xd800 + (ch >> 10);
7380            chars[1] = 0xdc00 + (ch & 0x3ff);
7381            charsize = 2;
7382        }
7383
7384        outsize = WideCharToMultiByte(code_page, flags,
7385                                      chars, charsize,
7386                                      buffer, Py_ARRAY_LENGTH(buffer),
7387                                      NULL, pusedDefaultChar);
7388        if (outsize > 0) {
7389            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7390            {
7391                pos++;
7392                memcpy(out, buffer, outsize);
7393                out += outsize;
7394                continue;
7395            }
7396        }
7397        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7398            PyErr_SetFromWindowsErr(0);
7399            goto error;
7400        }
7401
7402        rep = unicode_encode_call_errorhandler(
7403                  errors, &errorHandler, encoding, reason,
7404                  unicode, &exc,
7405                  pos, pos + 1, &newpos);
7406        if (rep == NULL)
7407            goto error;
7408        pos = newpos;
7409
7410        if (PyBytes_Check(rep)) {
7411            outsize = PyBytes_GET_SIZE(rep);
7412            if (outsize != 1) {
7413                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7414                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7415                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7416                    Py_DECREF(rep);
7417                    goto error;
7418                }
7419                out = PyBytes_AS_STRING(*outbytes) + offset;
7420            }
7421            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7422            out += outsize;
7423        }
7424        else {
7425            Py_ssize_t i;
7426            enum PyUnicode_Kind kind;
7427            void *data;
7428
7429            if (PyUnicode_READY(rep) < 0) {
7430                Py_DECREF(rep);
7431                goto error;
7432            }
7433
7434            outsize = PyUnicode_GET_LENGTH(rep);
7435            if (outsize != 1) {
7436                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7437                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7438                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7439                    Py_DECREF(rep);
7440                    goto error;
7441                }
7442                out = PyBytes_AS_STRING(*outbytes) + offset;
7443            }
7444            kind = PyUnicode_KIND(rep);
7445            data = PyUnicode_DATA(rep);
7446            for (i=0; i < outsize; i++) {
7447                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7448                if (ch > 127) {
7449                    raise_encode_exception(&exc,
7450                        encoding, unicode,
7451                        pos, pos + 1,
7452                        "unable to encode error handler result to ASCII");
7453                    Py_DECREF(rep);
7454                    goto error;
7455                }
7456                *out = (unsigned char)ch;
7457                out++;
7458            }
7459        }
7460        Py_DECREF(rep);
7461    }
7462    /* write a NUL byte */
7463    *out = 0;
7464    outsize = out - PyBytes_AS_STRING(*outbytes);
7465    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7466    if (_PyBytes_Resize(outbytes, outsize) < 0)
7467        goto error;
7468    ret = 0;
7469
7470error:
7471    Py_XDECREF(encoding_obj);
7472    Py_XDECREF(errorHandler);
7473    Py_XDECREF(exc);
7474    return ret;
7475}
7476
7477static PyObject *
7478encode_code_page(int code_page,
7479                 PyObject *unicode,
7480                 const char *errors)
7481{
7482    Py_ssize_t len;
7483    PyObject *outbytes = NULL;
7484    Py_ssize_t offset;
7485    int chunk_len, ret, done;
7486
7487    if (PyUnicode_READY(unicode) < 0)
7488        return NULL;
7489    len = PyUnicode_GET_LENGTH(unicode);
7490
7491    if (code_page < 0) {
7492        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7493        return NULL;
7494    }
7495
7496    if (len == 0)
7497        return PyBytes_FromStringAndSize(NULL, 0);
7498
7499    offset = 0;
7500    do
7501    {
7502#ifdef NEED_RETRY
7503        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7504           chunks. */
7505        if (len > INT_MAX/2) {
7506            chunk_len = INT_MAX/2;
7507            done = 0;
7508        }
7509        else
7510#endif
7511        {
7512            chunk_len = (int)len;
7513            done = 1;
7514        }
7515
7516        ret = encode_code_page_strict(code_page, &outbytes,
7517                                      unicode, offset, chunk_len,
7518                                      errors);
7519        if (ret == -2)
7520            ret = encode_code_page_errors(code_page, &outbytes,
7521                                          unicode, offset,
7522                                          chunk_len, errors);
7523        if (ret < 0) {
7524            Py_XDECREF(outbytes);
7525            return NULL;
7526        }
7527
7528        offset += chunk_len;
7529        len -= chunk_len;
7530    } while (!done);
7531
7532    return outbytes;
7533}
7534
7535PyObject *
7536PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7537                     Py_ssize_t size,
7538                     const char *errors)
7539{
7540    PyObject *unicode, *res;
7541    unicode = PyUnicode_FromUnicode(p, size);
7542    if (unicode == NULL)
7543        return NULL;
7544    res = encode_code_page(CP_ACP, unicode, errors);
7545    Py_DECREF(unicode);
7546    return res;
7547}
7548
7549PyObject *
7550PyUnicode_EncodeCodePage(int code_page,
7551                         PyObject *unicode,
7552                         const char *errors)
7553{
7554    return encode_code_page(code_page, unicode, errors);
7555}
7556
7557PyObject *
7558PyUnicode_AsMBCSString(PyObject *unicode)
7559{
7560    if (!PyUnicode_Check(unicode)) {
7561        PyErr_BadArgument();
7562        return NULL;
7563    }
7564    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7565}
7566
7567#undef NEED_RETRY
7568
7569#endif /* HAVE_MBCS */
7570
7571/* --- Character Mapping Codec -------------------------------------------- */
7572
7573PyObject *
7574PyUnicode_DecodeCharmap(const char *s,
7575                        Py_ssize_t size,
7576                        PyObject *mapping,
7577                        const char *errors)
7578{
7579    const char *starts = s;
7580    Py_ssize_t startinpos;
7581    Py_ssize_t endinpos;
7582    Py_ssize_t outpos;
7583    const char *e;
7584    PyObject *v;
7585    Py_ssize_t extrachars = 0;
7586    PyObject *errorHandler = NULL;
7587    PyObject *exc = NULL;
7588
7589    /* Default to Latin-1 */
7590    if (mapping == NULL)
7591        return PyUnicode_DecodeLatin1(s, size, errors);
7592
7593    v = PyUnicode_New(size, 127);
7594    if (v == NULL)
7595        goto onError;
7596    if (size == 0)
7597        return v;
7598    outpos = 0;
7599    e = s + size;
7600    if (PyUnicode_CheckExact(mapping)) {
7601        Py_ssize_t maplen;
7602        enum PyUnicode_Kind kind;
7603        void *data;
7604        Py_UCS4 x;
7605
7606        if (PyUnicode_READY(mapping) < 0)
7607            return NULL;
7608
7609        maplen = PyUnicode_GET_LENGTH(mapping);
7610        data = PyUnicode_DATA(mapping);
7611        kind = PyUnicode_KIND(mapping);
7612        while (s < e) {
7613            unsigned char ch = *s;
7614
7615            if (ch < maplen)
7616                x = PyUnicode_READ(kind, data, ch);
7617            else
7618                x = 0xfffe; /* invalid value */
7619
7620            if (x == 0xfffe)
7621            {
7622                /* undefined mapping */
7623                startinpos = s-starts;
7624                endinpos = startinpos+1;
7625                if (unicode_decode_call_errorhandler(
7626                        errors, &errorHandler,
7627                        "charmap", "character maps to <undefined>",
7628                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7629                        &v, &outpos)) {
7630                    goto onError;
7631                }
7632                continue;
7633            }
7634
7635            if (unicode_putchar(&v, &outpos, x) < 0)
7636                goto onError;
7637            ++s;
7638        }
7639    }
7640    else {
7641        while (s < e) {
7642            unsigned char ch = *s;
7643            PyObject *w, *x;
7644
7645            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7646            w = PyLong_FromLong((long)ch);
7647            if (w == NULL)
7648                goto onError;
7649            x = PyObject_GetItem(mapping, w);
7650            Py_DECREF(w);
7651            if (x == NULL) {
7652                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7653                    /* No mapping found means: mapping is undefined. */
7654                    PyErr_Clear();
7655                    x = Py_None;
7656                    Py_INCREF(x);
7657                } else
7658                    goto onError;
7659            }
7660
7661            /* Apply mapping */
7662            if (PyLong_Check(x)) {
7663                long value = PyLong_AS_LONG(x);
7664                if (value < 0 || value > 65535) {
7665                    PyErr_SetString(PyExc_TypeError,
7666                                    "character mapping must be in range(65536)");
7667                    Py_DECREF(x);
7668                    goto onError;
7669                }
7670                if (unicode_putchar(&v, &outpos, value) < 0)
7671                    goto onError;
7672            }
7673            else if (x == Py_None) {
7674                /* undefined mapping */
7675                startinpos = s-starts;
7676                endinpos = startinpos+1;
7677                if (unicode_decode_call_errorhandler(
7678                        errors, &errorHandler,
7679                        "charmap", "character maps to <undefined>",
7680                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7681                        &v, &outpos)) {
7682                    Py_DECREF(x);
7683                    goto onError;
7684                }
7685                Py_DECREF(x);
7686                continue;
7687            }
7688            else if (PyUnicode_Check(x)) {
7689                Py_ssize_t targetsize;
7690
7691                if (PyUnicode_READY(x) < 0)
7692                    goto onError;
7693                targetsize = PyUnicode_GET_LENGTH(x);
7694
7695                if (targetsize == 1) {
7696                    /* 1-1 mapping */
7697                    if (unicode_putchar(&v, &outpos,
7698                                        PyUnicode_READ_CHAR(x, 0)) < 0)
7699                        goto onError;
7700                }
7701                else if (targetsize > 1) {
7702                    /* 1-n mapping */
7703                    if (targetsize > extrachars) {
7704                        /* resize first */
7705                        Py_ssize_t needed = (targetsize - extrachars) + \
7706                            (targetsize << 2);
7707                        extrachars += needed;
7708                        /* XXX overflow detection missing */
7709                        if (PyUnicode_Resize(&v,
7710                                             PyUnicode_GET_LENGTH(v) + needed) < 0) {
7711                            Py_DECREF(x);
7712                            goto onError;
7713                        }
7714                    }
7715                    if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7716                        goto onError;
7717                    PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7718                    outpos += targetsize;
7719                    extrachars -= targetsize;
7720                }
7721                /* 1-0 mapping: skip the character */
7722            }
7723            else {
7724                /* wrong return value */
7725                PyErr_SetString(PyExc_TypeError,
7726                                "character mapping must return integer, None or str");
7727                Py_DECREF(x);
7728                goto onError;
7729            }
7730            Py_DECREF(x);
7731            ++s;
7732        }
7733    }
7734    if (PyUnicode_Resize(&v, outpos) < 0)
7735        goto onError;
7736    Py_XDECREF(errorHandler);
7737    Py_XDECREF(exc);
7738    return unicode_result(v);
7739
7740  onError:
7741    Py_XDECREF(errorHandler);
7742    Py_XDECREF(exc);
7743    Py_XDECREF(v);
7744    return NULL;
7745}
7746
7747/* Charmap encoding: the lookup table */
7748
7749struct encoding_map {
7750    PyObject_HEAD
7751    unsigned char level1[32];
7752    int count2, count3;
7753    unsigned char level23[1];
7754};
7755
7756static PyObject*
7757encoding_map_size(PyObject *obj, PyObject* args)
7758{
7759    struct encoding_map *map = (struct encoding_map*)obj;
7760    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7761                           128*map->count3);
7762}
7763
7764static PyMethodDef encoding_map_methods[] = {
7765    {"size", encoding_map_size, METH_NOARGS,
7766     PyDoc_STR("Return the size (in bytes) of this object") },
7767    { 0 }
7768};
7769
7770static void
7771encoding_map_dealloc(PyObject* o)
7772{
7773    PyObject_FREE(o);
7774}
7775
7776static PyTypeObject EncodingMapType = {
7777    PyVarObject_HEAD_INIT(NULL, 0)
7778    "EncodingMap",          /*tp_name*/
7779    sizeof(struct encoding_map),   /*tp_basicsize*/
7780    0,                      /*tp_itemsize*/
7781    /* methods */
7782    encoding_map_dealloc,   /*tp_dealloc*/
7783    0,                      /*tp_print*/
7784    0,                      /*tp_getattr*/
7785    0,                      /*tp_setattr*/
7786    0,                      /*tp_reserved*/
7787    0,                      /*tp_repr*/
7788    0,                      /*tp_as_number*/
7789    0,                      /*tp_as_sequence*/
7790    0,                      /*tp_as_mapping*/
7791    0,                      /*tp_hash*/
7792    0,                      /*tp_call*/
7793    0,                      /*tp_str*/
7794    0,                      /*tp_getattro*/
7795    0,                      /*tp_setattro*/
7796    0,                      /*tp_as_buffer*/
7797    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7798    0,                      /*tp_doc*/
7799    0,                      /*tp_traverse*/
7800    0,                      /*tp_clear*/
7801    0,                      /*tp_richcompare*/
7802    0,                      /*tp_weaklistoffset*/
7803    0,                      /*tp_iter*/
7804    0,                      /*tp_iternext*/
7805    encoding_map_methods,   /*tp_methods*/
7806    0,                      /*tp_members*/
7807    0,                      /*tp_getset*/
7808    0,                      /*tp_base*/
7809    0,                      /*tp_dict*/
7810    0,                      /*tp_descr_get*/
7811    0,                      /*tp_descr_set*/
7812    0,                      /*tp_dictoffset*/
7813    0,                      /*tp_init*/
7814    0,                      /*tp_alloc*/
7815    0,                      /*tp_new*/
7816    0,                      /*tp_free*/
7817    0,                      /*tp_is_gc*/
7818};
7819
7820PyObject*
7821PyUnicode_BuildEncodingMap(PyObject* string)
7822{
7823    PyObject *result;
7824    struct encoding_map *mresult;
7825    int i;
7826    int need_dict = 0;
7827    unsigned char level1[32];
7828    unsigned char level2[512];
7829    unsigned char *mlevel1, *mlevel2, *mlevel3;
7830    int count2 = 0, count3 = 0;
7831    int kind;
7832    void *data;
7833    Py_UCS4 ch;
7834
7835    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7836        PyErr_BadArgument();
7837        return NULL;
7838    }
7839    kind = PyUnicode_KIND(string);
7840    data = PyUnicode_DATA(string);
7841    memset(level1, 0xFF, sizeof level1);
7842    memset(level2, 0xFF, sizeof level2);
7843
7844    /* If there isn't a one-to-one mapping of NULL to \0,
7845       or if there are non-BMP characters, we need to use
7846       a mapping dictionary. */
7847    if (PyUnicode_READ(kind, data, 0) != 0)
7848        need_dict = 1;
7849    for (i = 1; i < 256; i++) {
7850        int l1, l2;
7851        ch = PyUnicode_READ(kind, data, i);
7852        if (ch == 0 || ch > 0xFFFF) {
7853            need_dict = 1;
7854            break;
7855        }
7856        if (ch == 0xFFFE)
7857            /* unmapped character */
7858            continue;
7859        l1 = ch >> 11;
7860        l2 = ch >> 7;
7861        if (level1[l1] == 0xFF)
7862            level1[l1] = count2++;
7863        if (level2[l2] == 0xFF)
7864            level2[l2] = count3++;
7865    }
7866
7867    if (count2 >= 0xFF || count3 >= 0xFF)
7868        need_dict = 1;
7869
7870    if (need_dict) {
7871        PyObject *result = PyDict_New();
7872        PyObject *key, *value;
7873        if (!result)
7874            return NULL;
7875        for (i = 0; i < 256; i++) {
7876            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7877            value = PyLong_FromLong(i);
7878            if (!key || !value)
7879                goto failed1;
7880            if (PyDict_SetItem(result, key, value) == -1)
7881                goto failed1;
7882            Py_DECREF(key);
7883            Py_DECREF(value);
7884        }
7885        return result;
7886      failed1:
7887        Py_XDECREF(key);
7888        Py_XDECREF(value);
7889        Py_DECREF(result);
7890        return NULL;
7891    }
7892
7893    /* Create a three-level trie */
7894    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7895                             16*count2 + 128*count3 - 1);
7896    if (!result)
7897        return PyErr_NoMemory();
7898    PyObject_Init(result, &EncodingMapType);
7899    mresult = (struct encoding_map*)result;
7900    mresult->count2 = count2;
7901    mresult->count3 = count3;
7902    mlevel1 = mresult->level1;
7903    mlevel2 = mresult->level23;
7904    mlevel3 = mresult->level23 + 16*count2;
7905    memcpy(mlevel1, level1, 32);
7906    memset(mlevel2, 0xFF, 16*count2);
7907    memset(mlevel3, 0, 128*count3);
7908    count3 = 0;
7909    for (i = 1; i < 256; i++) {
7910        int o1, o2, o3, i2, i3;
7911        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7912            /* unmapped character */
7913            continue;
7914        o1 = PyUnicode_READ(kind, data, i)>>11;
7915        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7916        i2 = 16*mlevel1[o1] + o2;
7917        if (mlevel2[i2] == 0xFF)
7918            mlevel2[i2] = count3++;
7919        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7920        i3 = 128*mlevel2[i2] + o3;
7921        mlevel3[i3] = i;
7922    }
7923    return result;
7924}
7925
7926static int
7927encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7928{
7929    struct encoding_map *map = (struct encoding_map*)mapping;
7930    int l1 = c>>11;
7931    int l2 = (c>>7) & 0xF;
7932    int l3 = c & 0x7F;
7933    int i;
7934
7935    if (c > 0xFFFF)
7936        return -1;
7937    if (c == 0)
7938        return 0;
7939    /* level 1*/
7940    i = map->level1[l1];
7941    if (i == 0xFF) {
7942        return -1;
7943    }
7944    /* level 2*/
7945    i = map->level23[16*i+l2];
7946    if (i == 0xFF) {
7947        return -1;
7948    }
7949    /* level 3 */
7950    i = map->level23[16*map->count2 + 128*i + l3];
7951    if (i == 0) {
7952        return -1;
7953    }
7954    return i;
7955}
7956
7957/* Lookup the character ch in the mapping. If the character
7958   can't be found, Py_None is returned (or NULL, if another
7959   error occurred). */
7960static PyObject *
7961charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7962{
7963    PyObject *w = PyLong_FromLong((long)c);
7964    PyObject *x;
7965
7966    if (w == NULL)
7967        return NULL;
7968    x = PyObject_GetItem(mapping, w);
7969    Py_DECREF(w);
7970    if (x == NULL) {
7971        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7972            /* No mapping found means: mapping is undefined. */
7973            PyErr_Clear();
7974            x = Py_None;
7975            Py_INCREF(x);
7976            return x;
7977        } else
7978            return NULL;
7979    }
7980    else if (x == Py_None)
7981        return x;
7982    else if (PyLong_Check(x)) {
7983        long value = PyLong_AS_LONG(x);
7984        if (value < 0 || value > 255) {
7985            PyErr_SetString(PyExc_TypeError,
7986                            "character mapping must be in range(256)");
7987            Py_DECREF(x);
7988            return NULL;
7989        }
7990        return x;
7991    }
7992    else if (PyBytes_Check(x))
7993        return x;
7994    else {
7995        /* wrong return value */
7996        PyErr_Format(PyExc_TypeError,
7997                     "character mapping must return integer, bytes or None, not %.400s",
7998                     x->ob_type->tp_name);
7999        Py_DECREF(x);
8000        return NULL;
8001    }
8002}
8003
8004static int
8005charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8006{
8007    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8008    /* exponentially overallocate to minimize reallocations */
8009    if (requiredsize < 2*outsize)
8010        requiredsize = 2*outsize;
8011    if (_PyBytes_Resize(outobj, requiredsize))
8012        return -1;
8013    return 0;
8014}
8015
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
8019/* lookup the character, put the result in the output string and adjust
8020   various state variables. Resize the output bytes object if not enough
8021   space is available. Return a new reference to the object that
8022   was put in the output buffer, or Py_None, if the mapping was undefined
8023   (in which case no character was written) or NULL, if a
8024   reallocation error occurred. The caller must decref the result */
8025static charmapencode_result
8026charmapencode_output(Py_UCS4 c, PyObject *mapping,
8027                     PyObject **outobj, Py_ssize_t *outpos)
8028{
8029    PyObject *rep;
8030    char *outstart;
8031    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8032
8033    if (Py_TYPE(mapping) == &EncodingMapType) {
8034        int res = encoding_map_lookup(c, mapping);
8035        Py_ssize_t requiredsize = *outpos+1;
8036        if (res == -1)
8037            return enc_FAILED;
8038        if (outsize<requiredsize)
8039            if (charmapencode_resize(outobj, outpos, requiredsize))
8040                return enc_EXCEPTION;
8041        outstart = PyBytes_AS_STRING(*outobj);
8042        outstart[(*outpos)++] = (char)res;
8043        return enc_SUCCESS;
8044    }
8045
8046    rep = charmapencode_lookup(c, mapping);
8047    if (rep==NULL)
8048        return enc_EXCEPTION;
8049    else if (rep==Py_None) {
8050        Py_DECREF(rep);
8051        return enc_FAILED;
8052    } else {
8053        if (PyLong_Check(rep)) {
8054            Py_ssize_t requiredsize = *outpos+1;
8055            if (outsize<requiredsize)
8056                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8057                    Py_DECREF(rep);
8058                    return enc_EXCEPTION;
8059                }
8060            outstart = PyBytes_AS_STRING(*outobj);
8061            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8062        }
8063        else {
8064            const char *repchars = PyBytes_AS_STRING(rep);
8065            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8066            Py_ssize_t requiredsize = *outpos+repsize;
8067            if (outsize<requiredsize)
8068                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8069                    Py_DECREF(rep);
8070                    return enc_EXCEPTION;
8071                }
8072            outstart = PyBytes_AS_STRING(*outobj);
8073            memcpy(outstart + *outpos, repchars, repsize);
8074            *outpos += repsize;
8075        }
8076    }
8077    Py_DECREF(rep);
8078    return enc_SUCCESS;
8079}
8080
8081/* handle an error in PyUnicode_EncodeCharmap
8082   Return 0 on success, -1 on error */
8083static int
8084charmap_encoding_error(
8085    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8086    PyObject **exceptionObject,
8087    int *known_errorHandler, PyObject **errorHandler, const char *errors,
8088    PyObject **res, Py_ssize_t *respos)
8089{
8090    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8091    Py_ssize_t size, repsize;
8092    Py_ssize_t newpos;
8093    enum PyUnicode_Kind kind;
8094    void *data;
8095    Py_ssize_t index;
8096    /* startpos for collecting unencodable chars */
8097    Py_ssize_t collstartpos = *inpos;
8098    Py_ssize_t collendpos = *inpos+1;
8099    Py_ssize_t collpos;
8100    char *encoding = "charmap";
8101    char *reason = "character maps to <undefined>";
8102    charmapencode_result x;
8103    Py_UCS4 ch;
8104    int val;
8105
8106    if (PyUnicode_READY(unicode) < 0)
8107        return -1;
8108    size = PyUnicode_GET_LENGTH(unicode);
8109    /* find all unencodable characters */
8110    while (collendpos < size) {
8111        PyObject *rep;
8112        if (Py_TYPE(mapping) == &EncodingMapType) {
8113            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8114            val = encoding_map_lookup(ch, mapping);
8115            if (val != -1)
8116                break;
8117            ++collendpos;
8118            continue;
8119        }
8120
8121        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8122        rep = charmapencode_lookup(ch, mapping);
8123        if (rep==NULL)
8124            return -1;
8125        else if (rep!=Py_None) {
8126            Py_DECREF(rep);
8127            break;
8128        }
8129        Py_DECREF(rep);
8130        ++collendpos;
8131    }
8132    /* cache callback name lookup
8133     * (if not done yet, i.e. it's the first error) */
8134    if (*known_errorHandler==-1) {
8135        if ((errors==NULL) || (!strcmp(errors, "strict")))
8136            *known_errorHandler = 1;
8137        else if (!strcmp(errors, "replace"))
8138            *known_errorHandler = 2;
8139        else if (!strcmp(errors, "ignore"))
8140            *known_errorHandler = 3;
8141        else if (!strcmp(errors, "xmlcharrefreplace"))
8142            *known_errorHandler = 4;
8143        else
8144            *known_errorHandler = 0;
8145    }
8146    switch (*known_errorHandler) {
8147    case 1: /* strict */
8148        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8149        return -1;
8150    case 2: /* replace */
8151        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8152            x = charmapencode_output('?', mapping, res, respos);
8153            if (x==enc_EXCEPTION) {
8154                return -1;
8155            }
8156            else if (x==enc_FAILED) {
8157                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8158                return -1;
8159            }
8160        }
8161        /* fall through */
8162    case 3: /* ignore */
8163        *inpos = collendpos;
8164        break;
8165    case 4: /* xmlcharrefreplace */
8166        /* generate replacement (temporarily (mis)uses p) */
8167        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8168            char buffer[2+29+1+1];
8169            char *cp;
8170            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8171            for (cp = buffer; *cp; ++cp) {
8172                x = charmapencode_output(*cp, mapping, res, respos);
8173                if (x==enc_EXCEPTION)
8174                    return -1;
8175                else if (x==enc_FAILED) {
8176                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8177                    return -1;
8178                }
8179            }
8180        }
8181        *inpos = collendpos;
8182        break;
8183    default:
8184        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
8185                                                      encoding, reason, unicode, exceptionObject,
8186                                                      collstartpos, collendpos, &newpos);
8187        if (repunicode == NULL)
8188            return -1;
8189        if (PyBytes_Check(repunicode)) {
8190            /* Directly copy bytes result to output. */
8191            Py_ssize_t outsize = PyBytes_Size(*res);
8192            Py_ssize_t requiredsize;
8193            repsize = PyBytes_Size(repunicode);
8194            requiredsize = *respos + repsize;
8195            if (requiredsize > outsize)
8196                /* Make room for all additional bytes. */
8197                if (charmapencode_resize(res, respos, requiredsize)) {
8198                    Py_DECREF(repunicode);
8199                    return -1;
8200                }
8201            memcpy(PyBytes_AsString(*res) + *respos,
8202                   PyBytes_AsString(repunicode),  repsize);
8203            *respos += repsize;
8204            *inpos = newpos;
8205            Py_DECREF(repunicode);
8206            break;
8207        }
8208        /* generate replacement  */
8209        if (PyUnicode_READY(repunicode) < 0) {
8210            Py_DECREF(repunicode);
8211            return -1;
8212        }
8213        repsize = PyUnicode_GET_LENGTH(repunicode);
8214        data = PyUnicode_DATA(repunicode);
8215        kind = PyUnicode_KIND(repunicode);
8216        for (index = 0; index < repsize; index++) {
8217            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8218            x = charmapencode_output(repch, mapping, res, respos);
8219            if (x==enc_EXCEPTION) {
8220                Py_DECREF(repunicode);
8221                return -1;
8222            }
8223            else if (x==enc_FAILED) {
8224                Py_DECREF(repunicode);
8225                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8226                return -1;
8227            }
8228        }
8229        *inpos = newpos;
8230        Py_DECREF(repunicode);
8231    }
8232    return 0;
8233}
8234
8235PyObject *
8236_PyUnicode_EncodeCharmap(PyObject *unicode,
8237                         PyObject *mapping,
8238                         const char *errors)
8239{
8240    /* output object */
8241    PyObject *res = NULL;
8242    /* current input position */
8243    Py_ssize_t inpos = 0;
8244    Py_ssize_t size;
8245    /* current output position */
8246    Py_ssize_t respos = 0;
8247    PyObject *errorHandler = NULL;
8248    PyObject *exc = NULL;
8249    /* the following variable is used for caching string comparisons
8250     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8251     * 3=ignore, 4=xmlcharrefreplace */
8252    int known_errorHandler = -1;
8253
8254    if (PyUnicode_READY(unicode) < 0)
8255        return NULL;
8256    size = PyUnicode_GET_LENGTH(unicode);
8257
8258    /* Default to Latin-1 */
8259    if (mapping == NULL)
8260        return unicode_encode_ucs1(unicode, errors, 256);
8261
8262    /* allocate enough for a simple encoding without
8263       replacements, if we need more, we'll resize */
8264    res = PyBytes_FromStringAndSize(NULL, size);
8265    if (res == NULL)
8266        goto onError;
8267    if (size == 0)
8268        return res;
8269
8270    while (inpos<size) {
8271        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
8272        /* try to encode it */
8273        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8274        if (x==enc_EXCEPTION) /* error */
8275            goto onError;
8276        if (x==enc_FAILED) { /* unencodable character */
8277            if (charmap_encoding_error(unicode, &inpos, mapping,
8278                                       &exc,
8279                                       &known_errorHandler, &errorHandler, errors,
8280                                       &res, &respos)) {
8281                goto onError;
8282            }
8283        }
8284        else
8285            /* done with this character => adjust input position */
8286            ++inpos;
8287    }
8288
8289    /* Resize if we allocated to much */
8290    if (respos<PyBytes_GET_SIZE(res))
8291        if (_PyBytes_Resize(&res, respos) < 0)
8292            goto onError;
8293
8294    Py_XDECREF(exc);
8295    Py_XDECREF(errorHandler);
8296    return res;
8297
8298  onError:
8299    Py_XDECREF(res);
8300    Py_XDECREF(exc);
8301    Py_XDECREF(errorHandler);
8302    return NULL;
8303}
8304
8305/* Deprecated */
8306PyObject *
8307PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8308                        Py_ssize_t size,
8309                        PyObject *mapping,
8310                        const char *errors)
8311{
8312    PyObject *result;
8313    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8314    if (unicode == NULL)
8315        return NULL;
8316    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8317    Py_DECREF(unicode);
8318    return result;
8319}
8320
8321PyObject *
8322PyUnicode_AsCharmapString(PyObject *unicode,
8323                          PyObject *mapping)
8324{
8325    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8326        PyErr_BadArgument();
8327        return NULL;
8328    }
8329    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8330}
8331
8332/* create or adjust a UnicodeTranslateError */
8333static void
8334make_translate_exception(PyObject **exceptionObject,
8335                         PyObject *unicode,
8336                         Py_ssize_t startpos, Py_ssize_t endpos,
8337                         const char *reason)
8338{
8339    if (*exceptionObject == NULL) {
8340        *exceptionObject = _PyUnicodeTranslateError_Create(
8341            unicode, startpos, endpos, reason);
8342    }
8343    else {
8344        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8345            goto onError;
8346        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8347            goto onError;
8348        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8349            goto onError;
8350        return;
8351      onError:
8352        Py_DECREF(*exceptionObject);
8353        *exceptionObject = NULL;
8354    }
8355}
8356
8357/* raises a UnicodeTranslateError */
8358static void
8359raise_translate_exception(PyObject **exceptionObject,
8360                          PyObject *unicode,
8361                          Py_ssize_t startpos, Py_ssize_t endpos,
8362                          const char *reason)
8363{
8364    make_translate_exception(exceptionObject,
8365                             unicode, startpos, endpos, reason);
8366    if (*exceptionObject != NULL)
8367        PyCodec_StrictErrors(*exceptionObject);
8368}
8369
8370/* error handling callback helper:
8371   build arguments, call the callback and check the arguments,
8372   put the result into newpos and return the replacement string, which
8373   has to be freed by the caller */
8374static PyObject *
8375unicode_translate_call_errorhandler(const char *errors,
8376                                    PyObject **errorHandler,
8377                                    const char *reason,
8378                                    PyObject *unicode, PyObject **exceptionObject,
8379                                    Py_ssize_t startpos, Py_ssize_t endpos,
8380                                    Py_ssize_t *newpos)
8381{
8382    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8383
8384    Py_ssize_t i_newpos;
8385    PyObject *restuple;
8386    PyObject *resunicode;
8387
8388    if (*errorHandler == NULL) {
8389        *errorHandler = PyCodec_LookupError(errors);
8390        if (*errorHandler == NULL)
8391            return NULL;
8392    }
8393
8394    make_translate_exception(exceptionObject,
8395                             unicode, startpos, endpos, reason);
8396    if (*exceptionObject == NULL)
8397        return NULL;
8398
8399    restuple = PyObject_CallFunctionObjArgs(
8400        *errorHandler, *exceptionObject, NULL);
8401    if (restuple == NULL)
8402        return NULL;
8403    if (!PyTuple_Check(restuple)) {
8404        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8405        Py_DECREF(restuple);
8406        return NULL;
8407    }
8408    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8409                          &resunicode, &i_newpos)) {
8410        Py_DECREF(restuple);
8411        return NULL;
8412    }
8413    if (i_newpos<0)
8414        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8415    else
8416        *newpos = i_newpos;
8417    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8418        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8419        Py_DECREF(restuple);
8420        return NULL;
8421    }
8422    Py_INCREF(resunicode);
8423    Py_DECREF(restuple);
8424    return resunicode;
8425}
8426
8427/* Lookup the character ch in the mapping and put the result in result,
8428   which must be decrefed by the caller.
8429   Return 0 on success, -1 on error */
8430static int
8431charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8432{
8433    PyObject *w = PyLong_FromLong((long)c);
8434    PyObject *x;
8435
8436    if (w == NULL)
8437        return -1;
8438    x = PyObject_GetItem(mapping, w);
8439    Py_DECREF(w);
8440    if (x == NULL) {
8441        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8442            /* No mapping found means: use 1:1 mapping. */
8443            PyErr_Clear();
8444            *result = NULL;
8445            return 0;
8446        } else
8447            return -1;
8448    }
8449    else if (x == Py_None) {
8450        *result = x;
8451        return 0;
8452    }
8453    else if (PyLong_Check(x)) {
8454        long value = PyLong_AS_LONG(x);
8455        long max = PyUnicode_GetMax();
8456        if (value < 0 || value > max) {
8457            PyErr_Format(PyExc_TypeError,
8458                         "character mapping must be in range(0x%x)", max+1);
8459            Py_DECREF(x);
8460            return -1;
8461        }
8462        *result = x;
8463        return 0;
8464    }
8465    else if (PyUnicode_Check(x)) {
8466        *result = x;
8467        return 0;
8468    }
8469    else {
8470        /* wrong return value */
8471        PyErr_SetString(PyExc_TypeError,
8472                        "character mapping must return integer, None or str");
8473        Py_DECREF(x);
8474        return -1;
8475    }
8476}
8477/* ensure that *outobj is at least requiredsize characters long,
8478   if not reallocate and adjust various state variables.
8479   Return 0 on success, -1 on error */
8480static int
8481charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8482                               Py_ssize_t requiredsize)
8483{
8484    Py_ssize_t oldsize = *psize;
8485    if (requiredsize > oldsize) {
8486        /* exponentially overallocate to minimize reallocations */
8487        if (requiredsize < 2 * oldsize)
8488            requiredsize = 2 * oldsize;
8489        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8490        if (*outobj == 0)
8491            return -1;
8492        *psize = requiredsize;
8493    }
8494    return 0;
8495}
8496/* lookup the character, put the result in the output string and adjust
8497   various state variables. Return a new reference to the object that
8498   was put in the output buffer in *result, or Py_None, if the mapping was
8499   undefined (in which case no character was written).
8500   The called must decref result.
8501   Return 0 on success, -1 on error. */
8502static int
8503charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8504                        PyObject *mapping, Py_UCS4 **output,
8505                        Py_ssize_t *osize, Py_ssize_t *opos,
8506                        PyObject **res)
8507{
8508    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8509    if (charmaptranslate_lookup(curinp, mapping, res))
8510        return -1;
8511    if (*res==NULL) {
8512        /* not found => default to 1:1 mapping */
8513        (*output)[(*opos)++] = curinp;
8514    }
8515    else if (*res==Py_None)
8516        ;
8517    else if (PyLong_Check(*res)) {
8518        /* no overflow check, because we know that the space is enough */
8519        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8520    }
8521    else if (PyUnicode_Check(*res)) {
8522        Py_ssize_t repsize;
8523        if (PyUnicode_READY(*res) == -1)
8524            return -1;
8525        repsize = PyUnicode_GET_LENGTH(*res);
8526        if (repsize==1) {
8527            /* no overflow check, because we know that the space is enough */
8528            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8529        }
8530        else if (repsize!=0) {
8531            /* more than one character */
8532            Py_ssize_t requiredsize = *opos +
8533                (PyUnicode_GET_LENGTH(input) - ipos) +
8534                repsize - 1;
8535            Py_ssize_t i;
8536            if (charmaptranslate_makespace(output, osize, requiredsize))
8537                return -1;
8538            for(i = 0; i < repsize; i++)
8539                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8540        }
8541    }
8542    else
8543        return -1;
8544    return 0;
8545}
8546
8547PyObject *
8548_PyUnicode_TranslateCharmap(PyObject *input,
8549                            PyObject *mapping,
8550                            const char *errors)
8551{
8552    /* input object */
8553    char *idata;
8554    Py_ssize_t size, i;
8555    int kind;
8556    /* output buffer */
8557    Py_UCS4 *output = NULL;
8558    Py_ssize_t osize;
8559    PyObject *res;
8560    /* current output position */
8561    Py_ssize_t opos;
8562    char *reason = "character maps to <undefined>";
8563    PyObject *errorHandler = NULL;
8564    PyObject *exc = NULL;
8565    /* the following variable is used for caching string comparisons
8566     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8567     * 3=ignore, 4=xmlcharrefreplace */
8568    int known_errorHandler = -1;
8569
8570    if (mapping == NULL) {
8571        PyErr_BadArgument();
8572        return NULL;
8573    }
8574
8575    if (PyUnicode_READY(input) == -1)
8576        return NULL;
8577    idata = (char*)PyUnicode_DATA(input);
8578    kind = PyUnicode_KIND(input);
8579    size = PyUnicode_GET_LENGTH(input);
8580    i = 0;
8581
8582    if (size == 0) {
8583        Py_INCREF(input);
8584        return input;
8585    }
8586
8587    /* allocate enough for a simple 1:1 translation without
8588       replacements, if we need more, we'll resize */
8589    osize = size;
8590    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8591    opos = 0;
8592    if (output == NULL) {
8593        PyErr_NoMemory();
8594        goto onError;
8595    }
8596
8597    while (i<size) {
8598        /* try to encode it */
8599        PyObject *x = NULL;
8600        if (charmaptranslate_output(input, i, mapping,
8601                                    &output, &osize, &opos, &x)) {
8602            Py_XDECREF(x);
8603            goto onError;
8604        }
8605        Py_XDECREF(x);
8606        if (x!=Py_None) /* it worked => adjust input pointer */
8607            ++i;
8608        else { /* untranslatable character */
8609            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8610            Py_ssize_t repsize;
8611            Py_ssize_t newpos;
8612            Py_ssize_t uni2;
8613            /* startpos for collecting untranslatable chars */
8614            Py_ssize_t collstart = i;
8615            Py_ssize_t collend = i+1;
8616            Py_ssize_t coll;
8617
8618            /* find all untranslatable characters */
8619            while (collend < size) {
8620                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8621                    goto onError;
8622                Py_XDECREF(x);
8623                if (x!=Py_None)
8624                    break;
8625                ++collend;
8626            }
8627            /* cache callback name lookup
8628             * (if not done yet, i.e. it's the first error) */
8629            if (known_errorHandler==-1) {
8630                if ((errors==NULL) || (!strcmp(errors, "strict")))
8631                    known_errorHandler = 1;
8632                else if (!strcmp(errors, "replace"))
8633                    known_errorHandler = 2;
8634                else if (!strcmp(errors, "ignore"))
8635                    known_errorHandler = 3;
8636                else if (!strcmp(errors, "xmlcharrefreplace"))
8637                    known_errorHandler = 4;
8638                else
8639                    known_errorHandler = 0;
8640            }
8641            switch (known_errorHandler) {
8642            case 1: /* strict */
8643                raise_translate_exception(&exc, input, collstart,
8644                                          collend, reason);
8645                goto onError;
8646            case 2: /* replace */
8647                /* No need to check for space, this is a 1:1 replacement */
8648                for (coll = collstart; coll<collend; coll++)
8649                    output[opos++] = '?';
8650                /* fall through */
8651            case 3: /* ignore */
8652                i = collend;
8653                break;
8654            case 4: /* xmlcharrefreplace */
8655                /* generate replacement (temporarily (mis)uses i) */
8656                for (i = collstart; i < collend; ++i) {
8657                    char buffer[2+29+1+1];
8658                    char *cp;
8659                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8660                    if (charmaptranslate_makespace(&output, &osize,
8661                                                   opos+strlen(buffer)+(size-collend)))
8662                        goto onError;
8663                    for (cp = buffer; *cp; ++cp)
8664                        output[opos++] = *cp;
8665                }
8666                i = collend;
8667                break;
8668            default:
8669                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8670                                                                 reason, input, &exc,
8671                                                                 collstart, collend, &newpos);
8672                if (repunicode == NULL)
8673                    goto onError;
8674                if (PyUnicode_READY(repunicode) < 0) {
8675                    Py_DECREF(repunicode);
8676                    goto onError;
8677                }
8678                /* generate replacement  */
8679                repsize = PyUnicode_GET_LENGTH(repunicode);
8680                if (charmaptranslate_makespace(&output, &osize,
8681                                               opos+repsize+(size-collend))) {
8682                    Py_DECREF(repunicode);
8683                    goto onError;
8684                }
8685                for (uni2 = 0; repsize-->0; ++uni2)
8686                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8687                i = newpos;
8688                Py_DECREF(repunicode);
8689            }
8690        }
8691    }
8692    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8693    if (!res)
8694        goto onError;
8695    PyMem_Free(output);
8696    Py_XDECREF(exc);
8697    Py_XDECREF(errorHandler);
8698    return res;
8699
8700  onError:
8701    PyMem_Free(output);
8702    Py_XDECREF(exc);
8703    Py_XDECREF(errorHandler);
8704    return NULL;
8705}
8706
8707/* Deprecated. Use PyUnicode_Translate instead. */
8708PyObject *
8709PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8710                           Py_ssize_t size,
8711                           PyObject *mapping,
8712                           const char *errors)
8713{
8714    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8715    if (!unicode)
8716        return NULL;
8717    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8718}
8719
8720PyObject *
8721PyUnicode_Translate(PyObject *str,
8722                    PyObject *mapping,
8723                    const char *errors)
8724{
8725    PyObject *result;
8726
8727    str = PyUnicode_FromObject(str);
8728    if (str == NULL)
8729        goto onError;
8730    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8731    Py_DECREF(str);
8732    return result;
8733
8734  onError:
8735    Py_XDECREF(str);
8736    return NULL;
8737}
8738
8739static Py_UCS4
8740fix_decimal_and_space_to_ascii(PyObject *self)
8741{
8742    /* No need to call PyUnicode_READY(self) because this function is only
8743       called as a callback from fixup() which does it already. */
8744    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8745    const int kind = PyUnicode_KIND(self);
8746    void *data = PyUnicode_DATA(self);
8747    Py_UCS4 maxchar = 0, ch, fixed;
8748    Py_ssize_t i;
8749
8750    for (i = 0; i < len; ++i) {
8751        ch = PyUnicode_READ(kind, data, i);
8752        fixed = 0;
8753        if (ch > 127) {
8754            if (Py_UNICODE_ISSPACE(ch))
8755                fixed = ' ';
8756            else {
8757                const int decimal = Py_UNICODE_TODECIMAL(ch);
8758                if (decimal >= 0)
8759                    fixed = '0' + decimal;
8760            }
8761            if (fixed != 0) {
8762                if (fixed > maxchar)
8763                    maxchar = fixed;
8764                PyUnicode_WRITE(kind, data, i, fixed);
8765            }
8766            else if (ch > maxchar)
8767                maxchar = ch;
8768        }
8769        else if (ch > maxchar)
8770            maxchar = ch;
8771    }
8772
8773    return maxchar;
8774}
8775
8776PyObject *
8777_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8778{
8779    if (!PyUnicode_Check(unicode)) {
8780        PyErr_BadInternalCall();
8781        return NULL;
8782    }
8783    if (PyUnicode_READY(unicode) == -1)
8784        return NULL;
8785    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8786        /* If the string is already ASCII, just return the same string */
8787        Py_INCREF(unicode);
8788        return unicode;
8789    }
8790    return fixup(unicode, fix_decimal_and_space_to_ascii);
8791}
8792
8793PyObject *
8794PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8795                                  Py_ssize_t length)
8796{
8797    PyObject *decimal;
8798    Py_ssize_t i;
8799    Py_UCS4 maxchar;
8800    enum PyUnicode_Kind kind;
8801    void *data;
8802
8803    maxchar = 0;
8804    for (i = 0; i < length; i++) {
8805        Py_UNICODE ch = s[i];
8806        if (ch > 127) {
8807            int decimal = Py_UNICODE_TODECIMAL(ch);
8808            if (decimal >= 0)
8809                ch = '0' + decimal;
8810        }
8811        maxchar = Py_MAX(maxchar, ch);
8812    }
8813
8814    /* Copy to a new string */
8815    decimal = PyUnicode_New(length, maxchar);
8816    if (decimal == NULL)
8817        return decimal;
8818    kind = PyUnicode_KIND(decimal);
8819    data = PyUnicode_DATA(decimal);
8820    /* Iterate over code points */
8821    for (i = 0; i < length; i++) {
8822        Py_UNICODE ch = s[i];
8823        if (ch > 127) {
8824            int decimal = Py_UNICODE_TODECIMAL(ch);
8825            if (decimal >= 0)
8826                ch = '0' + decimal;
8827        }
8828        PyUnicode_WRITE(kind, data, i, ch);
8829    }
8830    return unicode_result(decimal);
8831}
8832/* --- Decimal Encoder ---------------------------------------------------- */
8833
8834int
8835PyUnicode_EncodeDecimal(Py_UNICODE *s,
8836                        Py_ssize_t length,
8837                        char *output,
8838                        const char *errors)
8839{
8840    PyObject *unicode;
8841    Py_ssize_t i;
8842    enum PyUnicode_Kind kind;
8843    void *data;
8844
8845    if (output == NULL) {
8846        PyErr_BadArgument();
8847        return -1;
8848    }
8849
8850    unicode = PyUnicode_FromUnicode(s, length);
8851    if (unicode == NULL)
8852        return -1;
8853
8854    if (PyUnicode_READY(unicode) < 0) {
8855        Py_DECREF(unicode);
8856        return -1;
8857    }
8858    kind = PyUnicode_KIND(unicode);
8859    data = PyUnicode_DATA(unicode);
8860
8861    for (i=0; i < length; ) {
8862        PyObject *exc;
8863        Py_UCS4 ch;
8864        int decimal;
8865        Py_ssize_t startpos;
8866
8867        ch = PyUnicode_READ(kind, data, i);
8868
8869        if (Py_UNICODE_ISSPACE(ch)) {
8870            *output++ = ' ';
8871            i++;
8872            continue;
8873        }
8874        decimal = Py_UNICODE_TODECIMAL(ch);
8875        if (decimal >= 0) {
8876            *output++ = '0' + decimal;
8877            i++;
8878            continue;
8879        }
8880        if (0 < ch && ch < 256) {
8881            *output++ = (char)ch;
8882            i++;
8883            continue;
8884        }
8885
8886        startpos = i;
8887        exc = NULL;
8888        raise_encode_exception(&exc, "decimal", unicode,
8889                               startpos, startpos+1,
8890                               "invalid decimal Unicode string");
8891        Py_XDECREF(exc);
8892        Py_DECREF(unicode);
8893        return -1;
8894    }
8895    /* 0-terminate the output string */
8896    *output++ = '\0';
8897    Py_DECREF(unicode);
8898    return 0;
8899}
8900
8901/* --- Helpers ------------------------------------------------------------ */
8902
8903static Py_ssize_t
8904any_find_slice(int direction, PyObject* s1, PyObject* s2,
8905               Py_ssize_t start,
8906               Py_ssize_t end)
8907{
8908    int kind1, kind2, kind;
8909    void *buf1, *buf2;
8910    Py_ssize_t len1, len2, result;
8911
8912    kind1 = PyUnicode_KIND(s1);
8913    kind2 = PyUnicode_KIND(s2);
8914    kind = kind1 > kind2 ? kind1 : kind2;
8915    buf1 = PyUnicode_DATA(s1);
8916    buf2 = PyUnicode_DATA(s2);
8917    if (kind1 != kind)
8918        buf1 = _PyUnicode_AsKind(s1, kind);
8919    if (!buf1)
8920        return -2;
8921    if (kind2 != kind)
8922        buf2 = _PyUnicode_AsKind(s2, kind);
8923    if (!buf2) {
8924        if (kind1 != kind) PyMem_Free(buf1);
8925        return -2;
8926    }
8927    len1 = PyUnicode_GET_LENGTH(s1);
8928    len2 = PyUnicode_GET_LENGTH(s2);
8929
8930    if (direction > 0) {
8931        switch(kind) {
8932        case PyUnicode_1BYTE_KIND:
8933            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8934                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8935            else
8936                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8937            break;
8938        case PyUnicode_2BYTE_KIND:
8939            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8940            break;
8941        case PyUnicode_4BYTE_KIND:
8942            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8943            break;
8944        default:
8945            assert(0); result = -2;
8946        }
8947    }
8948    else {
8949        switch(kind) {
8950        case PyUnicode_1BYTE_KIND:
8951            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8952                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8953            else
8954                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8955            break;
8956        case PyUnicode_2BYTE_KIND:
8957            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8958            break;
8959        case PyUnicode_4BYTE_KIND:
8960            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8961            break;
8962        default:
8963            assert(0); result = -2;
8964        }
8965    }
8966
8967    if (kind1 != kind)
8968        PyMem_Free(buf1);
8969    if (kind2 != kind)
8970        PyMem_Free(buf2);
8971
8972    return result;
8973}
8974
8975Py_ssize_t
8976_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
8977                                   Py_ssize_t n_buffer,
8978                                   void *digits, Py_ssize_t n_digits,
8979                                   Py_ssize_t min_width,
8980                                   const char *grouping,
8981                                   const char *thousands_sep)
8982{
8983    switch(kind) {
8984    case PyUnicode_1BYTE_KIND:
8985        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8986            return _PyUnicode_ascii_InsertThousandsGrouping(
8987                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8988                min_width, grouping, thousands_sep);
8989        else
8990            return _PyUnicode_ucs1_InsertThousandsGrouping(
8991                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8992                min_width, grouping, thousands_sep);
8993    case PyUnicode_2BYTE_KIND:
8994        return _PyUnicode_ucs2_InsertThousandsGrouping(
8995            (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8996            min_width, grouping, thousands_sep);
8997    case PyUnicode_4BYTE_KIND:
8998        return _PyUnicode_ucs4_InsertThousandsGrouping(
8999            (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9000            min_width, grouping, thousands_sep);
9001    }
9002    assert(0);
9003    return -1;
9004}
9005
9006
/* Helper macro to fix up start/end slice values against a sequence of
   length len: an end beyond len is clamped to len, negative values are
   offset by len, and anything still negative is clamped to 0.  start is
   not clamped against len.

   start and end are assigned to, so they must be modifiable lvalues,
   and every argument may be evaluated more than once.  The expansion is
   a bare sequence of if statements (there is no do { } while (0)
   wrapper), so use it only as a complete statement of its own, never as
   the unbraced body of an if/else or a loop. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
9021
9022Py_ssize_t
9023PyUnicode_Count(PyObject *str,
9024                PyObject *substr,
9025                Py_ssize_t start,
9026                Py_ssize_t end)
9027{
9028    Py_ssize_t result;
9029    PyObject* str_obj;
9030    PyObject* sub_obj;
9031    int kind1, kind2, kind;
9032    void *buf1 = NULL, *buf2 = NULL;
9033    Py_ssize_t len1, len2;
9034
9035    str_obj = PyUnicode_FromObject(str);
9036    if (!str_obj || PyUnicode_READY(str_obj) == -1)
9037        return -1;
9038    sub_obj = PyUnicode_FromObject(substr);
9039    if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
9040        Py_DECREF(str_obj);
9041        return -1;
9042    }
9043
9044    kind1 = PyUnicode_KIND(str_obj);
9045    kind2 = PyUnicode_KIND(sub_obj);
9046    kind = kind1 > kind2 ? kind1 : kind2;
9047    buf1 = PyUnicode_DATA(str_obj);
9048    if (kind1 != kind)
9049        buf1 = _PyUnicode_AsKind(str_obj, kind);
9050    if (!buf1)
9051        goto onError;
9052    buf2 = PyUnicode_DATA(sub_obj);
9053    if (kind2 != kind)
9054        buf2 = _PyUnicode_AsKind(sub_obj, kind);
9055    if (!buf2)
9056        goto onError;
9057    len1 = PyUnicode_GET_LENGTH(str_obj);
9058    len2 = PyUnicode_GET_LENGTH(sub_obj);
9059
9060    ADJUST_INDICES(start, end, len1);
9061    switch(kind) {
9062    case PyUnicode_1BYTE_KIND:
9063        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9064            result = asciilib_count(
9065                ((Py_UCS1*)buf1) + start, end - start,
9066                buf2, len2, PY_SSIZE_T_MAX
9067                );
9068        else
9069            result = ucs1lib_count(
9070                ((Py_UCS1*)buf1) + start, end - start,
9071                buf2, len2, PY_SSIZE_T_MAX
9072                );
9073        break;
9074    case PyUnicode_2BYTE_KIND:
9075        result = ucs2lib_count(
9076            ((Py_UCS2*)buf1) + start, end - start,
9077            buf2, len2, PY_SSIZE_T_MAX
9078            );
9079        break;
9080    case PyUnicode_4BYTE_KIND:
9081        result = ucs4lib_count(
9082            ((Py_UCS4*)buf1) + start, end - start,
9083            buf2, len2, PY_SSIZE_T_MAX
9084            );
9085        break;
9086    default:
9087        assert(0); result = 0;
9088    }
9089
9090    Py_DECREF(sub_obj);
9091    Py_DECREF(str_obj);
9092
9093    if (kind1 != kind)
9094        PyMem_Free(buf1);
9095    if (kind2 != kind)
9096        PyMem_Free(buf2);
9097
9098    return result;
9099  onError:
9100    Py_DECREF(sub_obj);
9101    Py_DECREF(str_obj);
9102    if (kind1 != kind && buf1)
9103        PyMem_Free(buf1);
9104    if (kind2 != kind && buf2)
9105        PyMem_Free(buf2);
9106    return -1;
9107}
9108
9109Py_ssize_t
9110PyUnicode_Find(PyObject *str,
9111               PyObject *sub,
9112               Py_ssize_t start,
9113               Py_ssize_t end,
9114               int direction)
9115{
9116    Py_ssize_t result;
9117
9118    str = PyUnicode_FromObject(str);
9119    if (!str || PyUnicode_READY(str) == -1)
9120        return -2;
9121    sub = PyUnicode_FromObject(sub);
9122    if (!sub || PyUnicode_READY(sub) == -1) {
9123        Py_DECREF(str);
9124        return -2;
9125    }
9126
9127    result = any_find_slice(direction,
9128        str, sub, start, end
9129        );
9130
9131    Py_DECREF(str);
9132    Py_DECREF(sub);
9133
9134    return result;
9135}
9136
9137Py_ssize_t
9138PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9139                   Py_ssize_t start, Py_ssize_t end,
9140                   int direction)
9141{
9142    int kind;
9143    Py_ssize_t result;
9144    if (PyUnicode_READY(str) == -1)
9145        return -2;
9146    if (start < 0 || end < 0) {
9147        PyErr_SetString(PyExc_IndexError, "string index out of range");
9148        return -2;
9149    }
9150    if (end > PyUnicode_GET_LENGTH(str))
9151        end = PyUnicode_GET_LENGTH(str);
9152    kind = PyUnicode_KIND(str);
9153    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9154                      kind, end-start, ch, direction);
9155    if (result == -1)
9156        return -1;
9157    else
9158        return start + result;
9159}
9160
9161static int
9162tailmatch(PyObject *self,
9163          PyObject *substring,
9164          Py_ssize_t start,
9165          Py_ssize_t end,
9166          int direction)
9167{
9168    int kind_self;
9169    int kind_sub;
9170    void *data_self;
9171    void *data_sub;
9172    Py_ssize_t offset;
9173    Py_ssize_t i;
9174    Py_ssize_t end_sub;
9175
9176    if (PyUnicode_READY(self) == -1 ||
9177        PyUnicode_READY(substring) == -1)
9178        return 0;
9179
9180    if (PyUnicode_GET_LENGTH(substring) == 0)
9181        return 1;
9182
9183    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9184    end -= PyUnicode_GET_LENGTH(substring);
9185    if (end < start)
9186        return 0;
9187
9188    kind_self = PyUnicode_KIND(self);
9189    data_self = PyUnicode_DATA(self);
9190    kind_sub = PyUnicode_KIND(substring);
9191    data_sub = PyUnicode_DATA(substring);
9192    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9193
9194    if (direction > 0)
9195        offset = end;
9196    else
9197        offset = start;
9198
9199    if (PyUnicode_READ(kind_self, data_self, offset) ==
9200        PyUnicode_READ(kind_sub, data_sub, 0) &&
9201        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9202        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9203        /* If both are of the same kind, memcmp is sufficient */
9204        if (kind_self == kind_sub) {
9205            return ! memcmp((char *)data_self +
9206                                (offset * PyUnicode_KIND(substring)),
9207                            data_sub,
9208                            PyUnicode_GET_LENGTH(substring) *
9209                                PyUnicode_KIND(substring));
9210        }
9211        /* otherwise we have to compare each character by first accesing it */
9212        else {
9213            /* We do not need to compare 0 and len(substring)-1 because
9214               the if statement above ensured already that they are equal
9215               when we end up here. */
9216            // TODO: honor direction and do a forward or backwards search
9217            for (i = 1; i < end_sub; ++i) {
9218                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9219                    PyUnicode_READ(kind_sub, data_sub, i))
9220                    return 0;
9221            }
9222            return 1;
9223        }
9224    }
9225
9226    return 0;
9227}
9228
9229Py_ssize_t
9230PyUnicode_Tailmatch(PyObject *str,
9231                    PyObject *substr,
9232                    Py_ssize_t start,
9233                    Py_ssize_t end,
9234                    int direction)
9235{
9236    Py_ssize_t result;
9237
9238    str = PyUnicode_FromObject(str);
9239    if (str == NULL)
9240        return -1;
9241    substr = PyUnicode_FromObject(substr);
9242    if (substr == NULL) {
9243        Py_DECREF(str);
9244        return -1;
9245    }
9246
9247    result = tailmatch(str, substr,
9248                       start, end, direction);
9249    Py_DECREF(str);
9250    Py_DECREF(substr);
9251    return result;
9252}
9253
9254/* Apply fixfct filter to the Unicode object self and return a
9255   reference to the modified object */
9256
9257static PyObject *
9258fixup(PyObject *self,
9259      Py_UCS4 (*fixfct)(PyObject *s))
9260{
9261    PyObject *u;
9262    Py_UCS4 maxchar_old, maxchar_new = 0;
9263
9264    u = PyUnicode_Copy(self);
9265    if (u == NULL)
9266        return NULL;
9267    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9268
9269    /* fix functions return the new maximum character in a string,
9270       if the kind of the resulting unicode object does not change,
9271       everything is fine.  Otherwise we need to change the string kind
9272       and re-run the fix function. */
9273    maxchar_new = fixfct(u);
9274    if (maxchar_new == 0)
9275        /* do nothing, keep maxchar_new at 0 which means no changes. */;
9276    else if (maxchar_new <= 127)
9277        maxchar_new = 127;
9278    else if (maxchar_new <= 255)
9279        maxchar_new = 255;
9280    else if (maxchar_new <= 65535)
9281        maxchar_new = 65535;
9282    else
9283        maxchar_new = MAX_UNICODE;
9284
9285    if (!maxchar_new && PyUnicode_CheckExact(self)) {
9286        /* fixfct should return TRUE if it modified the buffer. If
9287           FALSE, return a reference to the original buffer instead
9288           (to save space, not time) */
9289        Py_INCREF(self);
9290        Py_DECREF(u);
9291        return self;
9292    }
9293    else if (maxchar_new == maxchar_old) {
9294        return u;
9295    }
9296    else {
9297        /* In case the maximum character changed, we need to
9298           convert the string to the new category. */
9299        PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9300        if (v == NULL) {
9301            Py_DECREF(u);
9302            return NULL;
9303        }
9304        if (maxchar_new > maxchar_old) {
9305            /* If the maxchar increased so that the kind changed, not all
9306               characters are representable anymore and we need to fix the
9307               string again. This only happens in very few cases. */
9308            copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9309            maxchar_old = fixfct(v);
9310            assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9311        }
9312        else {
9313            copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
9314        }
9315
9316        Py_DECREF(u);
9317        assert(_PyUnicode_CheckConsistency(v, 1));
9318        return v;
9319    }
9320}
9321
9322static Py_UCS4
9323fixupper(PyObject *self)
9324{
9325    /* No need to call PyUnicode_READY(self) because this function is only
9326       called as a callback from fixup() which does it already. */
9327    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9328    const int kind = PyUnicode_KIND(self);
9329    void *data = PyUnicode_DATA(self);
9330    int touched = 0;
9331    Py_UCS4 maxchar = 0;
9332    Py_ssize_t i;
9333
9334    for (i = 0; i < len; ++i) {
9335        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9336        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9337        if (up != ch) {
9338            if (up > maxchar)
9339                maxchar = up;
9340            PyUnicode_WRITE(kind, data, i, up);
9341            touched = 1;
9342        }
9343        else if (ch > maxchar)
9344            maxchar = ch;
9345    }
9346
9347    if (touched)
9348        return maxchar;
9349    else
9350        return 0;
9351}
9352
9353static Py_UCS4
9354fixlower(PyObject *self)
9355{
9356    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9357    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9358    const int kind = PyUnicode_KIND(self);
9359    void *data = PyUnicode_DATA(self);
9360    int touched = 0;
9361    Py_UCS4 maxchar = 0;
9362    Py_ssize_t i;
9363
9364    for(i = 0; i < len; ++i) {
9365        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9366        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9367        if (lo != ch) {
9368            if (lo > maxchar)
9369                maxchar = lo;
9370            PyUnicode_WRITE(kind, data, i, lo);
9371            touched = 1;
9372        }
9373        else if (ch > maxchar)
9374            maxchar = ch;
9375    }
9376
9377    if (touched)
9378        return maxchar;
9379    else
9380        return 0;
9381}
9382
9383static Py_UCS4
9384fixswapcase(PyObject *self)
9385{
9386    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9387    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9388    const int kind = PyUnicode_KIND(self);
9389    void *data = PyUnicode_DATA(self);
9390    int touched = 0;
9391    Py_UCS4 maxchar = 0;
9392    Py_ssize_t i;
9393
9394    for(i = 0; i < len; ++i) {
9395        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9396        Py_UCS4 nu = 0;
9397
9398        if (Py_UNICODE_ISUPPER(ch))
9399            nu = Py_UNICODE_TOLOWER(ch);
9400        else if (Py_UNICODE_ISLOWER(ch))
9401            nu = Py_UNICODE_TOUPPER(ch);
9402
9403        if (nu != 0) {
9404            if (nu > maxchar)
9405                maxchar = nu;
9406            PyUnicode_WRITE(kind, data, i, nu);
9407            touched = 1;
9408        }
9409        else if (ch > maxchar)
9410            maxchar = ch;
9411    }
9412
9413    if (touched)
9414        return maxchar;
9415    else
9416        return 0;
9417}
9418
9419static Py_UCS4
9420fixcapitalize(PyObject *self)
9421{
9422    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9423    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9424    const int kind = PyUnicode_KIND(self);
9425    void *data = PyUnicode_DATA(self);
9426    int touched = 0;
9427    Py_UCS4 maxchar = 0;
9428    Py_ssize_t i = 0;
9429    Py_UCS4 ch;
9430
9431    if (len == 0)
9432        return 0;
9433
9434    ch = PyUnicode_READ(kind, data, i);
9435    if (!Py_UNICODE_ISUPPER(ch)) {
9436        maxchar = Py_UNICODE_TOUPPER(ch);
9437        PyUnicode_WRITE(kind, data, i, maxchar);
9438        touched = 1;
9439    }
9440    ++i;
9441    for(; i < len; ++i) {
9442        ch = PyUnicode_READ(kind, data, i);
9443        if (!Py_UNICODE_ISLOWER(ch)) {
9444            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9445            if (lo > maxchar)
9446                maxchar = lo;
9447            PyUnicode_WRITE(kind, data, i, lo);
9448            touched = 1;
9449        }
9450        else if (ch > maxchar)
9451            maxchar = ch;
9452    }
9453
9454    if (touched)
9455        return maxchar;
9456    else
9457        return 0;
9458}
9459
9460static Py_UCS4
9461fixtitle(PyObject *self)
9462{
9463    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9464    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9465    const int kind = PyUnicode_KIND(self);
9466    void *data = PyUnicode_DATA(self);
9467    Py_UCS4 maxchar = 0;
9468    Py_ssize_t i = 0;
9469    int previous_is_cased;
9470
9471    /* Shortcut for single character strings */
9472    if (len == 1) {
9473        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9474        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9475        if (ti != ch) {
9476            PyUnicode_WRITE(kind, data, i, ti);
9477            return ti;
9478        }
9479        else
9480            return 0;
9481    }
9482    previous_is_cased = 0;
9483    for(; i < len; ++i) {
9484        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9485        Py_UCS4 nu;
9486
9487        if (previous_is_cased)
9488            nu = Py_UNICODE_TOLOWER(ch);
9489        else
9490            nu = Py_UNICODE_TOTITLE(ch);
9491
9492        if (nu > maxchar)
9493            maxchar = nu;
9494        PyUnicode_WRITE(kind, data, i, nu);
9495
9496        if (Py_UNICODE_ISLOWER(ch) ||
9497            Py_UNICODE_ISUPPER(ch) ||
9498            Py_UNICODE_ISTITLE(ch))
9499            previous_is_cased = 1;
9500        else
9501            previous_is_cased = 0;
9502    }
9503    return maxchar;
9504}
9505
9506PyObject *
9507PyUnicode_Join(PyObject *separator, PyObject *seq)
9508{
9509    PyObject *sep = NULL;
9510    Py_ssize_t seplen;
9511    PyObject *res = NULL; /* the result */
9512    PyObject *fseq;          /* PySequence_Fast(seq) */
9513    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9514    PyObject **items;
9515    PyObject *item;
9516    Py_ssize_t sz, i, res_offset;
9517    Py_UCS4 maxchar;
9518    Py_UCS4 item_maxchar;
9519    int use_memcpy;
9520    unsigned char *res_data = NULL, *sep_data = NULL;
9521    PyObject *last_obj;
9522    unsigned int kind = 0;
9523
9524    fseq = PySequence_Fast(seq, "");
9525    if (fseq == NULL) {
9526        return NULL;
9527    }
9528
9529    /* NOTE: the following code can't call back into Python code,
9530     * so we are sure that fseq won't be mutated.
9531     */
9532
9533    seqlen = PySequence_Fast_GET_SIZE(fseq);
9534    /* If empty sequence, return u"". */
9535    if (seqlen == 0) {
9536        Py_DECREF(fseq);
9537        Py_INCREF(unicode_empty);
9538        res = unicode_empty;
9539        return res;
9540    }
9541
9542    /* If singleton sequence with an exact Unicode, return that. */
9543    last_obj = NULL;
9544    items = PySequence_Fast_ITEMS(fseq);
9545    if (seqlen == 1) {
9546        if (PyUnicode_CheckExact(items[0])) {
9547            res = items[0];
9548            Py_INCREF(res);
9549            Py_DECREF(fseq);
9550            return res;
9551        }
9552        seplen = 0;
9553        maxchar = 0;
9554    }
9555    else {
9556        /* Set up sep and seplen */
9557        if (separator == NULL) {
9558            /* fall back to a blank space separator */
9559            sep = PyUnicode_FromOrdinal(' ');
9560            if (!sep)
9561                goto onError;
9562            seplen = 1;
9563            maxchar = 32;
9564        }
9565        else {
9566            if (!PyUnicode_Check(separator)) {
9567                PyErr_Format(PyExc_TypeError,
9568                             "separator: expected str instance,"
9569                             " %.80s found",
9570                             Py_TYPE(separator)->tp_name);
9571                goto onError;
9572            }
9573            if (PyUnicode_READY(separator))
9574                goto onError;
9575            sep = separator;
9576            seplen = PyUnicode_GET_LENGTH(separator);
9577            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9578            /* inc refcount to keep this code path symmetric with the
9579               above case of a blank separator */
9580            Py_INCREF(sep);
9581        }
9582        last_obj = sep;
9583    }
9584
9585    /* There are at least two things to join, or else we have a subclass
9586     * of str in the sequence.
9587     * Do a pre-pass to figure out the total amount of space we'll
9588     * need (sz), and see whether all argument are strings.
9589     */
9590    sz = 0;
9591#ifdef Py_DEBUG
9592    use_memcpy = 0;
9593#else
9594    use_memcpy = 1;
9595#endif
9596    for (i = 0; i < seqlen; i++) {
9597        const Py_ssize_t old_sz = sz;
9598        item = items[i];
9599        if (!PyUnicode_Check(item)) {
9600            PyErr_Format(PyExc_TypeError,
9601                         "sequence item %zd: expected str instance,"
9602                         " %.80s found",
9603                         i, Py_TYPE(item)->tp_name);
9604            goto onError;
9605        }
9606        if (PyUnicode_READY(item) == -1)
9607            goto onError;
9608        sz += PyUnicode_GET_LENGTH(item);
9609        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9610        maxchar = Py_MAX(maxchar, item_maxchar);
9611        if (i != 0)
9612            sz += seplen;
9613        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9614            PyErr_SetString(PyExc_OverflowError,
9615                            "join() result is too long for a Python string");
9616            goto onError;
9617        }
9618        if (use_memcpy && last_obj != NULL) {
9619            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9620                use_memcpy = 0;
9621        }
9622        last_obj = item;
9623    }
9624
9625    res = PyUnicode_New(sz, maxchar);
9626    if (res == NULL)
9627        goto onError;
9628
9629    /* Catenate everything. */
9630#ifdef Py_DEBUG
9631    use_memcpy = 0;
9632#else
9633    if (use_memcpy) {
9634        res_data = PyUnicode_1BYTE_DATA(res);
9635        kind = PyUnicode_KIND(res);
9636        if (seplen != 0)
9637            sep_data = PyUnicode_1BYTE_DATA(sep);
9638    }
9639#endif
9640    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9641        Py_ssize_t itemlen;
9642        item = items[i];
9643        /* Copy item, and maybe the separator. */
9644        if (i && seplen != 0) {
9645            if (use_memcpy) {
9646                Py_MEMCPY(res_data,
9647                          sep_data,
9648                          kind * seplen);
9649                res_data += kind * seplen;
9650            }
9651            else {
9652                copy_characters(res, res_offset, sep, 0, seplen);
9653                res_offset += seplen;
9654            }
9655        }
9656        itemlen = PyUnicode_GET_LENGTH(item);
9657        if (itemlen != 0) {
9658            if (use_memcpy) {
9659                Py_MEMCPY(res_data,
9660                          PyUnicode_DATA(item),
9661                          kind * itemlen);
9662                res_data += kind * itemlen;
9663            }
9664            else {
9665                copy_characters(res, res_offset, item, 0, itemlen);
9666                res_offset += itemlen;
9667            }
9668        }
9669    }
9670    if (use_memcpy)
9671        assert(res_data == PyUnicode_1BYTE_DATA(res)
9672                           + kind * PyUnicode_GET_LENGTH(res));
9673    else
9674        assert(res_offset == PyUnicode_GET_LENGTH(res));
9675
9676    Py_DECREF(fseq);
9677    Py_XDECREF(sep);
9678    assert(_PyUnicode_CheckConsistency(res, 1));
9679    return res;
9680
9681  onError:
9682    Py_DECREF(fseq);
9683    Py_XDECREF(sep);
9684    Py_XDECREF(res);
9685    return NULL;
9686}
9687
9688#define FILL(kind, data, value, start, length) \
9689    do { \
9690        Py_ssize_t i_ = 0; \
9691        assert(kind != PyUnicode_WCHAR_KIND); \
9692        switch ((kind)) { \
9693        case PyUnicode_1BYTE_KIND: { \
9694            unsigned char * to_ = (unsigned char *)((data)) + (start); \
9695            memset(to_, (unsigned char)value, length); \
9696            break; \
9697        } \
9698        case PyUnicode_2BYTE_KIND: { \
9699            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9700            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9701            break; \
9702        } \
9703        default: { \
9704            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9705            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9706            break; \
9707        } \
9708        } \
9709    } while (0)
9710
9711static PyObject *
9712pad(PyObject *self,
9713    Py_ssize_t left,
9714    Py_ssize_t right,
9715    Py_UCS4 fill)
9716{
9717    PyObject *u;
9718    Py_UCS4 maxchar;
9719    int kind;
9720    void *data;
9721
9722    if (left < 0)
9723        left = 0;
9724    if (right < 0)
9725        right = 0;
9726
9727    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
9728        Py_INCREF(self);
9729        return self;
9730    }
9731
9732    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9733        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9734        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9735        return NULL;
9736    }
9737    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9738    if (fill > maxchar)
9739        maxchar = fill;
9740    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9741    if (!u)
9742        return NULL;
9743
9744    kind = PyUnicode_KIND(u);
9745    data = PyUnicode_DATA(u);
9746    if (left)
9747        FILL(kind, data, fill, 0, left);
9748    if (right)
9749        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9750    copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
9751    assert(_PyUnicode_CheckConsistency(u, 1));
9752    return u;
9753}
9754#undef FILL
9755
9756PyObject *
9757PyUnicode_Splitlines(PyObject *string, int keepends)
9758{
9759    PyObject *list;
9760
9761    string = PyUnicode_FromObject(string);
9762    if (string == NULL || PyUnicode_READY(string) == -1)
9763        return NULL;
9764
9765    switch(PyUnicode_KIND(string)) {
9766    case PyUnicode_1BYTE_KIND:
9767        if (PyUnicode_IS_ASCII(string))
9768            list = asciilib_splitlines(
9769                string, PyUnicode_1BYTE_DATA(string),
9770                PyUnicode_GET_LENGTH(string), keepends);
9771        else
9772            list = ucs1lib_splitlines(
9773                string, PyUnicode_1BYTE_DATA(string),
9774                PyUnicode_GET_LENGTH(string), keepends);
9775        break;
9776    case PyUnicode_2BYTE_KIND:
9777        list = ucs2lib_splitlines(
9778            string, PyUnicode_2BYTE_DATA(string),
9779            PyUnicode_GET_LENGTH(string), keepends);
9780        break;
9781    case PyUnicode_4BYTE_KIND:
9782        list = ucs4lib_splitlines(
9783            string, PyUnicode_4BYTE_DATA(string),
9784            PyUnicode_GET_LENGTH(string), keepends);
9785        break;
9786    default:
9787        assert(0);
9788        list = 0;
9789    }
9790    Py_DECREF(string);
9791    return list;
9792}
9793
9794static PyObject *
9795split(PyObject *self,
9796      PyObject *substring,
9797      Py_ssize_t maxcount)
9798{
9799    int kind1, kind2, kind;
9800    void *buf1, *buf2;
9801    Py_ssize_t len1, len2;
9802    PyObject* out;
9803
9804    if (maxcount < 0)
9805        maxcount = PY_SSIZE_T_MAX;
9806
9807    if (PyUnicode_READY(self) == -1)
9808        return NULL;
9809
9810    if (substring == NULL)
9811        switch(PyUnicode_KIND(self)) {
9812        case PyUnicode_1BYTE_KIND:
9813            if (PyUnicode_IS_ASCII(self))
9814                return asciilib_split_whitespace(
9815                    self,  PyUnicode_1BYTE_DATA(self),
9816                    PyUnicode_GET_LENGTH(self), maxcount
9817                    );
9818            else
9819                return ucs1lib_split_whitespace(
9820                    self,  PyUnicode_1BYTE_DATA(self),
9821                    PyUnicode_GET_LENGTH(self), maxcount
9822                    );
9823        case PyUnicode_2BYTE_KIND:
9824            return ucs2lib_split_whitespace(
9825                self,  PyUnicode_2BYTE_DATA(self),
9826                PyUnicode_GET_LENGTH(self), maxcount
9827                );
9828        case PyUnicode_4BYTE_KIND:
9829            return ucs4lib_split_whitespace(
9830                self,  PyUnicode_4BYTE_DATA(self),
9831                PyUnicode_GET_LENGTH(self), maxcount
9832                );
9833        default:
9834            assert(0);
9835            return NULL;
9836        }
9837
9838    if (PyUnicode_READY(substring) == -1)
9839        return NULL;
9840
9841    kind1 = PyUnicode_KIND(self);
9842    kind2 = PyUnicode_KIND(substring);
9843    kind = kind1 > kind2 ? kind1 : kind2;
9844    buf1 = PyUnicode_DATA(self);
9845    buf2 = PyUnicode_DATA(substring);
9846    if (kind1 != kind)
9847        buf1 = _PyUnicode_AsKind(self, kind);
9848    if (!buf1)
9849        return NULL;
9850    if (kind2 != kind)
9851        buf2 = _PyUnicode_AsKind(substring, kind);
9852    if (!buf2) {
9853        if (kind1 != kind) PyMem_Free(buf1);
9854        return NULL;
9855    }
9856    len1 = PyUnicode_GET_LENGTH(self);
9857    len2 = PyUnicode_GET_LENGTH(substring);
9858
9859    switch(kind) {
9860    case PyUnicode_1BYTE_KIND:
9861        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9862            out = asciilib_split(
9863                self,  buf1, len1, buf2, len2, maxcount);
9864        else
9865            out = ucs1lib_split(
9866                self,  buf1, len1, buf2, len2, maxcount);
9867        break;
9868    case PyUnicode_2BYTE_KIND:
9869        out = ucs2lib_split(
9870            self,  buf1, len1, buf2, len2, maxcount);
9871        break;
9872    case PyUnicode_4BYTE_KIND:
9873        out = ucs4lib_split(
9874            self,  buf1, len1, buf2, len2, maxcount);
9875        break;
9876    default:
9877        out = NULL;
9878    }
9879    if (kind1 != kind)
9880        PyMem_Free(buf1);
9881    if (kind2 != kind)
9882        PyMem_Free(buf2);
9883    return out;
9884}
9885
9886static PyObject *
9887rsplit(PyObject *self,
9888       PyObject *substring,
9889       Py_ssize_t maxcount)
9890{
9891    int kind1, kind2, kind;
9892    void *buf1, *buf2;
9893    Py_ssize_t len1, len2;
9894    PyObject* out;
9895
9896    if (maxcount < 0)
9897        maxcount = PY_SSIZE_T_MAX;
9898
9899    if (PyUnicode_READY(self) == -1)
9900        return NULL;
9901
9902    if (substring == NULL)
9903        switch(PyUnicode_KIND(self)) {
9904        case PyUnicode_1BYTE_KIND:
9905            if (PyUnicode_IS_ASCII(self))
9906                return asciilib_rsplit_whitespace(
9907                    self,  PyUnicode_1BYTE_DATA(self),
9908                    PyUnicode_GET_LENGTH(self), maxcount
9909                    );
9910            else
9911                return ucs1lib_rsplit_whitespace(
9912                    self,  PyUnicode_1BYTE_DATA(self),
9913                    PyUnicode_GET_LENGTH(self), maxcount
9914                    );
9915        case PyUnicode_2BYTE_KIND:
9916            return ucs2lib_rsplit_whitespace(
9917                self,  PyUnicode_2BYTE_DATA(self),
9918                PyUnicode_GET_LENGTH(self), maxcount
9919                );
9920        case PyUnicode_4BYTE_KIND:
9921            return ucs4lib_rsplit_whitespace(
9922                self,  PyUnicode_4BYTE_DATA(self),
9923                PyUnicode_GET_LENGTH(self), maxcount
9924                );
9925        default:
9926            assert(0);
9927            return NULL;
9928        }
9929
9930    if (PyUnicode_READY(substring) == -1)
9931        return NULL;
9932
9933    kind1 = PyUnicode_KIND(self);
9934    kind2 = PyUnicode_KIND(substring);
9935    kind = kind1 > kind2 ? kind1 : kind2;
9936    buf1 = PyUnicode_DATA(self);
9937    buf2 = PyUnicode_DATA(substring);
9938    if (kind1 != kind)
9939        buf1 = _PyUnicode_AsKind(self, kind);
9940    if (!buf1)
9941        return NULL;
9942    if (kind2 != kind)
9943        buf2 = _PyUnicode_AsKind(substring, kind);
9944    if (!buf2) {
9945        if (kind1 != kind) PyMem_Free(buf1);
9946        return NULL;
9947    }
9948    len1 = PyUnicode_GET_LENGTH(self);
9949    len2 = PyUnicode_GET_LENGTH(substring);
9950
9951    switch(kind) {
9952    case PyUnicode_1BYTE_KIND:
9953        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9954            out = asciilib_rsplit(
9955                self,  buf1, len1, buf2, len2, maxcount);
9956        else
9957            out = ucs1lib_rsplit(
9958                self,  buf1, len1, buf2, len2, maxcount);
9959        break;
9960    case PyUnicode_2BYTE_KIND:
9961        out = ucs2lib_rsplit(
9962            self,  buf1, len1, buf2, len2, maxcount);
9963        break;
9964    case PyUnicode_4BYTE_KIND:
9965        out = ucs4lib_rsplit(
9966            self,  buf1, len1, buf2, len2, maxcount);
9967        break;
9968    default:
9969        out = NULL;
9970    }
9971    if (kind1 != kind)
9972        PyMem_Free(buf1);
9973    if (kind2 != kind)
9974        PyMem_Free(buf2);
9975    return out;
9976}
9977
9978static Py_ssize_t
9979anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9980            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9981{
9982    switch(kind) {
9983    case PyUnicode_1BYTE_KIND:
9984        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9985            return asciilib_find(buf1, len1, buf2, len2, offset);
9986        else
9987            return ucs1lib_find(buf1, len1, buf2, len2, offset);
9988    case PyUnicode_2BYTE_KIND:
9989        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9990    case PyUnicode_4BYTE_KIND:
9991        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9992    }
9993    assert(0);
9994    return -1;
9995}
9996
9997static Py_ssize_t
9998anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9999             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10000{
10001        switch(kind) {
10002        case PyUnicode_1BYTE_KIND:
10003            if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10004                return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10005            else
10006                return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10007        case PyUnicode_2BYTE_KIND:
10008            return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10009        case PyUnicode_4BYTE_KIND:
10010            return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10011        }
10012        assert(0);
10013        return 0;
10014}
10015
10016static PyObject *
10017replace(PyObject *self, PyObject *str1,
10018        PyObject *str2, Py_ssize_t maxcount)
10019{
10020    PyObject *u;
10021    char *sbuf = PyUnicode_DATA(self);
10022    char *buf1 = PyUnicode_DATA(str1);
10023    char *buf2 = PyUnicode_DATA(str2);
10024    int srelease = 0, release1 = 0, release2 = 0;
10025    int skind = PyUnicode_KIND(self);
10026    int kind1 = PyUnicode_KIND(str1);
10027    int kind2 = PyUnicode_KIND(str2);
10028    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10029    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10030    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10031    int mayshrink;
10032    Py_UCS4 maxchar, maxchar_str2;
10033
10034    if (maxcount < 0)
10035        maxcount = PY_SSIZE_T_MAX;
10036    else if (maxcount == 0 || slen == 0)
10037        goto nothing;
10038
10039    if (str1 == str2)
10040        goto nothing;
10041    if (skind < kind1)
10042        /* substring too wide to be present */
10043        goto nothing;
10044
10045    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10046    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10047    /* Replacing str1 with str2 may cause a maxchar reduction in the
10048       result string. */
10049    mayshrink = (maxchar_str2 < maxchar);
10050    maxchar = Py_MAX(maxchar, maxchar_str2);
10051
10052    if (len1 == len2) {
10053        Py_ssize_t i;
10054        /* same length */
10055        if (len1 == 0)
10056            goto nothing;
10057        if (len1 == 1) {
10058            /* replace characters */
10059            Py_UCS4 u1, u2;
10060            int rkind;
10061            u1 = PyUnicode_READ_CHAR(str1, 0);
10062            if (findchar(sbuf, PyUnicode_KIND(self),
10063                         slen, u1, 1) < 0)
10064                goto nothing;
10065            u2 = PyUnicode_READ_CHAR(str2, 0);
10066            u = PyUnicode_New(slen, maxchar);
10067            if (!u)
10068                goto error;
10069            copy_characters(u, 0, self, 0, slen);
10070            rkind = PyUnicode_KIND(u);
10071            for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10072                if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
10073                    if (--maxcount < 0)
10074                        break;
10075                    PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
10076                }
10077        }
10078        else {
10079            int rkind = skind;
10080            char *res;
10081
10082            if (kind1 < rkind) {
10083                /* widen substring */
10084                buf1 = _PyUnicode_AsKind(str1, rkind);
10085                if (!buf1) goto error;
10086                release1 = 1;
10087            }
10088            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10089            if (i < 0)
10090                goto nothing;
10091            if (rkind > kind2) {
10092                /* widen replacement */
10093                buf2 = _PyUnicode_AsKind(str2, rkind);
10094                if (!buf2) goto error;
10095                release2 = 1;
10096            }
10097            else if (rkind < kind2) {
10098                /* widen self and buf1 */
10099                rkind = kind2;
10100                if (release1) PyMem_Free(buf1);
10101                sbuf = _PyUnicode_AsKind(self, rkind);
10102                if (!sbuf) goto error;
10103                srelease = 1;
10104                buf1 = _PyUnicode_AsKind(str1, rkind);
10105                if (!buf1) goto error;
10106                release1 = 1;
10107            }
10108            u = PyUnicode_New(slen, maxchar);
10109            if (!u)
10110                goto error;
10111            assert(PyUnicode_KIND(u) == rkind);
10112            res = PyUnicode_DATA(u);
10113
10114            memcpy(res, sbuf, rkind * slen);
10115            /* change everything in-place, starting with this one */
10116            memcpy(res + rkind * i,
10117                   buf2,
10118                   rkind * len2);
10119            i += len1;
10120
10121            while ( --maxcount > 0) {
10122                i = anylib_find(rkind, self,
10123                                sbuf+rkind*i, slen-i,
10124                                str1, buf1, len1, i);
10125                if (i == -1)
10126                    break;
10127                memcpy(res + rkind * i,
10128                       buf2,
10129                       rkind * len2);
10130                i += len1;
10131            }
10132        }
10133    }
10134    else {
10135        Py_ssize_t n, i, j, ires;
10136        Py_ssize_t product, new_size;
10137        int rkind = skind;
10138        char *res;
10139
10140        if (kind1 < rkind) {
10141            /* widen substring */
10142            buf1 = _PyUnicode_AsKind(str1, rkind);
10143            if (!buf1) goto error;
10144            release1 = 1;
10145        }
10146        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10147        if (n == 0)
10148            goto nothing;
10149        if (kind2 < rkind) {
10150            /* widen replacement */
10151            buf2 = _PyUnicode_AsKind(str2, rkind);
10152            if (!buf2) goto error;
10153            release2 = 1;
10154        }
10155        else if (kind2 > rkind) {
10156            /* widen self and buf1 */
10157            rkind = kind2;
10158            sbuf = _PyUnicode_AsKind(self, rkind);
10159            if (!sbuf) goto error;
10160            srelease = 1;
10161            if (release1) PyMem_Free(buf1);
10162            buf1 = _PyUnicode_AsKind(str1, rkind);
10163            if (!buf1) goto error;
10164            release1 = 1;
10165        }
10166        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10167           PyUnicode_GET_LENGTH(str1))); */
10168        product = n * (len2-len1);
10169        if ((product / (len2-len1)) != n) {
10170                PyErr_SetString(PyExc_OverflowError,
10171                                "replace string is too long");
10172                goto error;
10173        }
10174        new_size = slen + product;
10175        if (new_size == 0) {
10176            Py_INCREF(unicode_empty);
10177            u = unicode_empty;
10178            goto done;
10179        }
10180        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10181            PyErr_SetString(PyExc_OverflowError,
10182                            "replace string is too long");
10183            goto error;
10184        }
10185        u = PyUnicode_New(new_size, maxchar);
10186        if (!u)
10187            goto error;
10188        assert(PyUnicode_KIND(u) == rkind);
10189        res = PyUnicode_DATA(u);
10190        ires = i = 0;
10191        if (len1 > 0) {
10192            while (n-- > 0) {
10193                /* look for next match */
10194                j = anylib_find(rkind, self,
10195                                sbuf + rkind * i, slen-i,
10196                                str1, buf1, len1, i);
10197                if (j == -1)
10198                    break;
10199                else if (j > i) {
10200                    /* copy unchanged part [i:j] */
10201                    memcpy(res + rkind * ires,
10202                           sbuf + rkind * i,
10203                           rkind * (j-i));
10204                    ires += j - i;
10205                }
10206                /* copy substitution string */
10207                if (len2 > 0) {
10208                    memcpy(res + rkind * ires,
10209                           buf2,
10210                           rkind * len2);
10211                    ires += len2;
10212                }
10213                i = j + len1;
10214            }
10215            if (i < slen)
10216                /* copy tail [i:] */
10217                memcpy(res + rkind * ires,
10218                       sbuf + rkind * i,
10219                       rkind * (slen-i));
10220        }
10221        else {
10222            /* interleave */
10223            while (n > 0) {
10224                memcpy(res + rkind * ires,
10225                       buf2,
10226                       rkind * len2);
10227                ires += len2;
10228                if (--n <= 0)
10229                    break;
10230                memcpy(res + rkind * ires,
10231                       sbuf + rkind * i,
10232                       rkind);
10233                ires++;
10234                i++;
10235            }
10236            memcpy(res + rkind * ires,
10237                   sbuf + rkind * i,
10238                   rkind * (slen-i));
10239        }
10240    }
10241
10242    if (mayshrink) {
10243        unicode_adjust_maxchar(&u);
10244        if (u == NULL)
10245            goto error;
10246    }
10247
10248  done:
10249    if (srelease)
10250        PyMem_FREE(sbuf);
10251    if (release1)
10252        PyMem_FREE(buf1);
10253    if (release2)
10254        PyMem_FREE(buf2);
10255    assert(_PyUnicode_CheckConsistency(u, 1));
10256    return u;
10257
10258  nothing:
10259    /* nothing to replace; return original string (when possible) */
10260    if (srelease)
10261        PyMem_FREE(sbuf);
10262    if (release1)
10263        PyMem_FREE(buf1);
10264    if (release2)
10265        PyMem_FREE(buf2);
10266    if (PyUnicode_CheckExact(self)) {
10267        Py_INCREF(self);
10268        return self;
10269    }
10270    return PyUnicode_Copy(self);
10271  error:
10272    if (srelease && sbuf)
10273        PyMem_FREE(sbuf);
10274    if (release1 && buf1)
10275        PyMem_FREE(buf1);
10276    if (release2 && buf2)
10277        PyMem_FREE(buf2);
10278    return NULL;
10279}
10280
10281/* --- Unicode Object Methods --------------------------------------------- */
10282
10283PyDoc_STRVAR(title__doc__,
10284             "S.title() -> str\n\
10285\n\
10286Return a titlecased version of S, i.e. words start with title case\n\
10287characters, all remaining cased characters have lower case.");
10288
10289static PyObject*
10290unicode_title(PyObject *self)
10291{
10292    return fixup(self, fixtitle);
10293}
10294
10295PyDoc_STRVAR(capitalize__doc__,
10296             "S.capitalize() -> str\n\
10297\n\
10298Return a capitalized version of S, i.e. make the first character\n\
10299have upper case and the rest lower case.");
10300
10301static PyObject*
10302unicode_capitalize(PyObject *self)
10303{
10304    return fixup(self, fixcapitalize);
10305}
10306
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self into words, capitalizes every word in place
   inside the list, then joins the list back into a single string.
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject *
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup(old_word, fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10344
10345/* Argument converter.  Coerces to a single unicode character */
10346
10347static int
10348convert_uc(PyObject *obj, void *addr)
10349{
10350    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10351    PyObject *uniobj;
10352
10353    uniobj = PyUnicode_FromObject(obj);
10354    if (uniobj == NULL) {
10355        PyErr_SetString(PyExc_TypeError,
10356                        "The fill character cannot be converted to Unicode");
10357        return 0;
10358    }
10359    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10360        PyErr_SetString(PyExc_TypeError,
10361                        "The fill character must be exactly one character long");
10362        Py_DECREF(uniobj);
10363        return 0;
10364    }
10365    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10366    Py_DECREF(uniobj);
10367    return 1;
10368}
10369
10370PyDoc_STRVAR(center__doc__,
10371             "S.center(width[, fillchar]) -> str\n\
10372\n\
10373Return S centered in a string of length width. Padding is\n\
10374done using the specified fill character (default is a space)");
10375
10376static PyObject *
10377unicode_center(PyObject *self, PyObject *args)
10378{
10379    Py_ssize_t marg, left;
10380    Py_ssize_t width;
10381    Py_UCS4 fillchar = ' ';
10382
10383    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10384        return NULL;
10385
10386    if (PyUnicode_READY(self) == -1)
10387        return NULL;
10388
10389    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
10390        Py_INCREF(self);
10391        return self;
10392    }
10393
10394    marg = width - _PyUnicode_LENGTH(self);
10395    left = marg / 2 + (marg & width & 1);
10396
10397    return pad(self, left, marg - left, fillchar);
10398}
10399
10400/* This function assumes that str1 and str2 are readied by the caller. */
10401
10402static int
10403unicode_compare(PyObject *str1, PyObject *str2)
10404{
10405    int kind1, kind2;
10406    void *data1, *data2;
10407    Py_ssize_t len1, len2, i;
10408
10409    kind1 = PyUnicode_KIND(str1);
10410    kind2 = PyUnicode_KIND(str2);
10411    data1 = PyUnicode_DATA(str1);
10412    data2 = PyUnicode_DATA(str2);
10413    len1 = PyUnicode_GET_LENGTH(str1);
10414    len2 = PyUnicode_GET_LENGTH(str2);
10415
10416    for (i = 0; i < len1 && i < len2; ++i) {
10417        Py_UCS4 c1, c2;
10418        c1 = PyUnicode_READ(kind1, data1, i);
10419        c2 = PyUnicode_READ(kind2, data2, i);
10420
10421        if (c1 != c2)
10422            return (c1 < c2) ? -1 : 1;
10423    }
10424
10425    return (len1 < len2) ? -1 : (len1 != len2);
10426}
10427
10428int
10429PyUnicode_Compare(PyObject *left, PyObject *right)
10430{
10431    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10432        if (PyUnicode_READY(left) == -1 ||
10433            PyUnicode_READY(right) == -1)
10434            return -1;
10435        return unicode_compare(left, right);
10436    }
10437    PyErr_Format(PyExc_TypeError,
10438                 "Can't compare %.100s and %.100s",
10439                 left->ob_type->tp_name,
10440                 right->ob_type->tp_name);
10441    return -1;
10442}
10443
10444int
10445PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10446{
10447    Py_ssize_t i;
10448    int kind;
10449    void *data;
10450    Py_UCS4 chr;
10451
10452    assert(_PyUnicode_CHECK(uni));
10453    if (PyUnicode_READY(uni) == -1)
10454        return -1;
10455    kind = PyUnicode_KIND(uni);
10456    data = PyUnicode_DATA(uni);
10457    /* Compare Unicode string and source character set string */
10458    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10459        if (chr != str[i])
10460            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10461    /* This check keeps Python strings that end in '\0' from comparing equal
10462     to C strings identical up to that point. */
10463    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10464        return 1; /* uni is longer */
10465    if (str[i])
10466        return -1; /* str is longer */
10467    return 0;
10468}
10469
10470
10471#define TEST_COND(cond)                         \
10472    ((cond) ? Py_True : Py_False)
10473
10474PyObject *
10475PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10476{
10477    int result;
10478
10479    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10480        PyObject *v;
10481        if (PyUnicode_READY(left) == -1 ||
10482            PyUnicode_READY(right) == -1)
10483            return NULL;
10484        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10485            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
10486            if (op == Py_EQ) {
10487                Py_INCREF(Py_False);
10488                return Py_False;
10489            }
10490            if (op == Py_NE) {
10491                Py_INCREF(Py_True);
10492                return Py_True;
10493            }
10494        }
10495        if (left == right)
10496            result = 0;
10497        else
10498            result = unicode_compare(left, right);
10499
10500        /* Convert the return value to a Boolean */
10501        switch (op) {
10502        case Py_EQ:
10503            v = TEST_COND(result == 0);
10504            break;
10505        case Py_NE:
10506            v = TEST_COND(result != 0);
10507            break;
10508        case Py_LE:
10509            v = TEST_COND(result <= 0);
10510            break;
10511        case Py_GE:
10512            v = TEST_COND(result >= 0);
10513            break;
10514        case Py_LT:
10515            v = TEST_COND(result == -1);
10516            break;
10517        case Py_GT:
10518            v = TEST_COND(result == 1);
10519            break;
10520        default:
10521            PyErr_BadArgument();
10522            return NULL;
10523        }
10524        Py_INCREF(v);
10525        return v;
10526    }
10527
10528    Py_RETURN_NOTIMPLEMENTED;
10529}
10530
10531int
10532PyUnicode_Contains(PyObject *container, PyObject *element)
10533{
10534    PyObject *str, *sub;
10535    int kind1, kind2, kind;
10536    void *buf1, *buf2;
10537    Py_ssize_t len1, len2;
10538    int result;
10539
10540    /* Coerce the two arguments */
10541    sub = PyUnicode_FromObject(element);
10542    if (!sub) {
10543        PyErr_Format(PyExc_TypeError,
10544                     "'in <string>' requires string as left operand, not %s",
10545                     element->ob_type->tp_name);
10546        return -1;
10547    }
10548    if (PyUnicode_READY(sub) == -1)
10549        return -1;
10550
10551    str = PyUnicode_FromObject(container);
10552    if (!str || PyUnicode_READY(str) == -1) {
10553        Py_DECREF(sub);
10554        return -1;
10555    }
10556
10557    kind1 = PyUnicode_KIND(str);
10558    kind2 = PyUnicode_KIND(sub);
10559    kind = kind1 > kind2 ? kind1 : kind2;
10560    buf1 = PyUnicode_DATA(str);
10561    buf2 = PyUnicode_DATA(sub);
10562    if (kind1 != kind)
10563        buf1 = _PyUnicode_AsKind(str, kind);
10564    if (!buf1) {
10565        Py_DECREF(sub);
10566        return -1;
10567    }
10568    if (kind2 != kind)
10569        buf2 = _PyUnicode_AsKind(sub, kind);
10570    if (!buf2) {
10571        Py_DECREF(sub);
10572        if (kind1 != kind) PyMem_Free(buf1);
10573        return -1;
10574    }
10575    len1 = PyUnicode_GET_LENGTH(str);
10576    len2 = PyUnicode_GET_LENGTH(sub);
10577
10578    switch(kind) {
10579    case PyUnicode_1BYTE_KIND:
10580        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10581        break;
10582    case PyUnicode_2BYTE_KIND:
10583        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10584        break;
10585    case PyUnicode_4BYTE_KIND:
10586        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10587        break;
10588    default:
10589        result = -1;
10590        assert(0);
10591    }
10592
10593    Py_DECREF(str);
10594    Py_DECREF(sub);
10595
10596    if (kind1 != kind)
10597        PyMem_Free(buf1);
10598    if (kind2 != kind)
10599        PyMem_Free(buf2);
10600
10601    return result;
10602}
10603
10604/* Concat to string or Unicode object giving a new Unicode object. */
10605
10606PyObject *
10607PyUnicode_Concat(PyObject *left, PyObject *right)
10608{
10609    PyObject *u = NULL, *v = NULL, *w;
10610    Py_UCS4 maxchar, maxchar2;
10611
10612    /* Coerce the two arguments */
10613    u = PyUnicode_FromObject(left);
10614    if (u == NULL)
10615        goto onError;
10616    v = PyUnicode_FromObject(right);
10617    if (v == NULL)
10618        goto onError;
10619
10620    /* Shortcuts */
10621    if (v == unicode_empty) {
10622        Py_DECREF(v);
10623        return u;
10624    }
10625    if (u == unicode_empty) {
10626        Py_DECREF(u);
10627        return v;
10628    }
10629
10630    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10631    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10632    maxchar = Py_MAX(maxchar, maxchar2);
10633
10634    /* Concat the two Unicode strings */
10635    w = PyUnicode_New(
10636        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10637        maxchar);
10638    if (w == NULL)
10639        goto onError;
10640    copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10641    copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
10642    Py_DECREF(u);
10643    Py_DECREF(v);
10644    assert(_PyUnicode_CheckConsistency(w, 1));
10645    return w;
10646
10647  onError:
10648    Py_XDECREF(u);
10649    Py_XDECREF(v);
10650    return NULL;
10651}
10652
10653static void
10654unicode_append_inplace(PyObject **p_left, PyObject *right)
10655{
10656    Py_ssize_t left_len, right_len, new_len;
10657
10658    assert(PyUnicode_IS_READY(*p_left));
10659    assert(PyUnicode_IS_READY(right));
10660
10661    left_len = PyUnicode_GET_LENGTH(*p_left);
10662    right_len = PyUnicode_GET_LENGTH(right);
10663    if (left_len > PY_SSIZE_T_MAX - right_len) {
10664        PyErr_SetString(PyExc_OverflowError,
10665                        "strings are too large to concat");
10666        goto error;
10667    }
10668    new_len = left_len + right_len;
10669
10670    /* Now we own the last reference to 'left', so we can resize it
10671     * in-place.
10672     */
10673    if (unicode_resize(p_left, new_len) != 0) {
10674        /* XXX if _PyUnicode_Resize() fails, 'left' has been
10675         * deallocated so it cannot be put back into
10676         * 'variable'.  The MemoryError is raised when there
10677         * is no value in 'variable', which might (very
10678         * remotely) be a cause of incompatibilities.
10679         */
10680        goto error;
10681    }
10682    /* copy 'right' into the newly allocated area of 'left' */
10683    copy_characters(*p_left, left_len, right, 0, right_len);
10684    _PyUnicode_DIRTY(*p_left);
10685    return;
10686
10687error:
10688    Py_DECREF(*p_left);
10689    *p_left = NULL;
10690}
10691
10692void
10693PyUnicode_Append(PyObject **p_left, PyObject *right)
10694{
10695    PyObject *left, *res;
10696
10697    if (p_left == NULL) {
10698        if (!PyErr_Occurred())
10699            PyErr_BadInternalCall();
10700        return;
10701    }
10702    left = *p_left;
10703    if (right == NULL || !PyUnicode_Check(left)) {
10704        if (!PyErr_Occurred())
10705            PyErr_BadInternalCall();
10706        goto error;
10707    }
10708
10709    if (PyUnicode_READY(left))
10710        goto error;
10711    if (PyUnicode_READY(right))
10712        goto error;
10713
10714    if (PyUnicode_CheckExact(left) && left != unicode_empty
10715        && PyUnicode_CheckExact(right) && right != unicode_empty
10716        && unicode_resizable(left)
10717        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10718            || _PyUnicode_WSTR(left) != NULL))
10719    {
10720        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10721           to change the structure size, but characters are stored just after
10722           the structure, and so it requires to move all characters which is
10723           not so different than duplicating the string. */
10724        if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10725        {
10726            unicode_append_inplace(p_left, right);
10727            assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
10728            return;
10729        }
10730    }
10731
10732    res = PyUnicode_Concat(left, right);
10733    if (res == NULL)
10734        goto error;
10735    Py_DECREF(left);
10736    *p_left = res;
10737    return;
10738
10739error:
10740    Py_DECREF(*p_left);
10741    *p_left = NULL;
10742}
10743
10744void
10745PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10746{
10747    PyUnicode_Append(pleft, right);
10748    Py_XDECREF(right);
10749}
10750
10751PyDoc_STRVAR(count__doc__,
10752             "S.count(sub[, start[, end]]) -> int\n\
10753\n\
10754Return the number of non-overlapping occurrences of substring sub in\n\
10755string S[start:end].  Optional arguments start and end are\n\
10756interpreted as in slice notation.");
10757
10758static PyObject *
10759unicode_count(PyObject *self, PyObject *args)
10760{
10761    PyObject *substring;
10762    Py_ssize_t start = 0;
10763    Py_ssize_t end = PY_SSIZE_T_MAX;
10764    PyObject *result;
10765    int kind1, kind2, kind;
10766    void *buf1, *buf2;
10767    Py_ssize_t len1, len2, iresult;
10768
10769    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10770                                            &start, &end))
10771        return NULL;
10772
10773    kind1 = PyUnicode_KIND(self);
10774    kind2 = PyUnicode_KIND(substring);
10775    kind = kind1 > kind2 ? kind1 : kind2;
10776    buf1 = PyUnicode_DATA(self);
10777    buf2 = PyUnicode_DATA(substring);
10778    if (kind1 != kind)
10779        buf1 = _PyUnicode_AsKind(self, kind);
10780    if (!buf1) {
10781        Py_DECREF(substring);
10782        return NULL;
10783    }
10784    if (kind2 != kind)
10785        buf2 = _PyUnicode_AsKind(substring, kind);
10786    if (!buf2) {
10787        Py_DECREF(substring);
10788        if (kind1 != kind) PyMem_Free(buf1);
10789        return NULL;
10790    }
10791    len1 = PyUnicode_GET_LENGTH(self);
10792    len2 = PyUnicode_GET_LENGTH(substring);
10793
10794    ADJUST_INDICES(start, end, len1);
10795    switch(kind) {
10796    case PyUnicode_1BYTE_KIND:
10797        iresult = ucs1lib_count(
10798            ((Py_UCS1*)buf1) + start, end - start,
10799            buf2, len2, PY_SSIZE_T_MAX
10800            );
10801        break;
10802    case PyUnicode_2BYTE_KIND:
10803        iresult = ucs2lib_count(
10804            ((Py_UCS2*)buf1) + start, end - start,
10805            buf2, len2, PY_SSIZE_T_MAX
10806            );
10807        break;
10808    case PyUnicode_4BYTE_KIND:
10809        iresult = ucs4lib_count(
10810            ((Py_UCS4*)buf1) + start, end - start,
10811            buf2, len2, PY_SSIZE_T_MAX
10812            );
10813        break;
10814    default:
10815        assert(0); iresult = 0;
10816    }
10817
10818    result = PyLong_FromSsize_t(iresult);
10819
10820    if (kind1 != kind)
10821        PyMem_Free(buf1);
10822    if (kind2 != kind)
10823        PyMem_Free(buf2);
10824
10825    Py_DECREF(substring);
10826
10827    return result;
10828}
10829
10830PyDoc_STRVAR(encode__doc__,
10831             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10832\n\
10833Encode S using the codec registered for encoding. Default encoding\n\
10834is 'utf-8'. errors may be given to set a different error\n\
10835handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10836a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10837'xmlcharrefreplace' as well as any other name registered with\n\
10838codecs.register_error that can handle UnicodeEncodeErrors.");
10839
10840static PyObject *
10841unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10842{
10843    static char *kwlist[] = {"encoding", "errors", 0};
10844    char *encoding = NULL;
10845    char *errors = NULL;
10846
10847    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10848                                     kwlist, &encoding, &errors))
10849        return NULL;
10850    return PyUnicode_AsEncodedString(self, encoding, errors);
10851}
10852
10853PyDoc_STRVAR(expandtabs__doc__,
10854             "S.expandtabs([tabsize]) -> str\n\
10855\n\
10856Return a copy of S where all tab characters are expanded using spaces.\n\
10857If tabsize is not given, a tab size of 8 characters is assumed.");
10858
10859static PyObject*
10860unicode_expandtabs(PyObject *self, PyObject *args)
10861{
10862    Py_ssize_t i, j, line_pos, src_len, incr;
10863    Py_UCS4 ch;
10864    PyObject *u;
10865    void *src_data, *dest_data;
10866    int tabsize = 8;
10867    int kind;
10868    int found;
10869
10870    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10871        return NULL;
10872
10873    if (PyUnicode_READY(self) == -1)
10874        return NULL;
10875
10876    /* First pass: determine size of output string */
10877    src_len = PyUnicode_GET_LENGTH(self);
10878    i = j = line_pos = 0;
10879    kind = PyUnicode_KIND(self);
10880    src_data = PyUnicode_DATA(self);
10881    found = 0;
10882    for (; i < src_len; i++) {
10883        ch = PyUnicode_READ(kind, src_data, i);
10884        if (ch == '\t') {
10885            found = 1;
10886            if (tabsize > 0) {
10887                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10888                if (j > PY_SSIZE_T_MAX - incr)
10889                    goto overflow;
10890                line_pos += incr;
10891                j += incr;
10892            }
10893        }
10894        else {
10895            if (j > PY_SSIZE_T_MAX - 1)
10896                goto overflow;
10897            line_pos++;
10898            j++;
10899            if (ch == '\n' || ch == '\r')
10900                line_pos = 0;
10901        }
10902    }
10903    if (!found && PyUnicode_CheckExact(self)) {
10904        Py_INCREF(self);
10905        return self;
10906    }
10907
10908    /* Second pass: create output string and fill it */
10909    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10910    if (!u)
10911        return NULL;
10912    dest_data = PyUnicode_DATA(u);
10913
10914    i = j = line_pos = 0;
10915
10916    for (; i < src_len; i++) {
10917        ch = PyUnicode_READ(kind, src_data, i);
10918        if (ch == '\t') {
10919            if (tabsize > 0) {
10920                incr = tabsize - (line_pos % tabsize);
10921                line_pos += incr;
10922                while (incr--) {
10923                    PyUnicode_WRITE(kind, dest_data, j, ' ');
10924                    j++;
10925                }
10926            }
10927        }
10928        else {
10929            line_pos++;
10930            PyUnicode_WRITE(kind, dest_data, j, ch);
10931            j++;
10932            if (ch == '\n' || ch == '\r')
10933                line_pos = 0;
10934        }
10935    }
10936    assert (j == PyUnicode_GET_LENGTH(u));
10937    return unicode_result(u);
10938
10939  overflow:
10940    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10941    return NULL;
10942}
10943
10944PyDoc_STRVAR(find__doc__,
10945             "S.find(sub[, start[, end]]) -> int\n\
10946\n\
10947Return the lowest index in S where substring sub is found,\n\
10948such that sub is contained within S[start:end].  Optional\n\
10949arguments start and end are interpreted as in slice notation.\n\
10950\n\
10951Return -1 on failure.");
10952
10953static PyObject *
10954unicode_find(PyObject *self, PyObject *args)
10955{
10956    PyObject *substring;
10957    Py_ssize_t start;
10958    Py_ssize_t end;
10959    Py_ssize_t result;
10960
10961    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10962                                            &start, &end))
10963        return NULL;
10964
10965    if (PyUnicode_READY(self) == -1)
10966        return NULL;
10967    if (PyUnicode_READY(substring) == -1)
10968        return NULL;
10969
10970    result = any_find_slice(1, self, substring, start, end);
10971
10972    Py_DECREF(substring);
10973
10974    if (result == -2)
10975        return NULL;
10976
10977    return PyLong_FromSsize_t(result);
10978}
10979
10980static PyObject *
10981unicode_getitem(PyObject *self, Py_ssize_t index)
10982{
10983    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10984    if (ch == (Py_UCS4)-1)
10985        return NULL;
10986    return PyUnicode_FromOrdinal(ch);
10987}
10988
10989/* Believe it or not, this produces the same value for ASCII strings
10990   as bytes_hash(). */
10991static Py_hash_t
10992unicode_hash(PyObject *self)
10993{
10994    Py_ssize_t len;
10995    Py_uhash_t x;
10996
10997    if (_PyUnicode_HASH(self) != -1)
10998        return _PyUnicode_HASH(self);
10999    if (PyUnicode_READY(self) == -1)
11000        return -1;
11001    len = PyUnicode_GET_LENGTH(self);
11002
11003    /* The hash function as a macro, gets expanded three times below. */
11004#define HASH(P) \
11005    x = (Py_uhash_t)*P << 7; \
11006    while (--len >= 0) \
11007        x = (1000003*x) ^ (Py_uhash_t)*P++;
11008
11009    switch (PyUnicode_KIND(self)) {
11010    case PyUnicode_1BYTE_KIND: {
11011        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11012        HASH(c);
11013        break;
11014    }
11015    case PyUnicode_2BYTE_KIND: {
11016        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11017        HASH(s);
11018        break;
11019    }
11020    default: {
11021        Py_UCS4 *l;
11022        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11023               "Impossible switch case in unicode_hash");
11024        l = PyUnicode_4BYTE_DATA(self);
11025        HASH(l);
11026        break;
11027    }
11028    }
11029    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11030
11031    if (x == -1)
11032        x = -2;
11033    _PyUnicode_HASH(self) = x;
11034    return x;
11035}
11036#undef HASH
11037
11038PyDoc_STRVAR(index__doc__,
11039             "S.index(sub[, start[, end]]) -> int\n\
11040\n\
11041Like S.find() but raise ValueError when the substring is not found.");
11042
11043static PyObject *
11044unicode_index(PyObject *self, PyObject *args)
11045{
11046    Py_ssize_t result;
11047    PyObject *substring;
11048    Py_ssize_t start;
11049    Py_ssize_t end;
11050
11051    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11052                                            &start, &end))
11053        return NULL;
11054
11055    if (PyUnicode_READY(self) == -1)
11056        return NULL;
11057    if (PyUnicode_READY(substring) == -1)
11058        return NULL;
11059
11060    result = any_find_slice(1, self, substring, start, end);
11061
11062    Py_DECREF(substring);
11063
11064    if (result == -2)
11065        return NULL;
11066
11067    if (result < 0) {
11068        PyErr_SetString(PyExc_ValueError, "substring not found");
11069        return NULL;
11070    }
11071
11072    return PyLong_FromSsize_t(result);
11073}
11074
11075PyDoc_STRVAR(islower__doc__,
11076             "S.islower() -> bool\n\
11077\n\
11078Return True if all cased characters in S are lowercase and there is\n\
11079at least one cased character in S, False otherwise.");
11080
11081static PyObject*
11082unicode_islower(PyObject *self)
11083{
11084    Py_ssize_t i, length;
11085    int kind;
11086    void *data;
11087    int cased;
11088
11089    if (PyUnicode_READY(self) == -1)
11090        return NULL;
11091    length = PyUnicode_GET_LENGTH(self);
11092    kind = PyUnicode_KIND(self);
11093    data = PyUnicode_DATA(self);
11094
11095    /* Shortcut for single character strings */
11096    if (length == 1)
11097        return PyBool_FromLong(
11098            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11099
11100    /* Special case for empty strings */
11101    if (length == 0)
11102        return PyBool_FromLong(0);
11103
11104    cased = 0;
11105    for (i = 0; i < length; i++) {
11106        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11107
11108        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11109            return PyBool_FromLong(0);
11110        else if (!cased && Py_UNICODE_ISLOWER(ch))
11111            cased = 1;
11112    }
11113    return PyBool_FromLong(cased);
11114}
11115
11116PyDoc_STRVAR(isupper__doc__,
11117             "S.isupper() -> bool\n\
11118\n\
11119Return True if all cased characters in S are uppercase and there is\n\
11120at least one cased character in S, False otherwise.");
11121
11122static PyObject*
11123unicode_isupper(PyObject *self)
11124{
11125    Py_ssize_t i, length;
11126    int kind;
11127    void *data;
11128    int cased;
11129
11130    if (PyUnicode_READY(self) == -1)
11131        return NULL;
11132    length = PyUnicode_GET_LENGTH(self);
11133    kind = PyUnicode_KIND(self);
11134    data = PyUnicode_DATA(self);
11135
11136    /* Shortcut for single character strings */
11137    if (length == 1)
11138        return PyBool_FromLong(
11139            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11140
11141    /* Special case for empty strings */
11142    if (length == 0)
11143        return PyBool_FromLong(0);
11144
11145    cased = 0;
11146    for (i = 0; i < length; i++) {
11147        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11148
11149        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11150            return PyBool_FromLong(0);
11151        else if (!cased && Py_UNICODE_ISUPPER(ch))
11152            cased = 1;
11153    }
11154    return PyBool_FromLong(cased);
11155}
11156
11157PyDoc_STRVAR(istitle__doc__,
11158             "S.istitle() -> bool\n\
11159\n\
11160Return True if S is a titlecased string and there is at least one\n\
11161character in S, i.e. upper- and titlecase characters may only\n\
11162follow uncased characters and lowercase characters only cased ones.\n\
11163Return False otherwise.");
11164
11165static PyObject*
11166unicode_istitle(PyObject *self)
11167{
11168    Py_ssize_t i, length;
11169    int kind;
11170    void *data;
11171    int cased, previous_is_cased;
11172
11173    if (PyUnicode_READY(self) == -1)
11174        return NULL;
11175    length = PyUnicode_GET_LENGTH(self);
11176    kind = PyUnicode_KIND(self);
11177    data = PyUnicode_DATA(self);
11178
11179    /* Shortcut for single character strings */
11180    if (length == 1) {
11181        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11182        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11183                               (Py_UNICODE_ISUPPER(ch) != 0));
11184    }
11185
11186    /* Special case for empty strings */
11187    if (length == 0)
11188        return PyBool_FromLong(0);
11189
11190    cased = 0;
11191    previous_is_cased = 0;
11192    for (i = 0; i < length; i++) {
11193        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11194
11195        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11196            if (previous_is_cased)
11197                return PyBool_FromLong(0);
11198            previous_is_cased = 1;
11199            cased = 1;
11200        }
11201        else if (Py_UNICODE_ISLOWER(ch)) {
11202            if (!previous_is_cased)
11203                return PyBool_FromLong(0);
11204            previous_is_cased = 1;
11205            cased = 1;
11206        }
11207        else
11208            previous_is_cased = 0;
11209    }
11210    return PyBool_FromLong(cased);
11211}
11212
11213PyDoc_STRVAR(isspace__doc__,
11214             "S.isspace() -> bool\n\
11215\n\
11216Return True if all characters in S are whitespace\n\
11217and there is at least one character in S, False otherwise.");
11218
11219static PyObject*
11220unicode_isspace(PyObject *self)
11221{
11222    Py_ssize_t i, length;
11223    int kind;
11224    void *data;
11225
11226    if (PyUnicode_READY(self) == -1)
11227        return NULL;
11228    length = PyUnicode_GET_LENGTH(self);
11229    kind = PyUnicode_KIND(self);
11230    data = PyUnicode_DATA(self);
11231
11232    /* Shortcut for single character strings */
11233    if (length == 1)
11234        return PyBool_FromLong(
11235            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11236
11237    /* Special case for empty strings */
11238    if (length == 0)
11239        return PyBool_FromLong(0);
11240
11241    for (i = 0; i < length; i++) {
11242        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11243        if (!Py_UNICODE_ISSPACE(ch))
11244            return PyBool_FromLong(0);
11245    }
11246    return PyBool_FromLong(1);
11247}
11248
11249PyDoc_STRVAR(isalpha__doc__,
11250             "S.isalpha() -> bool\n\
11251\n\
11252Return True if all characters in S are alphabetic\n\
11253and there is at least one character in S, False otherwise.");
11254
11255static PyObject*
11256unicode_isalpha(PyObject *self)
11257{
11258    Py_ssize_t i, length;
11259    int kind;
11260    void *data;
11261
11262    if (PyUnicode_READY(self) == -1)
11263        return NULL;
11264    length = PyUnicode_GET_LENGTH(self);
11265    kind = PyUnicode_KIND(self);
11266    data = PyUnicode_DATA(self);
11267
11268    /* Shortcut for single character strings */
11269    if (length == 1)
11270        return PyBool_FromLong(
11271            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11272
11273    /* Special case for empty strings */
11274    if (length == 0)
11275        return PyBool_FromLong(0);
11276
11277    for (i = 0; i < length; i++) {
11278        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11279            return PyBool_FromLong(0);
11280    }
11281    return PyBool_FromLong(1);
11282}
11283
11284PyDoc_STRVAR(isalnum__doc__,
11285             "S.isalnum() -> bool\n\
11286\n\
11287Return True if all characters in S are alphanumeric\n\
11288and there is at least one character in S, False otherwise.");
11289
11290static PyObject*
11291unicode_isalnum(PyObject *self)
11292{
11293    int kind;
11294    void *data;
11295    Py_ssize_t len, i;
11296
11297    if (PyUnicode_READY(self) == -1)
11298        return NULL;
11299
11300    kind = PyUnicode_KIND(self);
11301    data = PyUnicode_DATA(self);
11302    len = PyUnicode_GET_LENGTH(self);
11303
11304    /* Shortcut for single character strings */
11305    if (len == 1) {
11306        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11307        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11308    }
11309
11310    /* Special case for empty strings */
11311    if (len == 0)
11312        return PyBool_FromLong(0);
11313
11314    for (i = 0; i < len; i++) {
11315        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11316        if (!Py_UNICODE_ISALNUM(ch))
11317            return PyBool_FromLong(0);
11318    }
11319    return PyBool_FromLong(1);
11320}
11321
11322PyDoc_STRVAR(isdecimal__doc__,
11323             "S.isdecimal() -> bool\n\
11324\n\
11325Return True if there are only decimal characters in S,\n\
11326False otherwise.");
11327
11328static PyObject*
11329unicode_isdecimal(PyObject *self)
11330{
11331    Py_ssize_t i, length;
11332    int kind;
11333    void *data;
11334
11335    if (PyUnicode_READY(self) == -1)
11336        return NULL;
11337    length = PyUnicode_GET_LENGTH(self);
11338    kind = PyUnicode_KIND(self);
11339    data = PyUnicode_DATA(self);
11340
11341    /* Shortcut for single character strings */
11342    if (length == 1)
11343        return PyBool_FromLong(
11344            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11345
11346    /* Special case for empty strings */
11347    if (length == 0)
11348        return PyBool_FromLong(0);
11349
11350    for (i = 0; i < length; i++) {
11351        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11352            return PyBool_FromLong(0);
11353    }
11354    return PyBool_FromLong(1);
11355}
11356
11357PyDoc_STRVAR(isdigit__doc__,
11358             "S.isdigit() -> bool\n\
11359\n\
11360Return True if all characters in S are digits\n\
11361and there is at least one character in S, False otherwise.");
11362
11363static PyObject*
11364unicode_isdigit(PyObject *self)
11365{
11366    Py_ssize_t i, length;
11367    int kind;
11368    void *data;
11369
11370    if (PyUnicode_READY(self) == -1)
11371        return NULL;
11372    length = PyUnicode_GET_LENGTH(self);
11373    kind = PyUnicode_KIND(self);
11374    data = PyUnicode_DATA(self);
11375
11376    /* Shortcut for single character strings */
11377    if (length == 1) {
11378        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11379        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11380    }
11381
11382    /* Special case for empty strings */
11383    if (length == 0)
11384        return PyBool_FromLong(0);
11385
11386    for (i = 0; i < length; i++) {
11387        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11388            return PyBool_FromLong(0);
11389    }
11390    return PyBool_FromLong(1);
11391}
11392
11393PyDoc_STRVAR(isnumeric__doc__,
11394             "S.isnumeric() -> bool\n\
11395\n\
11396Return True if there are only numeric characters in S,\n\
11397False otherwise.");
11398
11399static PyObject*
11400unicode_isnumeric(PyObject *self)
11401{
11402    Py_ssize_t i, length;
11403    int kind;
11404    void *data;
11405
11406    if (PyUnicode_READY(self) == -1)
11407        return NULL;
11408    length = PyUnicode_GET_LENGTH(self);
11409    kind = PyUnicode_KIND(self);
11410    data = PyUnicode_DATA(self);
11411
11412    /* Shortcut for single character strings */
11413    if (length == 1)
11414        return PyBool_FromLong(
11415            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11416
11417    /* Special case for empty strings */
11418    if (length == 0)
11419        return PyBool_FromLong(0);
11420
11421    for (i = 0; i < length; i++) {
11422        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11423            return PyBool_FromLong(0);
11424    }
11425    return PyBool_FromLong(1);
11426}
11427
11428int
11429PyUnicode_IsIdentifier(PyObject *self)
11430{
11431    int kind;
11432    void *data;
11433    Py_ssize_t i;
11434    Py_UCS4 first;
11435
11436    if (PyUnicode_READY(self) == -1) {
11437        Py_FatalError("identifier not ready");
11438        return 0;
11439    }
11440
11441    /* Special case for empty strings */
11442    if (PyUnicode_GET_LENGTH(self) == 0)
11443        return 0;
11444    kind = PyUnicode_KIND(self);
11445    data = PyUnicode_DATA(self);
11446
11447    /* PEP 3131 says that the first character must be in
11448       XID_Start and subsequent characters in XID_Continue,
11449       and for the ASCII range, the 2.x rules apply (i.e
11450       start with letters and underscore, continue with
11451       letters, digits, underscore). However, given the current
11452       definition of XID_Start and XID_Continue, it is sufficient
11453       to check just for these, except that _ must be allowed
11454       as starting an identifier.  */
11455    first = PyUnicode_READ(kind, data, 0);
11456    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11457        return 0;
11458
11459    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11460        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11461            return 0;
11462    return 1;
11463}
11464
11465PyDoc_STRVAR(isidentifier__doc__,
11466             "S.isidentifier() -> bool\n\
11467\n\
11468Return True if S is a valid identifier according\n\
11469to the language definition.");
11470
11471static PyObject*
11472unicode_isidentifier(PyObject *self)
11473{
11474    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11475}
11476
11477PyDoc_STRVAR(isprintable__doc__,
11478             "S.isprintable() -> bool\n\
11479\n\
11480Return True if all characters in S are considered\n\
11481printable in repr() or S is empty, False otherwise.");
11482
11483static PyObject*
11484unicode_isprintable(PyObject *self)
11485{
11486    Py_ssize_t i, length;
11487    int kind;
11488    void *data;
11489
11490    if (PyUnicode_READY(self) == -1)
11491        return NULL;
11492    length = PyUnicode_GET_LENGTH(self);
11493    kind = PyUnicode_KIND(self);
11494    data = PyUnicode_DATA(self);
11495
11496    /* Shortcut for single character strings */
11497    if (length == 1)
11498        return PyBool_FromLong(
11499            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11500
11501    for (i = 0; i < length; i++) {
11502        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11503            Py_RETURN_FALSE;
11504        }
11505    }
11506    Py_RETURN_TRUE;
11507}
11508
11509PyDoc_STRVAR(join__doc__,
11510             "S.join(iterable) -> str\n\
11511\n\
11512Return a string which is the concatenation of the strings in the\n\
11513iterable.  The separator between elements is S.");
11514
11515static PyObject*
11516unicode_join(PyObject *self, PyObject *data)
11517{
11518    return PyUnicode_Join(self, data);
11519}
11520
11521static Py_ssize_t
11522unicode_length(PyObject *self)
11523{
11524    if (PyUnicode_READY(self) == -1)
11525        return -1;
11526    return PyUnicode_GET_LENGTH(self);
11527}
11528
11529PyDoc_STRVAR(ljust__doc__,
11530             "S.ljust(width[, fillchar]) -> str\n\
11531\n\
11532Return S left-justified in a Unicode string of length width. Padding is\n\
11533done using the specified fill character (default is a space).");
11534
11535static PyObject *
11536unicode_ljust(PyObject *self, PyObject *args)
11537{
11538    Py_ssize_t width;
11539    Py_UCS4 fillchar = ' ';
11540
11541    if (PyUnicode_READY(self) == -1)
11542        return NULL;
11543
11544    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11545        return NULL;
11546
11547    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11548        Py_INCREF(self);
11549        return self;
11550    }
11551
11552    return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
11553}
11554
11555PyDoc_STRVAR(lower__doc__,
11556             "S.lower() -> str\n\
11557\n\
11558Return a copy of the string S converted to lowercase.");
11559
11560static PyObject*
11561unicode_lower(PyObject *self)
11562{
11563    return fixup(self, fixlower);
11564}
11565
/* Strip selectors.  They double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip selector */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
11574
11575/* externally visible for str.strip(unicode) */
11576PyObject *
11577_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11578{
11579    void *data;
11580    int kind;
11581    Py_ssize_t i, j, len;
11582    BLOOM_MASK sepmask;
11583
11584    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11585        return NULL;
11586
11587    kind = PyUnicode_KIND(self);
11588    data = PyUnicode_DATA(self);
11589    len = PyUnicode_GET_LENGTH(self);
11590    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11591                              PyUnicode_DATA(sepobj),
11592                              PyUnicode_GET_LENGTH(sepobj));
11593
11594    i = 0;
11595    if (striptype != RIGHTSTRIP) {
11596        while (i < len &&
11597               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11598            i++;
11599        }
11600    }
11601
11602    j = len;
11603    if (striptype != LEFTSTRIP) {
11604        do {
11605            j--;
11606        } while (j >= i &&
11607                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11608        j++;
11609    }
11610
11611    return PyUnicode_Substring(self, i, j);
11612}
11613
11614PyObject*
11615PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11616{
11617    unsigned char *data;
11618    int kind;
11619    Py_ssize_t length;
11620
11621    if (PyUnicode_READY(self) == -1)
11622        return NULL;
11623
11624    end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11625
11626    if (start == 0 && end == PyUnicode_GET_LENGTH(self))
11627    {
11628        if (PyUnicode_CheckExact(self)) {
11629            Py_INCREF(self);
11630            return self;
11631        }
11632        else
11633            return PyUnicode_Copy(self);
11634    }
11635
11636    length = end - start;
11637    if (length == 1)
11638        return unicode_getitem(self, start);
11639
11640    if (start < 0 || end < 0) {
11641        PyErr_SetString(PyExc_IndexError, "string index out of range");
11642        return NULL;
11643    }
11644
11645    if (PyUnicode_IS_ASCII(self)) {
11646        kind = PyUnicode_KIND(self);
11647        data = PyUnicode_1BYTE_DATA(self);
11648        return unicode_fromascii(data + start, length);
11649    }
11650    else {
11651        kind = PyUnicode_KIND(self);
11652        data = PyUnicode_1BYTE_DATA(self);
11653        return PyUnicode_FromKindAndData(kind,
11654                                         data + kind * start,
11655                                         length);
11656    }
11657}
11658
11659static PyObject *
11660do_strip(PyObject *self, int striptype)
11661{
11662    int kind;
11663    void *data;
11664    Py_ssize_t len, i, j;
11665
11666    if (PyUnicode_READY(self) == -1)
11667        return NULL;
11668
11669    kind = PyUnicode_KIND(self);
11670    data = PyUnicode_DATA(self);
11671    len = PyUnicode_GET_LENGTH(self);
11672
11673    i = 0;
11674    if (striptype != RIGHTSTRIP) {
11675        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11676            i++;
11677        }
11678    }
11679
11680    j = len;
11681    if (striptype != LEFTSTRIP) {
11682        do {
11683            j--;
11684        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11685        j++;
11686    }
11687
11688    return PyUnicode_Substring(self, i, j);
11689}
11690
11691
11692static PyObject *
11693do_argstrip(PyObject *self, int striptype, PyObject *args)
11694{
11695    PyObject *sep = NULL;
11696
11697    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11698        return NULL;
11699
11700    if (sep != NULL && sep != Py_None) {
11701        if (PyUnicode_Check(sep))
11702            return _PyUnicode_XStrip(self, striptype, sep);
11703        else {
11704            PyErr_Format(PyExc_TypeError,
11705                         "%s arg must be None or str",
11706                         STRIPNAME(striptype));
11707            return NULL;
11708        }
11709    }
11710
11711    return do_strip(self, striptype);
11712}
11713
11714
11715PyDoc_STRVAR(strip__doc__,
11716             "S.strip([chars]) -> str\n\
11717\n\
11718Return a copy of the string S with leading and trailing\n\
11719whitespace removed.\n\
11720If chars is given and not None, remove characters in chars instead.");
11721
11722static PyObject *
11723unicode_strip(PyObject *self, PyObject *args)
11724{
11725    if (PyTuple_GET_SIZE(args) == 0)
11726        return do_strip(self, BOTHSTRIP); /* Common case */
11727    else
11728        return do_argstrip(self, BOTHSTRIP, args);
11729}
11730
11731
11732PyDoc_STRVAR(lstrip__doc__,
11733             "S.lstrip([chars]) -> str\n\
11734\n\
11735Return a copy of the string S with leading whitespace removed.\n\
11736If chars is given and not None, remove characters in chars instead.");
11737
11738static PyObject *
11739unicode_lstrip(PyObject *self, PyObject *args)
11740{
11741    if (PyTuple_GET_SIZE(args) == 0)
11742        return do_strip(self, LEFTSTRIP); /* Common case */
11743    else
11744        return do_argstrip(self, LEFTSTRIP, args);
11745}
11746
11747
11748PyDoc_STRVAR(rstrip__doc__,
11749             "S.rstrip([chars]) -> str\n\
11750\n\
11751Return a copy of the string S with trailing whitespace removed.\n\
11752If chars is given and not None, remove characters in chars instead.");
11753
11754static PyObject *
11755unicode_rstrip(PyObject *self, PyObject *args)
11756{
11757    if (PyTuple_GET_SIZE(args) == 0)
11758        return do_strip(self, RIGHTSTRIP); /* Common case */
11759    else
11760        return do_argstrip(self, RIGHTSTRIP, args);
11761}
11762
11763
11764static PyObject*
11765unicode_repeat(PyObject *str, Py_ssize_t len)
11766{
11767    PyObject *u;
11768    Py_ssize_t nchars, n;
11769
11770    if (len < 1) {
11771        Py_INCREF(unicode_empty);
11772        return unicode_empty;
11773    }
11774
11775    if (len == 1 && PyUnicode_CheckExact(str)) {
11776        /* no repeat, return original string */
11777        Py_INCREF(str);
11778        return str;
11779    }
11780
11781    if (PyUnicode_READY(str) == -1)
11782        return NULL;
11783
11784    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11785        PyErr_SetString(PyExc_OverflowError,
11786                        "repeated string is too long");
11787        return NULL;
11788    }
11789    nchars = len * PyUnicode_GET_LENGTH(str);
11790
11791    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11792    if (!u)
11793        return NULL;
11794    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11795
11796    if (PyUnicode_GET_LENGTH(str) == 1) {
11797        const int kind = PyUnicode_KIND(str);
11798        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11799        void *to = PyUnicode_DATA(u);
11800        if (kind == PyUnicode_1BYTE_KIND)
11801            memset(to, (unsigned char)fill_char, len);
11802        else {
11803            for (n = 0; n < len; ++n)
11804                PyUnicode_WRITE(kind, to, n, fill_char);
11805        }
11806    }
11807    else {
11808        /* number of characters copied this far */
11809        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11810        const Py_ssize_t char_size = PyUnicode_KIND(str);
11811        char *to = (char *) PyUnicode_DATA(u);
11812        Py_MEMCPY(to, PyUnicode_DATA(str),
11813                  PyUnicode_GET_LENGTH(str) * char_size);
11814        while (done < nchars) {
11815            n = (done <= nchars-done) ? done : nchars-done;
11816            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11817            done += n;
11818        }
11819    }
11820
11821    assert(_PyUnicode_CheckConsistency(u, 1));
11822    return u;
11823}
11824
11825PyObject *
11826PyUnicode_Replace(PyObject *obj,
11827                  PyObject *subobj,
11828                  PyObject *replobj,
11829                  Py_ssize_t maxcount)
11830{
11831    PyObject *self;
11832    PyObject *str1;
11833    PyObject *str2;
11834    PyObject *result;
11835
11836    self = PyUnicode_FromObject(obj);
11837    if (self == NULL || PyUnicode_READY(self) == -1)
11838        return NULL;
11839    str1 = PyUnicode_FromObject(subobj);
11840    if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11841        Py_DECREF(self);
11842        return NULL;
11843    }
11844    str2 = PyUnicode_FromObject(replobj);
11845    if (str2 == NULL || PyUnicode_READY(str2)) {
11846        Py_DECREF(self);
11847        Py_DECREF(str1);
11848        return NULL;
11849    }
11850    result = replace(self, str1, str2, maxcount);
11851    Py_DECREF(self);
11852    Py_DECREF(str1);
11853    Py_DECREF(str2);
11854    return result;
11855}
11856
11857PyDoc_STRVAR(replace__doc__,
11858             "S.replace(old, new[, count]) -> str\n\
11859\n\
11860Return a copy of S with all occurrences of substring\n\
11861old replaced by new.  If the optional argument count is\n\
11862given, only the first count occurrences are replaced.");
11863
11864static PyObject*
11865unicode_replace(PyObject *self, PyObject *args)
11866{
11867    PyObject *str1;
11868    PyObject *str2;
11869    Py_ssize_t maxcount = -1;
11870    PyObject *result;
11871
11872    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11873        return NULL;
11874    if (!PyUnicode_READY(self) == -1)
11875        return NULL;
11876    str1 = PyUnicode_FromObject(str1);
11877    if (str1 == NULL || PyUnicode_READY(str1) == -1)
11878        return NULL;
11879    str2 = PyUnicode_FromObject(str2);
11880    if (str2 == NULL || PyUnicode_READY(str2) == -1) {
11881        Py_DECREF(str1);
11882        return NULL;
11883    }
11884
11885    result = replace(self, str1, str2, maxcount);
11886
11887    Py_DECREF(str1);
11888    Py_DECREF(str2);
11889    return result;
11890}
11891
11892static PyObject *
11893unicode_repr(PyObject *unicode)
11894{
11895    PyObject *repr;
11896    Py_ssize_t isize;
11897    Py_ssize_t osize, squote, dquote, i, o;
11898    Py_UCS4 max, quote;
11899    int ikind, okind;
11900    void *idata, *odata;
11901
11902    if (PyUnicode_READY(unicode) == -1)
11903        return NULL;
11904
11905    isize = PyUnicode_GET_LENGTH(unicode);
11906    idata = PyUnicode_DATA(unicode);
11907
11908    /* Compute length of output, quote characters, and
11909       maximum character */
11910    osize = 2; /* quotes */
11911    max = 127;
11912    squote = dquote = 0;
11913    ikind = PyUnicode_KIND(unicode);
11914    for (i = 0; i < isize; i++) {
11915        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11916        switch (ch) {
11917        case '\'': squote++; osize++; break;
11918        case '"':  dquote++; osize++; break;
11919        case '\\': case '\t': case '\r': case '\n':
11920            osize += 2; break;
11921        default:
11922            /* Fast-path ASCII */
11923            if (ch < ' ' || ch == 0x7f)
11924                osize += 4; /* \xHH */
11925            else if (ch < 0x7f)
11926                osize++;
11927            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11928                osize++;
11929                max = ch > max ? ch : max;
11930            }
11931            else if (ch < 0x100)
11932                osize += 4; /* \xHH */
11933            else if (ch < 0x10000)
11934                osize += 6; /* \uHHHH */
11935            else
11936                osize += 10; /* \uHHHHHHHH */
11937        }
11938    }
11939
11940    quote = '\'';
11941    if (squote) {
11942        if (dquote)
11943            /* Both squote and dquote present. Use squote,
11944               and escape them */
11945            osize += squote;
11946        else
11947            quote = '"';
11948    }
11949
11950    repr = PyUnicode_New(osize, max);
11951    if (repr == NULL)
11952        return NULL;
11953    okind = PyUnicode_KIND(repr);
11954    odata = PyUnicode_DATA(repr);
11955
11956    PyUnicode_WRITE(okind, odata, 0, quote);
11957    PyUnicode_WRITE(okind, odata, osize-1, quote);
11958
11959    for (i = 0, o = 1; i < isize; i++) {
11960        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11961
11962        /* Escape quotes and backslashes */
11963        if ((ch == quote) || (ch == '\\')) {
11964            PyUnicode_WRITE(okind, odata, o++, '\\');
11965            PyUnicode_WRITE(okind, odata, o++, ch);
11966            continue;
11967        }
11968
11969        /* Map special whitespace to '\t', \n', '\r' */
11970        if (ch == '\t') {
11971            PyUnicode_WRITE(okind, odata, o++, '\\');
11972            PyUnicode_WRITE(okind, odata, o++, 't');
11973        }
11974        else if (ch == '\n') {
11975            PyUnicode_WRITE(okind, odata, o++, '\\');
11976            PyUnicode_WRITE(okind, odata, o++, 'n');
11977        }
11978        else if (ch == '\r') {
11979            PyUnicode_WRITE(okind, odata, o++, '\\');
11980            PyUnicode_WRITE(okind, odata, o++, 'r');
11981        }
11982
11983        /* Map non-printable US ASCII to '\xhh' */
11984        else if (ch < ' ' || ch == 0x7F) {
11985            PyUnicode_WRITE(okind, odata, o++, '\\');
11986            PyUnicode_WRITE(okind, odata, o++, 'x');
11987            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11988            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
11989        }
11990
11991        /* Copy ASCII characters as-is */
11992        else if (ch < 0x7F) {
11993            PyUnicode_WRITE(okind, odata, o++, ch);
11994        }
11995
11996        /* Non-ASCII characters */
11997        else {
11998            /* Map Unicode whitespace and control characters
11999               (categories Z* and C* except ASCII space)
12000            */
12001            if (!Py_UNICODE_ISPRINTABLE(ch)) {
12002                /* Map 8-bit characters to '\xhh' */
12003                if (ch <= 0xff) {
12004                    PyUnicode_WRITE(okind, odata, o++, '\\');
12005                    PyUnicode_WRITE(okind, odata, o++, 'x');
12006                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12007                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12008                }
12009                /* Map 21-bit characters to '\U00xxxxxx' */
12010                else if (ch >= 0x10000) {
12011                    PyUnicode_WRITE(okind, odata, o++, '\\');
12012                    PyUnicode_WRITE(okind, odata, o++, 'U');
12013                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12014                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12015                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12016                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12017                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12018                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12019                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12020                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12021                }
12022                /* Map 16-bit characters to '\uxxxx' */
12023                else {
12024                    PyUnicode_WRITE(okind, odata, o++, '\\');
12025                    PyUnicode_WRITE(okind, odata, o++, 'u');
12026                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12027                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12028                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12029                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12030                }
12031            }
12032            /* Copy characters as-is */
12033            else {
12034                PyUnicode_WRITE(okind, odata, o++, ch);
12035            }
12036        }
12037    }
12038    /* Closing quote already added at the beginning */
12039    assert(_PyUnicode_CheckConsistency(repr, 1));
12040    return repr;
12041}
12042
12043PyDoc_STRVAR(rfind__doc__,
12044             "S.rfind(sub[, start[, end]]) -> int\n\
12045\n\
12046Return the highest index in S where substring sub is found,\n\
12047such that sub is contained within S[start:end].  Optional\n\
12048arguments start and end are interpreted as in slice notation.\n\
12049\n\
12050Return -1 on failure.");
12051
12052static PyObject *
12053unicode_rfind(PyObject *self, PyObject *args)
12054{
12055    PyObject *substring;
12056    Py_ssize_t start;
12057    Py_ssize_t end;
12058    Py_ssize_t result;
12059
12060    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12061                                            &start, &end))
12062        return NULL;
12063
12064    if (PyUnicode_READY(self) == -1)
12065        return NULL;
12066    if (PyUnicode_READY(substring) == -1)
12067        return NULL;
12068
12069    result = any_find_slice(-1, self, substring, start, end);
12070
12071    Py_DECREF(substring);
12072
12073    if (result == -2)
12074        return NULL;
12075
12076    return PyLong_FromSsize_t(result);
12077}
12078
12079PyDoc_STRVAR(rindex__doc__,
12080             "S.rindex(sub[, start[, end]]) -> int\n\
12081\n\
12082Like S.rfind() but raise ValueError when the substring is not found.");
12083
12084static PyObject *
12085unicode_rindex(PyObject *self, PyObject *args)
12086{
12087    PyObject *substring;
12088    Py_ssize_t start;
12089    Py_ssize_t end;
12090    Py_ssize_t result;
12091
12092    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12093                                            &start, &end))
12094        return NULL;
12095
12096    if (PyUnicode_READY(self) == -1)
12097        return NULL;
12098    if (PyUnicode_READY(substring) == -1)
12099        return NULL;
12100
12101    result = any_find_slice(-1, self, substring, start, end);
12102
12103    Py_DECREF(substring);
12104
12105    if (result == -2)
12106        return NULL;
12107
12108    if (result < 0) {
12109        PyErr_SetString(PyExc_ValueError, "substring not found");
12110        return NULL;
12111    }
12112
12113    return PyLong_FromSsize_t(result);
12114}
12115
12116PyDoc_STRVAR(rjust__doc__,
12117             "S.rjust(width[, fillchar]) -> str\n\
12118\n\
12119Return S right-justified in a string of length width. Padding is\n\
12120done using the specified fill character (default is a space).");
12121
12122static PyObject *
12123unicode_rjust(PyObject *self, PyObject *args)
12124{
12125    Py_ssize_t width;
12126    Py_UCS4 fillchar = ' ';
12127
12128    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12129        return NULL;
12130
12131    if (PyUnicode_READY(self) == -1)
12132        return NULL;
12133
12134    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
12135        Py_INCREF(self);
12136        return self;
12137    }
12138
12139    return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
12140}
12141
12142PyObject *
12143PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12144{
12145    PyObject *result;
12146
12147    s = PyUnicode_FromObject(s);
12148    if (s == NULL)
12149        return NULL;
12150    if (sep != NULL) {
12151        sep = PyUnicode_FromObject(sep);
12152        if (sep == NULL) {
12153            Py_DECREF(s);
12154            return NULL;
12155        }
12156    }
12157
12158    result = split(s, sep, maxsplit);
12159
12160    Py_DECREF(s);
12161    Py_XDECREF(sep);
12162    return result;
12163}
12164
12165PyDoc_STRVAR(split__doc__,
12166             "S.split([sep[, maxsplit]]) -> list of strings\n\
12167\n\
12168Return a list of the words in S, using sep as the\n\
12169delimiter string.  If maxsplit is given, at most maxsplit\n\
12170splits are done. If sep is not specified or is None, any\n\
12171whitespace string is a separator and empty strings are\n\
12172removed from the result.");
12173
12174static PyObject*
12175unicode_split(PyObject *self, PyObject *args)
12176{
12177    PyObject *substring = Py_None;
12178    Py_ssize_t maxcount = -1;
12179
12180    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
12181        return NULL;
12182
12183    if (substring == Py_None)
12184        return split(self, NULL, maxcount);
12185    else if (PyUnicode_Check(substring))
12186        return split(self, substring, maxcount);
12187    else
12188        return PyUnicode_Split(self, substring, maxcount);
12189}
12190
12191PyObject *
12192PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12193{
12194    PyObject* str_obj;
12195    PyObject* sep_obj;
12196    PyObject* out;
12197    int kind1, kind2, kind;
12198    void *buf1 = NULL, *buf2 = NULL;
12199    Py_ssize_t len1, len2;
12200
12201    str_obj = PyUnicode_FromObject(str_in);
12202    if (!str_obj || PyUnicode_READY(str_obj) == -1)
12203        return NULL;
12204    sep_obj = PyUnicode_FromObject(sep_in);
12205    if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
12206        Py_DECREF(str_obj);
12207        return NULL;
12208    }
12209
12210    kind1 = PyUnicode_KIND(str_obj);
12211    kind2 = PyUnicode_KIND(sep_obj);
12212    kind = Py_MAX(kind1, kind2);
12213    buf1 = PyUnicode_DATA(str_obj);
12214    if (kind1 != kind)
12215        buf1 = _PyUnicode_AsKind(str_obj, kind);
12216    if (!buf1)
12217        goto onError;
12218    buf2 = PyUnicode_DATA(sep_obj);
12219    if (kind2 != kind)
12220        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12221    if (!buf2)
12222        goto onError;
12223    len1 = PyUnicode_GET_LENGTH(str_obj);
12224    len2 = PyUnicode_GET_LENGTH(sep_obj);
12225
12226    switch(PyUnicode_KIND(str_obj)) {
12227    case PyUnicode_1BYTE_KIND:
12228        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12229            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12230        else
12231            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12232        break;
12233    case PyUnicode_2BYTE_KIND:
12234        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12235        break;
12236    case PyUnicode_4BYTE_KIND:
12237        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12238        break;
12239    default:
12240        assert(0);
12241        out = 0;
12242    }
12243
12244    Py_DECREF(sep_obj);
12245    Py_DECREF(str_obj);
12246    if (kind1 != kind)
12247        PyMem_Free(buf1);
12248    if (kind2 != kind)
12249        PyMem_Free(buf2);
12250
12251    return out;
12252  onError:
12253    Py_DECREF(sep_obj);
12254    Py_DECREF(str_obj);
12255    if (kind1 != kind && buf1)
12256        PyMem_Free(buf1);
12257    if (kind2 != kind && buf2)
12258        PyMem_Free(buf2);
12259    return NULL;
12260}
12261
12262
12263PyObject *
12264PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12265{
12266    PyObject* str_obj;
12267    PyObject* sep_obj;
12268    PyObject* out;
12269    int kind1, kind2, kind;
12270    void *buf1 = NULL, *buf2 = NULL;
12271    Py_ssize_t len1, len2;
12272
12273    str_obj = PyUnicode_FromObject(str_in);
12274    if (!str_obj)
12275        return NULL;
12276    sep_obj = PyUnicode_FromObject(sep_in);
12277    if (!sep_obj) {
12278        Py_DECREF(str_obj);
12279        return NULL;
12280    }
12281
12282    kind1 = PyUnicode_KIND(str_in);
12283    kind2 = PyUnicode_KIND(sep_obj);
12284    kind = Py_MAX(kind1, kind2);
12285    buf1 = PyUnicode_DATA(str_in);
12286    if (kind1 != kind)
12287        buf1 = _PyUnicode_AsKind(str_in, kind);
12288    if (!buf1)
12289        goto onError;
12290    buf2 = PyUnicode_DATA(sep_obj);
12291    if (kind2 != kind)
12292        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12293    if (!buf2)
12294        goto onError;
12295    len1 = PyUnicode_GET_LENGTH(str_obj);
12296    len2 = PyUnicode_GET_LENGTH(sep_obj);
12297
12298    switch(PyUnicode_KIND(str_in)) {
12299    case PyUnicode_1BYTE_KIND:
12300        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12301            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12302        else
12303            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12304        break;
12305    case PyUnicode_2BYTE_KIND:
12306        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12307        break;
12308    case PyUnicode_4BYTE_KIND:
12309        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12310        break;
12311    default:
12312        assert(0);
12313        out = 0;
12314    }
12315
12316    Py_DECREF(sep_obj);
12317    Py_DECREF(str_obj);
12318    if (kind1 != kind)
12319        PyMem_Free(buf1);
12320    if (kind2 != kind)
12321        PyMem_Free(buf2);
12322
12323    return out;
12324  onError:
12325    Py_DECREF(sep_obj);
12326    Py_DECREF(str_obj);
12327    if (kind1 != kind && buf1)
12328        PyMem_Free(buf1);
12329    if (kind2 != kind && buf2)
12330        PyMem_Free(buf2);
12331    return NULL;
12332}
12333
12334PyDoc_STRVAR(partition__doc__,
12335             "S.partition(sep) -> (head, sep, tail)\n\
12336\n\
12337Search for the separator sep in S, and return the part before it,\n\
12338the separator itself, and the part after it.  If the separator is not\n\
12339found, return S and two empty strings.");
12340
12341static PyObject*
12342unicode_partition(PyObject *self, PyObject *separator)
12343{
12344    return PyUnicode_Partition(self, separator);
12345}
12346
12347PyDoc_STRVAR(rpartition__doc__,
12348             "S.rpartition(sep) -> (head, sep, tail)\n\
12349\n\
12350Search for the separator sep in S, starting at the end of S, and return\n\
12351the part before it, the separator itself, and the part after it.  If the\n\
12352separator is not found, return two empty strings and S.");
12353
12354static PyObject*
12355unicode_rpartition(PyObject *self, PyObject *separator)
12356{
12357    return PyUnicode_RPartition(self, separator);
12358}
12359
12360PyObject *
12361PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12362{
12363    PyObject *result;
12364
12365    s = PyUnicode_FromObject(s);
12366    if (s == NULL)
12367        return NULL;
12368    if (sep != NULL) {
12369        sep = PyUnicode_FromObject(sep);
12370        if (sep == NULL) {
12371            Py_DECREF(s);
12372            return NULL;
12373        }
12374    }
12375
12376    result = rsplit(s, sep, maxsplit);
12377
12378    Py_DECREF(s);
12379    Py_XDECREF(sep);
12380    return result;
12381}
12382
12383PyDoc_STRVAR(rsplit__doc__,
12384             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
12385\n\
12386Return a list of the words in S, using sep as the\n\
12387delimiter string, starting at the end of the string and\n\
12388working to the front.  If maxsplit is given, at most maxsplit\n\
12389splits are done. If sep is not specified, any whitespace string\n\
12390is a separator.");
12391
12392static PyObject*
12393unicode_rsplit(PyObject *self, PyObject *args)
12394{
12395    PyObject *substring = Py_None;
12396    Py_ssize_t maxcount = -1;
12397
12398    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
12399        return NULL;
12400
12401    if (substring == Py_None)
12402        return rsplit(self, NULL, maxcount);
12403    else if (PyUnicode_Check(substring))
12404        return rsplit(self, substring, maxcount);
12405    else
12406        return PyUnicode_RSplit(self, substring, maxcount);
12407}
12408
12409PyDoc_STRVAR(splitlines__doc__,
12410             "S.splitlines([keepends]) -> list of strings\n\
12411\n\
12412Return a list of the lines in S, breaking at line boundaries.\n\
12413Line breaks are not included in the resulting list unless keepends\n\
12414is given and true.");
12415
12416static PyObject*
12417unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12418{
12419    static char *kwlist[] = {"keepends", 0};
12420    int keepends = 0;
12421
12422    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12423                                     kwlist, &keepends))
12424        return NULL;
12425
12426    return PyUnicode_Splitlines(self, keepends);
12427}
12428
12429static
12430PyObject *unicode_str(PyObject *self)
12431{
12432    if (PyUnicode_CheckExact(self)) {
12433        Py_INCREF(self);
12434        return self;
12435    } else
12436        /* Subtype -- return genuine unicode string with the same value. */
12437        return PyUnicode_Copy(self);
12438}
12439
12440PyDoc_STRVAR(swapcase__doc__,
12441             "S.swapcase() -> str\n\
12442\n\
12443Return a copy of S with uppercase characters converted to lowercase\n\
12444and vice versa.");
12445
12446static PyObject*
12447unicode_swapcase(PyObject *self)
12448{
12449    return fixup(self, fixswapcase);
12450}
12451
12452PyDoc_STRVAR(maketrans__doc__,
12453             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12454\n\
12455Return a translation table usable for str.translate().\n\
12456If there is only one argument, it must be a dictionary mapping Unicode\n\
12457ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12458Character keys will be then converted to ordinals.\n\
12459If there are two arguments, they must be strings of equal length, and\n\
12460in the resulting dictionary, each character in x will be mapped to the\n\
12461character at the same position in y. If there is a third argument, it\n\
12462must be a string, whose characters will be mapped to None in the result.");
12463
12464static PyObject*
12465unicode_maketrans(PyObject *null, PyObject *args)
12466{
12467    PyObject *x, *y = NULL, *z = NULL;
12468    PyObject *new = NULL, *key, *value;
12469    Py_ssize_t i = 0;
12470    int res;
12471
12472    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12473        return NULL;
12474    new = PyDict_New();
12475    if (!new)
12476        return NULL;
12477    if (y != NULL) {
12478        int x_kind, y_kind, z_kind;
12479        void *x_data, *y_data, *z_data;
12480
12481        /* x must be a string too, of equal length */
12482        if (!PyUnicode_Check(x)) {
12483            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12484                            "be a string if there is a second argument");
12485            goto err;
12486        }
12487        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12488            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12489                            "arguments must have equal length");
12490            goto err;
12491        }
12492        /* create entries for translating chars in x to those in y */
12493        x_kind = PyUnicode_KIND(x);
12494        y_kind = PyUnicode_KIND(y);
12495        x_data = PyUnicode_DATA(x);
12496        y_data = PyUnicode_DATA(y);
12497        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12498            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12499            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12500            if (!key || !value)
12501                goto err;
12502            res = PyDict_SetItem(new, key, value);
12503            Py_DECREF(key);
12504            Py_DECREF(value);
12505            if (res < 0)
12506                goto err;
12507        }
12508        /* create entries for deleting chars in z */
12509        if (z != NULL) {
12510            z_kind = PyUnicode_KIND(z);
12511            z_data = PyUnicode_DATA(z);
12512            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12513                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12514                if (!key)
12515                    goto err;
12516                res = PyDict_SetItem(new, key, Py_None);
12517                Py_DECREF(key);
12518                if (res < 0)
12519                    goto err;
12520            }
12521        }
12522    } else {
12523        int kind;
12524        void *data;
12525
12526        /* x must be a dict */
12527        if (!PyDict_CheckExact(x)) {
12528            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12529                            "to maketrans it must be a dict");
12530            goto err;
12531        }
12532        /* copy entries into the new dict, converting string keys to int keys */
12533        while (PyDict_Next(x, &i, &key, &value)) {
12534            if (PyUnicode_Check(key)) {
12535                /* convert string keys to integer keys */
12536                PyObject *newkey;
12537                if (PyUnicode_GET_LENGTH(key) != 1) {
12538                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12539                                    "table must be of length 1");
12540                    goto err;
12541                }
12542                kind = PyUnicode_KIND(key);
12543                data = PyUnicode_DATA(key);
12544                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12545                if (!newkey)
12546                    goto err;
12547                res = PyDict_SetItem(new, newkey, value);
12548                Py_DECREF(newkey);
12549                if (res < 0)
12550                    goto err;
12551            } else if (PyLong_Check(key)) {
12552                /* just keep integer keys */
12553                if (PyDict_SetItem(new, key, value) < 0)
12554                    goto err;
12555            } else {
12556                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12557                                "be strings or integers");
12558                goto err;
12559            }
12560        }
12561    }
12562    return new;
12563  err:
12564    Py_DECREF(new);
12565    return NULL;
12566}
12567
12568PyDoc_STRVAR(translate__doc__,
12569             "S.translate(table) -> str\n\
12570\n\
12571Return a copy of the string S, where all characters have been mapped\n\
12572through the given translation table, which must be a mapping of\n\
12573Unicode ordinals to Unicode ordinals, strings, or None.\n\
12574Unmapped characters are left untouched. Characters mapped to None\n\
12575are deleted.");
12576
12577static PyObject*
12578unicode_translate(PyObject *self, PyObject *table)
12579{
12580    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12581}
12582
12583PyDoc_STRVAR(upper__doc__,
12584             "S.upper() -> str\n\
12585\n\
12586Return a copy of S converted to uppercase.");
12587
12588static PyObject*
12589unicode_upper(PyObject *self)
12590{
12591    return fixup(self, fixupper);
12592}
12593
12594PyDoc_STRVAR(zfill__doc__,
12595             "S.zfill(width) -> str\n\
12596\n\
12597Pad a numeric string S with zeros on the left, to fill a field\n\
12598of the specified width. The string S is never truncated.");
12599
12600static PyObject *
12601unicode_zfill(PyObject *self, PyObject *args)
12602{
12603    Py_ssize_t fill;
12604    PyObject *u;
12605    Py_ssize_t width;
12606    int kind;
12607    void *data;
12608    Py_UCS4 chr;
12609
12610    if (PyUnicode_READY(self) == -1)
12611        return NULL;
12612
12613    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12614        return NULL;
12615
12616    if (PyUnicode_GET_LENGTH(self) >= width) {
12617        if (PyUnicode_CheckExact(self)) {
12618            Py_INCREF(self);
12619            return self;
12620        }
12621        else
12622            return PyUnicode_Copy(self);
12623    }
12624
12625    fill = width - _PyUnicode_LENGTH(self);
12626
12627    u = pad(self, fill, 0, '0');
12628
12629    if (u == NULL)
12630        return NULL;
12631
12632    kind = PyUnicode_KIND(u);
12633    data = PyUnicode_DATA(u);
12634    chr = PyUnicode_READ(kind, data, fill);
12635
12636    if (chr == '+' || chr == '-') {
12637        /* move sign to beginning of string */
12638        PyUnicode_WRITE(kind, data, 0, chr);
12639        PyUnicode_WRITE(kind, data, fill, '0');
12640    }
12641
12642    assert(_PyUnicode_CheckConsistency(u, 1));
12643    return u;
12644}
12645
/* Compiled out by the surrounding #if 0.  Thin wrapper that returns the
   result of PyUnicode_TransformDecimalAndSpaceToASCII(self); its method
   table entry in unicode_methods is likewise disabled and describes it
   as a debugging aid. */
#if 0
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12653
12654PyDoc_STRVAR(startswith__doc__,
12655             "S.startswith(prefix[, start[, end]]) -> bool\n\
12656\n\
12657Return True if S starts with the specified prefix, False otherwise.\n\
12658With optional start, test S beginning at that position.\n\
12659With optional end, stop comparing S at that position.\n\
12660prefix can also be a tuple of strings to try.");
12661
12662static PyObject *
12663unicode_startswith(PyObject *self,
12664                   PyObject *args)
12665{
12666    PyObject *subobj;
12667    PyObject *substring;
12668    Py_ssize_t start = 0;
12669    Py_ssize_t end = PY_SSIZE_T_MAX;
12670    int result;
12671
12672    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12673        return NULL;
12674    if (PyTuple_Check(subobj)) {
12675        Py_ssize_t i;
12676        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12677            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12678            if (substring == NULL)
12679                return NULL;
12680            result = tailmatch(self, substring, start, end, -1);
12681            Py_DECREF(substring);
12682            if (result) {
12683                Py_RETURN_TRUE;
12684            }
12685        }
12686        /* nothing matched */
12687        Py_RETURN_FALSE;
12688    }
12689    substring = PyUnicode_FromObject(subobj);
12690    if (substring == NULL) {
12691        if (PyErr_ExceptionMatches(PyExc_TypeError))
12692            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12693                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12694        return NULL;
12695    }
12696    result = tailmatch(self, substring, start, end, -1);
12697    Py_DECREF(substring);
12698    return PyBool_FromLong(result);
12699}
12700
12701
12702PyDoc_STRVAR(endswith__doc__,
12703             "S.endswith(suffix[, start[, end]]) -> bool\n\
12704\n\
12705Return True if S ends with the specified suffix, False otherwise.\n\
12706With optional start, test S beginning at that position.\n\
12707With optional end, stop comparing S at that position.\n\
12708suffix can also be a tuple of strings to try.");
12709
12710static PyObject *
12711unicode_endswith(PyObject *self,
12712                 PyObject *args)
12713{
12714    PyObject *subobj;
12715    PyObject *substring;
12716    Py_ssize_t start = 0;
12717    Py_ssize_t end = PY_SSIZE_T_MAX;
12718    int result;
12719
12720    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12721        return NULL;
12722    if (PyTuple_Check(subobj)) {
12723        Py_ssize_t i;
12724        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12725            substring = PyUnicode_FromObject(
12726                PyTuple_GET_ITEM(subobj, i));
12727            if (substring == NULL)
12728                return NULL;
12729            result = tailmatch(self, substring, start, end, +1);
12730            Py_DECREF(substring);
12731            if (result) {
12732                Py_RETURN_TRUE;
12733            }
12734        }
12735        Py_RETURN_FALSE;
12736    }
12737    substring = PyUnicode_FromObject(subobj);
12738    if (substring == NULL) {
12739        if (PyErr_ExceptionMatches(PyExc_TypeError))
12740            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12741                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12742        return NULL;
12743    }
12744    result = tailmatch(self, substring, start, end, +1);
12745    Py_DECREF(substring);
12746    return PyBool_FromLong(result);
12747}
12748
12749#include "stringlib/unicode_format.h"
12750
/* Docstring for str.format(); attached to do_string_format in the
   unicode_methods table below. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring for str.format_map(); attached to do_string_format_map in
   the unicode_methods table below. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
12763static PyObject *
12764unicode__format__(PyObject* self, PyObject* args)
12765{
12766    PyObject *format_spec, *out;
12767
12768    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12769        return NULL;
12770
12771    out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
12772                                     PyUnicode_GET_LENGTH(format_spec));
12773    return out;
12774}
12775
12776PyDoc_STRVAR(p_format__doc__,
12777             "S.__format__(format_spec) -> str\n\
12778\n\
12779Return a formatted version of S as described by format_spec.");
12780
12781static PyObject *
12782unicode__sizeof__(PyObject *v)
12783{
12784    Py_ssize_t size;
12785
12786    /* If it's a compact object, account for base structure +
12787       character data. */
12788    if (PyUnicode_IS_COMPACT_ASCII(v))
12789        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12790    else if (PyUnicode_IS_COMPACT(v))
12791        size = sizeof(PyCompactUnicodeObject) +
12792            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
12793    else {
12794        /* If it is a two-block object, account for base object, and
12795           for character block if present. */
12796        size = sizeof(PyUnicodeObject);
12797        if (_PyUnicode_DATA_ANY(v))
12798            size += (PyUnicode_GET_LENGTH(v) + 1) *
12799                PyUnicode_KIND(v);
12800    }
12801    /* If the wstr pointer is present, account for it unless it is shared
12802       with the data pointer. Check if the data is not shared. */
12803    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12804        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12805    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12806        size += PyUnicode_UTF8_LENGTH(v) + 1;
12807
12808    return PyLong_FromSsize_t(size);
12809}
12810
12811PyDoc_STRVAR(sizeof__doc__,
12812             "S.__sizeof__() -> size of S in memory, in bytes");
12813
12814static PyObject *
12815unicode_getnewargs(PyObject *v)
12816{
12817    PyObject *copy = PyUnicode_Copy(v);
12818    if (!copy)
12819        return NULL;
12820    return Py_BuildValue("(N)", copy);
12821}
12822
/* Method table for the string type.  Each entry is
   {Python-level name, C implementation, calling-convention flags,
   docstring}; the table ends with a {NULL, NULL} sentinel.
   Presumably referenced from the type object defined later in this
   file. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* format/format_map are implemented by functions that come from
       outside the visible part of this file (presumably the
       stringlib/unicode_format.h include above). */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is a static method: it receives no instance. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
12886
12887static PyObject *
12888unicode_mod(PyObject *v, PyObject *w)
12889{
12890    if (!PyUnicode_Check(v))
12891        Py_RETURN_NOTIMPLEMENTED;
12892    return PyUnicode_Format(v, w);
12893}
12894
/* Number protocol table.  Only nb_remainder is set, routing the `%`
   operator to unicode_mod; the slots not listed after it are
   zero-initialized by the C rules for static initializers. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
12901
/* Sequence protocol table: length, concatenation, repetition, item
   access and containment are provided; the slice and item-assignment
   slots are left empty (0). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
12912
12913static PyObject*
12914unicode_subscript(PyObject* self, PyObject* item)
12915{
12916    if (PyUnicode_READY(self) == -1)
12917        return NULL;
12918
12919    if (PyIndex_Check(item)) {
12920        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
12921        if (i == -1 && PyErr_Occurred())
12922            return NULL;
12923        if (i < 0)
12924            i += PyUnicode_GET_LENGTH(self);
12925        return unicode_getitem(self, i);
12926    } else if (PySlice_Check(item)) {
12927        Py_ssize_t start, stop, step, slicelength, cur, i;
12928        PyObject *result;
12929        void *src_data, *dest_data;
12930        int src_kind, dest_kind;
12931        Py_UCS4 ch, max_char, kind_limit;
12932
12933        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
12934                                 &start, &stop, &step, &slicelength) < 0) {
12935            return NULL;
12936        }
12937
12938        if (slicelength <= 0) {
12939            return PyUnicode_New(0, 0);
12940        } else if (start == 0 && step == 1 &&
12941                   slicelength == PyUnicode_GET_LENGTH(self) &&
12942                   PyUnicode_CheckExact(self)) {
12943            Py_INCREF(self);
12944            return self;
12945        } else if (step == 1) {
12946            return PyUnicode_Substring(self,
12947                                       start, start + slicelength);
12948        }
12949        /* General case */
12950        src_kind = PyUnicode_KIND(self);
12951        src_data = PyUnicode_DATA(self);
12952        if (!PyUnicode_IS_ASCII(self)) {
12953            kind_limit = kind_maxchar_limit(src_kind);
12954            max_char = 0;
12955            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12956                ch = PyUnicode_READ(src_kind, src_data, cur);
12957                if (ch > max_char) {
12958                    max_char = ch;
12959                    if (max_char >= kind_limit)
12960                        break;
12961                }
12962            }
12963        }
12964        else
12965            max_char = 127;
12966        result = PyUnicode_New(slicelength, max_char);
12967        if (result == NULL)
12968            return NULL;
12969        dest_kind = PyUnicode_KIND(result);
12970        dest_data = PyUnicode_DATA(result);
12971
12972        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12973            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12974            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
12975        }
12976        assert(_PyUnicode_CheckConsistency(result, 1));
12977        return result;
12978    } else {
12979        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12980        return NULL;
12981    }
12982}
12983
12984static PyMappingMethods unicode_as_mapping = {
12985    (lenfunc)unicode_length,        /* mp_length */
12986    (binaryfunc)unicode_subscript,  /* mp_subscript */
12987    (objobjargproc)0,           /* mp_ass_subscript */
12988};
12989
12990
12991/* Helpers for PyUnicode_Format() */
12992
12993static PyObject *
12994getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
12995{
12996    Py_ssize_t argidx = *p_argidx;
12997    if (argidx < arglen) {
12998        (*p_argidx)++;
12999        if (arglen < 0)
13000            return args;
13001        else
13002            return PyTuple_GetItem(args, argidx);
13003    }
13004    PyErr_SetString(PyExc_TypeError,
13005                    "not enough arguments for format string");
13006    return NULL;
13007}
13008
13009/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13010
13011static PyObject *
13012formatfloat(PyObject *v, int flags, int prec, int type)
13013{
13014    char *p;
13015    PyObject *result;
13016    double x;
13017
13018    x = PyFloat_AsDouble(v);
13019    if (x == -1.0 && PyErr_Occurred())
13020        return NULL;
13021
13022    if (prec < 0)
13023        prec = 6;
13024
13025    p = PyOS_double_to_string(x, type, prec,
13026                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
13027    if (p == NULL)
13028        return NULL;
13029    result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
13030    PyMem_Free(p);
13031    return result;
13032}
13033
13034static PyObject*
13035formatlong(PyObject *val, int flags, int prec, int type)
13036{
13037    char *buf;
13038    int len;
13039    PyObject *str; /* temporary string object. */
13040    PyObject *result;
13041
13042    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13043    if (!str)
13044        return NULL;
13045    result = PyUnicode_DecodeASCII(buf, len, NULL);
13046    Py_DECREF(str);
13047    return result;
13048}
13049
13050static Py_UCS4
13051formatchar(PyObject *v)
13052{
13053    /* presume that the buffer is at least 3 characters long */
13054    if (PyUnicode_Check(v)) {
13055        if (PyUnicode_GET_LENGTH(v) == 1) {
13056            return PyUnicode_READ_CHAR(v, 0);
13057        }
13058        goto onError;
13059    }
13060    else {
13061        /* Integer input truncated to a character */
13062        long x;
13063        x = PyLong_AsLong(v);
13064        if (x == -1 && PyErr_Occurred())
13065            goto onError;
13066
13067        if (x < 0 || x > MAX_UNICODE) {
13068            PyErr_SetString(PyExc_OverflowError,
13069                            "%c arg not in range(0x110000)");
13070            return (Py_UCS4) -1;
13071        }
13072
13073        return (Py_UCS4) x;
13074    }
13075
13076  onError:
13077    PyErr_SetString(PyExc_TypeError,
13078                    "%c requires int or char");
13079    return (Py_UCS4) -1;
13080}
13081
13082static int
13083repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13084{
13085    int r;
13086    assert(count > 0);
13087    assert(PyUnicode_Check(obj));
13088    if (count > 5) {
13089        PyObject *repeated = unicode_repeat(obj, count);
13090        if (repeated == NULL)
13091            return -1;
13092        r = _PyAccu_Accumulate(acc, repeated);
13093        Py_DECREF(repeated);
13094        return r;
13095    }
13096    else {
13097        do {
13098            if (_PyAccu_Accumulate(acc, obj))
13099                return -1;
13100        } while (--count);
13101        return 0;
13102    }
13103}
13104
13105PyObject *
13106PyUnicode_Format(PyObject *format, PyObject *args)
13107{
13108    void *fmt;
13109    int fmtkind;
13110    PyObject *result;
13111    int kind;
13112    int r;
13113    Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
13114    int args_owned = 0;
13115    PyObject *dict = NULL;
13116    PyObject *temp = NULL;
13117    PyObject *second = NULL;
13118    PyObject *uformat;
13119    _PyAccu acc;
13120    static PyObject *plus, *minus, *blank, *zero, *percent;
13121
13122    if (!plus && !(plus = get_latin1_char('+')))
13123        return NULL;
13124    if (!minus && !(minus = get_latin1_char('-')))
13125        return NULL;
13126    if (!blank && !(blank = get_latin1_char(' ')))
13127        return NULL;
13128    if (!zero && !(zero = get_latin1_char('0')))
13129        return NULL;
13130    if (!percent && !(percent = get_latin1_char('%')))
13131        return NULL;
13132
13133    if (format == NULL || args == NULL) {
13134        PyErr_BadInternalCall();
13135        return NULL;
13136    }
13137    uformat = PyUnicode_FromObject(format);
13138    if (uformat == NULL || PyUnicode_READY(uformat) == -1)
13139        return NULL;
13140    if (_PyAccu_Init(&acc))
13141        goto onError;
13142    fmt = PyUnicode_DATA(uformat);
13143    fmtkind = PyUnicode_KIND(uformat);
13144    fmtcnt = PyUnicode_GET_LENGTH(uformat);
13145    fmtpos = 0;
13146
13147    if (PyTuple_Check(args)) {
13148        arglen = PyTuple_Size(args);
13149        argidx = 0;
13150    }
13151    else {
13152        arglen = -1;
13153        argidx = -2;
13154    }
13155    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
13156        !PyUnicode_Check(args))
13157        dict = args;
13158
13159    while (--fmtcnt >= 0) {
13160        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13161            PyObject *nonfmt;
13162            Py_ssize_t nonfmtpos;
13163            nonfmtpos = fmtpos++;
13164            while (fmtcnt >= 0 &&
13165                   PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13166                fmtpos++;
13167                fmtcnt--;
13168            }
13169            nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
13170            if (nonfmt == NULL)
13171                goto onError;
13172            r = _PyAccu_Accumulate(&acc, nonfmt);
13173            Py_DECREF(nonfmt);
13174            if (r)
13175                goto onError;
13176        }
13177        else {
13178            /* Got a format specifier */
13179            int flags = 0;
13180            Py_ssize_t width = -1;
13181            int prec = -1;
13182            Py_UCS4 c = '\0';
13183            Py_UCS4 fill, sign;
13184            int isnumok;
13185            PyObject *v = NULL;
13186            void *pbuf = NULL;
13187            Py_ssize_t pindex, len;
13188            PyObject *signobj = NULL, *fillobj = NULL;
13189
13190            fmtpos++;
13191            if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13192                Py_ssize_t keystart;
13193                Py_ssize_t keylen;
13194                PyObject *key;
13195                int pcount = 1;
13196
13197                if (dict == NULL) {
13198                    PyErr_SetString(PyExc_TypeError,
13199                                    "format requires a mapping");
13200                    goto onError;
13201                }
13202                ++fmtpos;
13203                --fmtcnt;
13204                keystart = fmtpos;
13205                /* Skip over balanced parentheses */
13206                while (pcount > 0 && --fmtcnt >= 0) {
13207                    if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
13208                        --pcount;
13209                    else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
13210                        ++pcount;
13211                    fmtpos++;
13212                }
13213                keylen = fmtpos - keystart - 1;
13214                if (fmtcnt < 0 || pcount > 0) {
13215                    PyErr_SetString(PyExc_ValueError,
13216                                    "incomplete format key");
13217                    goto onError;
13218                }
13219                key = PyUnicode_Substring(uformat,
13220                                          keystart, keystart + keylen);
13221                if (key == NULL)
13222                    goto onError;
13223                if (args_owned) {
13224                    Py_DECREF(args);
13225                    args_owned = 0;
13226                }
13227                args = PyObject_GetItem(dict, key);
13228                Py_DECREF(key);
13229                if (args == NULL) {
13230                    goto onError;
13231                }
13232                args_owned = 1;
13233                arglen = -1;
13234                argidx = -2;
13235            }
13236            while (--fmtcnt >= 0) {
13237                switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
13238                case '-': flags |= F_LJUST; continue;
13239                case '+': flags |= F_SIGN; continue;
13240                case ' ': flags |= F_BLANK; continue;
13241                case '#': flags |= F_ALT; continue;
13242                case '0': flags |= F_ZERO; continue;
13243                }
13244                break;
13245            }
13246            if (c == '*') {
13247                v = getnextarg(args, arglen, &argidx);
13248                if (v == NULL)
13249                    goto onError;
13250                if (!PyLong_Check(v)) {
13251                    PyErr_SetString(PyExc_TypeError,
13252                                    "* wants int");
13253                    goto onError;
13254                }
13255                width = PyLong_AsLong(v);
13256                if (width == -1 && PyErr_Occurred())
13257                    goto onError;
13258                if (width < 0) {
13259                    flags |= F_LJUST;
13260                    width = -width;
13261                }
13262                if (--fmtcnt >= 0)
13263                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13264            }
13265            else if (c >= '0' && c <= '9') {
13266                width = c - '0';
13267                while (--fmtcnt >= 0) {
13268                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13269                    if (c < '0' || c > '9')
13270                        break;
13271                    if ((width*10) / 10 != width) {
13272                        PyErr_SetString(PyExc_ValueError,
13273                                        "width too big");
13274                        goto onError;
13275                    }
13276                    width = width*10 + (c - '0');
13277                }
13278            }
13279            if (c == '.') {
13280                prec = 0;
13281                if (--fmtcnt >= 0)
13282                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13283                if (c == '*') {
13284                    v = getnextarg(args, arglen, &argidx);
13285                    if (v == NULL)
13286                        goto onError;
13287                    if (!PyLong_Check(v)) {
13288                        PyErr_SetString(PyExc_TypeError,
13289                                        "* wants int");
13290                        goto onError;
13291                    }
13292                    prec = PyLong_AsLong(v);
13293                    if (prec == -1 && PyErr_Occurred())
13294                        goto onError;
13295                    if (prec < 0)
13296                        prec = 0;
13297                    if (--fmtcnt >= 0)
13298                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13299                }
13300                else if (c >= '0' && c <= '9') {
13301                    prec = c - '0';
13302                    while (--fmtcnt >= 0) {
13303                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13304                        if (c < '0' || c > '9')
13305                            break;
13306                        if ((prec*10) / 10 != prec) {
13307                            PyErr_SetString(PyExc_ValueError,
13308                                            "prec too big");
13309                            goto onError;
13310                        }
13311                        prec = prec*10 + (c - '0');
13312                    }
13313                }
13314            } /* prec */
13315            if (fmtcnt >= 0) {
13316                if (c == 'h' || c == 'l' || c == 'L') {
13317                    if (--fmtcnt >= 0)
13318                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13319                }
13320            }
13321            if (fmtcnt < 0) {
13322                PyErr_SetString(PyExc_ValueError,
13323                                "incomplete format");
13324                goto onError;
13325            }
13326            if (c != '%') {
13327                v = getnextarg(args, arglen, &argidx);
13328                if (v == NULL)
13329                    goto onError;
13330            }
13331            sign = 0;
13332            fill = ' ';
13333            fillobj = blank;
13334            switch (c) {
13335
13336            case '%':
13337                _PyAccu_Accumulate(&acc, percent);
13338                continue;
13339
13340            case 's':
13341            case 'r':
13342            case 'a':
13343                if (PyUnicode_CheckExact(v) && c == 's') {
13344                    temp = v;
13345                    Py_INCREF(temp);
13346                }
13347                else {
13348                    if (c == 's')
13349                        temp = PyObject_Str(v);
13350                    else if (c == 'r')
13351                        temp = PyObject_Repr(v);
13352                    else
13353                        temp = PyObject_ASCII(v);
13354                    if (temp == NULL)
13355                        goto onError;
13356                    if (PyUnicode_Check(temp))
13357                        /* nothing to do */;
13358                    else {
13359                        Py_DECREF(temp);
13360                        PyErr_SetString(PyExc_TypeError,
13361                                        "%s argument has non-string str()");
13362                        goto onError;
13363                    }
13364                }
13365                if (PyUnicode_READY(temp) == -1) {
13366                    Py_CLEAR(temp);
13367                    goto onError;
13368                }
13369                pbuf = PyUnicode_DATA(temp);
13370                kind = PyUnicode_KIND(temp);
13371                len = PyUnicode_GET_LENGTH(temp);
13372                if (prec >= 0 && len > prec)
13373                    len = prec;
13374                break;
13375
13376            case 'i':
13377            case 'd':
13378            case 'u':
13379            case 'o':
13380            case 'x':
13381            case 'X':
13382                isnumok = 0;
13383                if (PyNumber_Check(v)) {
13384                    PyObject *iobj=NULL;
13385
13386                    if (PyLong_Check(v)) {
13387                        iobj = v;
13388                        Py_INCREF(iobj);
13389                    }
13390                    else {
13391                        iobj = PyNumber_Long(v);
13392                    }
13393                    if (iobj!=NULL) {
13394                        if (PyLong_Check(iobj)) {
13395                            isnumok = 1;
13396                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
13397                            Py_DECREF(iobj);
13398                            if (!temp)
13399                                goto onError;
13400                            if (PyUnicode_READY(temp) == -1) {
13401                                Py_CLEAR(temp);
13402                                goto onError;
13403                            }
13404                            pbuf = PyUnicode_DATA(temp);
13405                            kind = PyUnicode_KIND(temp);
13406                            len = PyUnicode_GET_LENGTH(temp);
13407                            sign = 1;
13408                        }
13409                        else {
13410                            Py_DECREF(iobj);
13411                        }
13412                    }
13413                }
13414                if (!isnumok) {
13415                    PyErr_Format(PyExc_TypeError,
13416                                 "%%%c format: a number is required, "
13417                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13418                    goto onError;
13419                }
13420                if (flags & F_ZERO) {
13421                    fill = '0';
13422                    fillobj = zero;
13423                }
13424                break;
13425
13426            case 'e':
13427            case 'E':
13428            case 'f':
13429            case 'F':
13430            case 'g':
13431            case 'G':
13432                temp = formatfloat(v, flags, prec, c);
13433                if (!temp)
13434                    goto onError;
13435                if (PyUnicode_READY(temp) == -1) {
13436                    Py_CLEAR(temp);
13437                    goto onError;
13438                }
13439                pbuf = PyUnicode_DATA(temp);
13440                kind = PyUnicode_KIND(temp);
13441                len = PyUnicode_GET_LENGTH(temp);
13442                sign = 1;
13443                if (flags & F_ZERO) {
13444                    fill = '0';
13445                    fillobj = zero;
13446                }
13447                break;
13448
13449            case 'c':
13450            {
13451                Py_UCS4 ch = formatchar(v);
13452                if (ch == (Py_UCS4) -1)
13453                    goto onError;
13454                temp = _PyUnicode_FromUCS4(&ch, 1);
13455                if (temp == NULL)
13456                    goto onError;
13457                pbuf = PyUnicode_DATA(temp);
13458                kind = PyUnicode_KIND(temp);
13459                len = PyUnicode_GET_LENGTH(temp);
13460                break;
13461            }
13462
13463            default:
13464                PyErr_Format(PyExc_ValueError,
13465                             "unsupported format character '%c' (0x%x) "
13466                             "at index %zd",
13467                             (31<=c && c<=126) ? (char)c : '?',
13468                             (int)c,
13469                             fmtpos - 1);
13470                goto onError;
13471            }
13472            /* pbuf is initialized here. */
13473            pindex = 0;
13474            if (sign) {
13475                if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13476                    signobj = minus;
13477                    len--;
13478                    pindex++;
13479                }
13480                else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13481                    signobj = plus;
13482                    len--;
13483                    pindex++;
13484                }
13485                else if (flags & F_SIGN)
13486                    signobj = plus;
13487                else if (flags & F_BLANK)
13488                    signobj = blank;
13489                else
13490                    sign = 0;
13491            }
13492            if (width < len)
13493                width = len;
13494            if (sign) {
13495                if (fill != ' ') {
13496                    assert(signobj != NULL);
13497                    if (_PyAccu_Accumulate(&acc, signobj))
13498                        goto onError;
13499                }
13500                if (width > len)
13501                    width--;
13502            }
13503            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13504                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13505                assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
13506                if (fill != ' ') {
13507                    second = get_latin1_char(
13508                        PyUnicode_READ(kind, pbuf, pindex + 1));
13509                    pindex += 2;
13510                    if (second == NULL ||
13511                        _PyAccu_Accumulate(&acc, zero) ||
13512                        _PyAccu_Accumulate(&acc, second))
13513                        goto onError;
13514                    Py_CLEAR(second);
13515                }
13516                width -= 2;
13517                if (width < 0)
13518                    width = 0;
13519                len -= 2;
13520            }
13521            if (width > len && !(flags & F_LJUST)) {
13522                assert(fillobj != NULL);
13523                if (repeat_accumulate(&acc, fillobj, width - len))
13524                    goto onError;
13525                width = len;
13526            }
13527            if (fill == ' ') {
13528                if (sign) {
13529                    assert(signobj != NULL);
13530                    if (_PyAccu_Accumulate(&acc, signobj))
13531                        goto onError;
13532                }
13533                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13534                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13535                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13536                    second = get_latin1_char(
13537                        PyUnicode_READ(kind, pbuf, pindex + 1));
13538                    pindex += 2;
13539                    if (second == NULL ||
13540                        _PyAccu_Accumulate(&acc, zero) ||
13541                        _PyAccu_Accumulate(&acc, second))
13542                        goto onError;
13543                    Py_CLEAR(second);
13544                }
13545            }
13546            /* Copy all characters, preserving len */
13547            if (temp != NULL) {
13548                assert(pbuf == PyUnicode_DATA(temp));
13549                v = PyUnicode_Substring(temp, pindex, pindex + len);
13550            }
13551            else {
13552                const char *p = (const char *) pbuf;
13553                assert(pbuf != NULL);
13554                p += kind * pindex;
13555                v = PyUnicode_FromKindAndData(kind, p, len);
13556            }
13557            if (v == NULL)
13558                goto onError;
13559            r = _PyAccu_Accumulate(&acc, v);
13560            Py_DECREF(v);
13561            if (r)
13562                goto onError;
13563            if (width > len && repeat_accumulate(&acc, blank, width - len))
13564                goto onError;
13565            if (dict && (argidx < arglen) && c != '%') {
13566                PyErr_SetString(PyExc_TypeError,
13567                                "not all arguments converted during string formatting");
13568                goto onError;
13569            }
13570            Py_CLEAR(temp);
13571        } /* '%' */
13572    } /* until end */
13573    if (argidx < arglen && !dict) {
13574        PyErr_SetString(PyExc_TypeError,
13575                        "not all arguments converted during string formatting");
13576        goto onError;
13577    }
13578
13579    result = _PyAccu_Finish(&acc);
13580    if (args_owned) {
13581        Py_DECREF(args);
13582    }
13583    Py_DECREF(uformat);
13584    Py_XDECREF(temp);
13585    Py_XDECREF(second);
13586    return result;
13587
13588  onError:
13589    Py_DECREF(uformat);
13590    Py_XDECREF(temp);
13591    Py_XDECREF(second);
13592    _PyAccu_Destroy(&acc);
13593    if (args_owned) {
13594        Py_DECREF(args);
13595    }
13596    return NULL;
13597}
13598
13599static PyObject *
13600unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13601
13602static PyObject *
13603unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13604{
13605    PyObject *x = NULL;
13606    static char *kwlist[] = {"object", "encoding", "errors", 0};
13607    char *encoding = NULL;
13608    char *errors = NULL;
13609
13610    if (type != &PyUnicode_Type)
13611        return unicode_subtype_new(type, args, kwds);
13612    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
13613                                     kwlist, &x, &encoding, &errors))
13614        return NULL;
13615    if (x == NULL)
13616        return PyUnicode_New(0, 0);
13617    if (encoding == NULL && errors == NULL)
13618        return PyObject_Str(x);
13619    else
13620        return PyUnicode_FromEncodedObject(x, encoding, errors);
13621}
13622
13623static PyObject *
13624unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13625{
13626    PyObject *unicode, *self;
13627    Py_ssize_t length, char_size;
13628    int share_wstr, share_utf8;
13629    unsigned int kind;
13630    void *data;
13631
13632    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13633
13634    unicode = unicode_new(&PyUnicode_Type, args, kwds);
13635    if (unicode == NULL)
13636        return NULL;
13637    assert(_PyUnicode_CHECK(unicode));
13638    if (PyUnicode_READY(unicode))
13639        return NULL;
13640
13641    self = type->tp_alloc(type, 0);
13642    if (self == NULL) {
13643        Py_DECREF(unicode);
13644        return NULL;
13645    }
13646    kind = PyUnicode_KIND(unicode);
13647    length = PyUnicode_GET_LENGTH(unicode);
13648
13649    _PyUnicode_LENGTH(self) = length;
13650#ifdef Py_DEBUG
13651    _PyUnicode_HASH(self) = -1;
13652#else
13653    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13654#endif
13655    _PyUnicode_STATE(self).interned = 0;
13656    _PyUnicode_STATE(self).kind = kind;
13657    _PyUnicode_STATE(self).compact = 0;
13658    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13659    _PyUnicode_STATE(self).ready = 1;
13660    _PyUnicode_WSTR(self) = NULL;
13661    _PyUnicode_UTF8_LENGTH(self) = 0;
13662    _PyUnicode_UTF8(self) = NULL;
13663    _PyUnicode_WSTR_LENGTH(self) = 0;
13664    _PyUnicode_DATA_ANY(self) = NULL;
13665
13666    share_utf8 = 0;
13667    share_wstr = 0;
13668    if (kind == PyUnicode_1BYTE_KIND) {
13669        char_size = 1;
13670        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13671            share_utf8 = 1;
13672    }
13673    else if (kind == PyUnicode_2BYTE_KIND) {
13674        char_size = 2;
13675        if (sizeof(wchar_t) == 2)
13676            share_wstr = 1;
13677    }
13678    else {
13679        assert(kind == PyUnicode_4BYTE_KIND);
13680        char_size = 4;
13681        if (sizeof(wchar_t) == 4)
13682            share_wstr = 1;
13683    }
13684
13685    /* Ensure we won't overflow the length. */
13686    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13687        PyErr_NoMemory();
13688        goto onError;
13689    }
13690    data = PyObject_MALLOC((length + 1) * char_size);
13691    if (data == NULL) {
13692        PyErr_NoMemory();
13693        goto onError;
13694    }
13695
13696    _PyUnicode_DATA_ANY(self) = data;
13697    if (share_utf8) {
13698        _PyUnicode_UTF8_LENGTH(self) = length;
13699        _PyUnicode_UTF8(self) = data;
13700    }
13701    if (share_wstr) {
13702        _PyUnicode_WSTR_LENGTH(self) = length;
13703        _PyUnicode_WSTR(self) = (wchar_t *)data;
13704    }
13705
13706    Py_MEMCPY(data, PyUnicode_DATA(unicode),
13707              kind * (length + 1));
13708    assert(_PyUnicode_CheckConsistency(self, 1));
13709#ifdef Py_DEBUG
13710    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13711#endif
13712    Py_DECREF(unicode);
13713    return self;
13714
13715onError:
13716    Py_DECREF(unicode);
13717    Py_DECREF(self);
13718    return NULL;
13719}
13720
13721PyDoc_STRVAR(unicode_doc,
13722             "str(string[, encoding[, errors]]) -> str\n\
13723\n\
13724Create a new string object from the given encoded string.\n\
13725encoding defaults to the current default string encoding.\n\
13726errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
13727
13728static PyObject *unicode_iter(PyObject *seq);
13729
/* Type object for "str".  Slots are filled positionally; each entry is
   labelled with the slot it occupies.  The type is subclassable
   (Py_TPFLAGS_BASETYPE) and derives from PyBaseObject_Type. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
13773
13774/* Initialize the Unicode implementation */
13775
13776int _PyUnicode_Init(void)
13777{
13778    int i;
13779
13780    /* XXX - move this array to unicodectype.c ? */
13781    Py_UCS2 linebreak[] = {
13782        0x000A, /* LINE FEED */
13783        0x000D, /* CARRIAGE RETURN */
13784        0x001C, /* FILE SEPARATOR */
13785        0x001D, /* GROUP SEPARATOR */
13786        0x001E, /* RECORD SEPARATOR */
13787        0x0085, /* NEXT LINE */
13788        0x2028, /* LINE SEPARATOR */
13789        0x2029, /* PARAGRAPH SEPARATOR */
13790    };
13791
13792    /* Init the implementation */
13793    unicode_empty = PyUnicode_New(0, 0);
13794    if (!unicode_empty)
13795        Py_FatalError("Can't create empty string");
13796    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
13797
13798    for (i = 0; i < 256; i++)
13799        unicode_latin1[i] = NULL;
13800    if (PyType_Ready(&PyUnicode_Type) < 0)
13801        Py_FatalError("Can't initialize 'unicode'");
13802
13803    /* initialize the linebreak bloom filter */
13804    bloom_linebreak = make_bloom_mask(
13805        PyUnicode_2BYTE_KIND, linebreak,
13806        Py_ARRAY_LENGTH(linebreak));
13807
13808    PyType_Ready(&EncodingMapType);
13809
13810#ifdef HAVE_MBCS
13811    winver.dwOSVersionInfoSize = sizeof(winver);
13812    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13813        PyErr_SetFromWindowsErr(0);
13814        return -1;
13815    }
13816#endif
13817    return 0;
13818}
13819
13820/* Finalize the Unicode implementation */
13821
int
PyUnicode_ClearFreeList(void)
{
    /* Nothing is released by this implementation, so the reported
       result is always zero. */
    const int released = 0;

    return released;
}
13827
13828void
13829_PyUnicode_Fini(void)
13830{
13831    int i;
13832
13833    Py_XDECREF(unicode_empty);
13834    unicode_empty = NULL;
13835
13836    for (i = 0; i < 256; i++) {
13837        if (unicode_latin1[i]) {
13838            Py_DECREF(unicode_latin1[i]);
13839            unicode_latin1[i] = NULL;
13840        }
13841    }
13842    _PyUnicode_ClearStaticStrings();
13843    (void)PyUnicode_ClearFreeList();
13844}
13845
13846void
13847PyUnicode_InternInPlace(PyObject **p)
13848{
13849    register PyObject *s = *p;
13850    PyObject *t;
13851#ifdef Py_DEBUG
13852    assert(s != NULL);
13853    assert(_PyUnicode_CHECK(s));
13854#else
13855    if (s == NULL || !PyUnicode_Check(s))
13856        return;
13857#endif
13858    /* If it's a subclass, we don't really know what putting
13859       it in the interned dict might do. */
13860    if (!PyUnicode_CheckExact(s))
13861        return;
13862    if (PyUnicode_CHECK_INTERNED(s))
13863        return;
13864    if (interned == NULL) {
13865        interned = PyDict_New();
13866        if (interned == NULL) {
13867            PyErr_Clear(); /* Don't leave an exception */
13868            return;
13869        }
13870    }
13871    /* It might be that the GetItem call fails even
13872       though the key is present in the dictionary,
13873       namely when this happens during a stack overflow. */
13874    Py_ALLOW_RECURSION
13875    t = PyDict_GetItem(interned, s);
13876    Py_END_ALLOW_RECURSION
13877
13878        if (t) {
13879            Py_INCREF(t);
13880            Py_DECREF(*p);
13881            *p = t;
13882            return;
13883        }
13884
13885    PyThreadState_GET()->recursion_critical = 1;
13886    if (PyDict_SetItem(interned, s, s) < 0) {
13887        PyErr_Clear();
13888        PyThreadState_GET()->recursion_critical = 0;
13889        return;
13890    }
13891    PyThreadState_GET()->recursion_critical = 0;
13892    /* The two references in interned are not counted by refcnt.
13893       The deallocator will take care of this */
13894    Py_REFCNT(s) -= 2;
13895    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
13896}
13897
13898void
13899PyUnicode_InternImmortal(PyObject **p)
13900{
13901    PyUnicode_InternInPlace(p);
13902    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
13903        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
13904        Py_INCREF(*p);
13905    }
13906}
13907
13908PyObject *
13909PyUnicode_InternFromString(const char *cp)
13910{
13911    PyObject *s = PyUnicode_FromString(cp);
13912    if (s == NULL)
13913        return NULL;
13914    PyUnicode_InternInPlace(&s);
13915    return s;
13916}
13917
13918void
13919_Py_ReleaseInternedUnicodeStrings(void)
13920{
13921    PyObject *keys;
13922    PyObject *s;
13923    Py_ssize_t i, n;
13924    Py_ssize_t immortal_size = 0, mortal_size = 0;
13925
13926    if (interned == NULL || !PyDict_Check(interned))
13927        return;
13928    keys = PyDict_Keys(interned);
13929    if (keys == NULL || !PyList_Check(keys)) {
13930        PyErr_Clear();
13931        return;
13932    }
13933
13934    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13935       detector, interned unicode strings are not forcibly deallocated;
13936       rather, we give them their stolen references back, and then clear
13937       and DECREF the interned dict. */
13938
13939    n = PyList_GET_SIZE(keys);
13940    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
13941            n);
13942    for (i = 0; i < n; i++) {
13943        s = PyList_GET_ITEM(keys, i);
13944        if (PyUnicode_READY(s) == -1) {
13945            assert(0 && "could not ready string");
13946            fprintf(stderr, "could not ready string\n");
13947        }
13948        switch (PyUnicode_CHECK_INTERNED(s)) {
13949        case SSTATE_NOT_INTERNED:
13950            /* XXX Shouldn't happen */
13951            break;
13952        case SSTATE_INTERNED_IMMORTAL:
13953            Py_REFCNT(s) += 1;
13954            immortal_size += PyUnicode_GET_LENGTH(s);
13955            break;
13956        case SSTATE_INTERNED_MORTAL:
13957            Py_REFCNT(s) += 2;
13958            mortal_size += PyUnicode_GET_LENGTH(s);
13959            break;
13960        default:
13961            Py_FatalError("Inconsistent interned string state.");
13962        }
13963        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
13964    }
13965    fprintf(stderr, "total size of all interned strings: "
13966            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13967            "mortal/immortal\n", mortal_size, immortal_size);
13968    Py_DECREF(keys);
13969    PyDict_Clear(interned);
13970    Py_DECREF(interned);
13971    interned = NULL;
13972}
13973
13974
13975/********************* Unicode Iterator **************************/
13976
/* State of a str iterator (instances of PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* String being iterated (owned reference).
                            Set to NULL when iterator is exhausted */
} unicodeiterobject;
13982
13983static void
13984unicodeiter_dealloc(unicodeiterobject *it)
13985{
13986    _PyObject_GC_UNTRACK(it);
13987    Py_XDECREF(it->it_seq);
13988    PyObject_GC_Del(it);
13989}
13990
/* GC traversal for str iterators: the only object reference held is
   it_seq.  Py_VISIT relies on the parameters being named `visit` and
   `arg`. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
13997
13998static PyObject *
13999unicodeiter_next(unicodeiterobject *it)
14000{
14001    PyObject *seq, *item;
14002
14003    assert(it != NULL);
14004    seq = it->it_seq;
14005    if (seq == NULL)
14006        return NULL;
14007    assert(_PyUnicode_CHECK(seq));
14008
14009    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14010        int kind = PyUnicode_KIND(seq);
14011        void *data = PyUnicode_DATA(seq);
14012        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14013        item = PyUnicode_FromOrdinal(chr);
14014        if (item != NULL)
14015            ++it->it_index;
14016        return item;
14017    }
14018
14019    Py_DECREF(seq);
14020    it->it_seq = NULL;
14021    return NULL;
14022}
14023
14024static PyObject *
14025unicodeiter_len(unicodeiterobject *it)
14026{
14027    Py_ssize_t len = 0;
14028    if (it->it_seq)
14029        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14030    return PyLong_FromSsize_t(len);
14031}
14032
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type (tp_methods of PyUnicodeIter_Type). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
14040
/* Type object for "str_iterator", the iterator created by unicode_iter().
   Instances are GC-tracked (Py_TPFLAGS_HAVE_GC) and are their own
   iterator (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
14073
14074static PyObject *
14075unicode_iter(PyObject *seq)
14076{
14077    unicodeiterobject *it;
14078
14079    if (!PyUnicode_Check(seq)) {
14080        PyErr_BadInternalCall();
14081        return NULL;
14082    }
14083    if (PyUnicode_READY(seq) == -1)
14084        return NULL;
14085    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14086    if (it == NULL)
14087        return NULL;
14088    it->it_index = 0;
14089    Py_INCREF(seq);
14090    it->it_seq = seq;
14091    _PyObject_GC_TRACK(it);
14092    return (PyObject *)it;
14093}
14094
14095
/* Return the number of Py_UNICODE units in `u` before the terminating
   null unit.  `u` must be null-terminated. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    /* Measure with pointers rather than an int counter: an int would
       overflow (undefined behaviour) on strings of INT_MAX or more units,
       although the return type is size_t. */
    while (*end)
        end++;
    return (size_t)(end - u);
}
14104
/* Copy the null-terminated string `s2`, terminator included, into `s1`
   and return `s1`.  The caller must provide enough room in `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    do {
        *dst = *s2++;
    } while (*dst++ != 0);
    return s1;
}
14112
/* Copy at most `n` units from the null-terminated string `s2` into `s1`
   and return `s1`.

   Copying stops after the terminating null unit has been copied or after
   `n` units have been written, whichever comes first.  Unlike C strncpy(),
   the rest of `s1` is not padded with null units; as with strncpy(), the
   result is not null-terminated when `s2` has `n` or more units. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    /* Check the budget *before* each store so that no more than `n` units
       are ever written, and nothing is written for n == 0. */
    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14122
14123Py_UNICODE*
14124Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14125{
14126    Py_UNICODE *u1 = s1;
14127    u1 += Py_UNICODE_strlen(u1);
14128    Py_UNICODE_strcpy(u1, s2);
14129    return s1;
14130}
14131
/* Compare two null-terminated strings unit by unit.  Returns 0 if they are
   equal, -1 if `s1` sorts before `s2` and 1 if it sorts after; a string
   that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;;) {
        const Py_UNICODE c1 = *s1++;
        const Py_UNICODE c2 = *s2++;

        if (c1 == 0)
            return (c2 == 0) ? 0 : -1;  /* s1 ended first, or both ended */
        if (c2 == 0)
            return 1;                   /* s2 ended first */
        if (c1 != c2)
            return (c1 < c2) ? -1 : +1;
    }
}
14145
/* Compare at most `n` units of two null-terminated strings.  Returns 0 if
   the compared parts are equal (including when n is 0), -1 if `s1` sorts
   before `s2` and 1 if it sorts after.  Comparison stops at the first
   difference or at a shared terminating null unit. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    while (n-- != 0) {
        const Py_UNICODE c1 = *s1++;
        const Py_UNICODE c2 = *s2++;

        if (c1 != c2)
            return (c1 < c2) ? -1 : +1;
        if (c1 == '\0')
            break;      /* both strings ended together */
    }
    return 0;
}
14162
/* Return a pointer to the first occurrence of `c` in the null-terminated
   string `s`, or NULL if there is none.  The terminating null unit is not
   part of the search, so looking for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14172
14173Py_UNICODE*
14174Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14175{
14176    const Py_UNICODE *p;
14177    p = s + Py_UNICODE_strlen(s);
14178    while (p != s) {
14179        p--;
14180        if (*p == c)
14181            return (Py_UNICODE*)p;
14182    }
14183    return NULL;
14184}
14185
14186Py_UNICODE*
14187PyUnicode_AsUnicodeCopy(PyObject *unicode)
14188{
14189    Py_UNICODE *u, *copy;
14190    Py_ssize_t len, size;
14191
14192    if (!PyUnicode_Check(unicode)) {
14193        PyErr_BadArgument();
14194        return NULL;
14195    }
14196    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14197    if (u == NULL)
14198        return NULL;
14199    /* Ensure we won't overflow the size. */
14200    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14201        PyErr_NoMemory();
14202        return NULL;
14203    }
14204    size = len + 1; /* copy the null character */
14205    size *= sizeof(Py_UNICODE);
14206    copy = PyMem_Malloc(size);
14207    if (copy == NULL) {
14208        PyErr_NoMemory();
14209        return NULL;
14210    }
14211    memcpy(copy, u, size);
14212    return copy;
14213}
14214
14215/* A _string module, to export formatter_parser and formatter_field_name_split
14216   to the string.Formatter class implemented in Python. */
14217
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}        /* sentinel */
};
14225
/* Module definition for _string, passed to PyModule_Create() by
   PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* m_size -- TODO confirm slot name
                                           against the PyModuleDef layout */
    _string_methods,                    /* method table */
    /* The remaining slots are left unset. */
    NULL,
    NULL,
    NULL,
    NULL
};
14237
14238PyMODINIT_FUNC
14239PyInit__string(void)
14240{
14241    return PyModule_Create(&_string_module);
14242}
14243
14244
14245#ifdef __cplusplus
14246}
14247#endif
14248