unicodeobject.c revision e45c0c5cefaa634ce1bf9efe95523756c60917c7
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
58/* --- Globals ------------------------------------------------------------
59
60   The globals are initialized by the _PyUnicode_Init() API and should
61   not be used before calling that API.
62
63*/
64
65
66#ifdef __cplusplus
67extern "C" {
68#endif
69
70/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
/* Sanity check used inside assert() by the accessor macros below.
   Debug builds (Py_DEBUG) run the structural consistency checker with
   check_content=0; other builds only perform the PyUnicode_Check() type
   test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
78
/* Raw, unchecked access to the utf8 pointer field of the
   PyCompactUnicodeObject part of the object (usable as an lvalue). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 data of a ready string.  For a compact
   ASCII string the result is the address immediately following the
   PyASCIIObject header; for every other string it is the stored utf8
   field.  Note that op is evaluated several times. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw, unchecked access to the utf8_length field (usable as an lvalue). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked counterpart of PyUnicode_UTF8(): for a compact ASCII string the
   UTF-8 length is the string length itself, otherwise it is the stored
   utf8_length field.  op is evaluated several times. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw field accessors.  These perform no checks and expand to lvalues,
   so they are used both to read and to assign the underlying struct
   members.  Which struct the pointer is cast to determines which header
   layout the field is taken from. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Like the raw accessors above, but assert _PyUnicode_CHECK(op) first.
   Unlike the public PyUnicode_KIND()/PyUnicode_GET_LENGTH() usage seen in
   this file, these do not assert that the string is ready. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data pointer of a non-compact (PyUnicodeObject)
   string; usable as an lvalue. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
113
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The result is the bitwise OR of the two values, not their true maximum:
   it is never smaller than either argument but may be larger than both. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
118
/* File-local replacement for the PyUnicode_READY() macro from the public
   header.  Evaluates to 0 when the string is already ready, otherwise to
   the result of _PyUnicode_Ready(op).  Asserts _PyUnicode_CHECK(op) first;
   op is evaluated several times. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
125
/* True if the utf8 pointer of the object is the very same pointer as its
   character data, i.e. the UTF-8 representation is shared with the data
   buffer rather than stored separately.  Must not be used on compact ASCII
   strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the object is the very same pointer as its
   character data.  The macro refers only to its own parameter, so it works
   regardless of what the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its
   utf8 pointer is non-NULL, and that pointer differs from the character
   data pointer.  op is evaluated several times. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
141
/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the string is not ready yet, or wstr differs from the character data
   pointer.  op is evaluated several times. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each element in [begin, end) is converted with a plain C cast and stored
   consecutively starting at to.  The main loop handles four elements per
   iteration; the trailing loop copies the remaining 0-3 elements.

   The to argument is parenthesized before being cast, so an expression
   such as "buf + offset" is cast as a whole instead of having the cast
   bind to "buf" only.  All internal names carry a leading underscore. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by the character's code point
   (0..255); a NULL slot means no string has been cached for it yet. */
static PyObject *unicode_latin1[256];
193
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code point; an entry is 1 when the
   character is treated as whitespace and 0 otherwise.  Each row below
   covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40..0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
224
225/* forward */
226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
227static PyObject* get_latin1_char(unsigned char ch);
228static void copy_characters(
229    PyObject *to, Py_ssize_t to_start,
230    PyObject *from, Py_ssize_t from_start,
231    Py_ssize_t how_many);
232static int unicode_modifiable(PyObject *unicode);
233
234
235static PyObject *
236unicode_fromascii(const unsigned char *s, Py_ssize_t size);
237static PyObject *
238_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
239static PyObject *
240_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
243
244static PyObject *
245unicode_encode_call_errorhandler(const char *errors,
246       PyObject **errorHandler,const char *encoding, const char *reason,
247       PyObject *unicode, PyObject **exceptionObject,
248       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
249
250static void
251raise_encode_exception(PyObject **exceptionObject,
252                       const char *encoding,
253                       PyObject *unicode,
254                       Py_ssize_t startpos, Py_ssize_t endpos,
255                       const char *reason);
256
/* Fast detection of line break characters in the ASCII range.
   128-entry table indexed by code point; an entry is 1 when the character
   is a line break and 0 otherwise.  Each row covers eight consecutive
   code points.  The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40..0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
284
285/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
286   This function is kept for backward compatibility with the old API. */
287Py_UNICODE
288PyUnicode_GetMax(void)
289{
290#ifdef Py_UNICODE_WIDE
291    return 0x10FFFF;
292#else
293    /* This is actually an illegal character, so it should
294       not be passed to unichr. */
295    return 0xFFFF;
296#endif
297}
298
#ifdef Py_DEBUG
/* Debug-only invariant checker for a unicode object.

   op must be a unicode object.  Every invariant is verified with assert(),
   so a violation aborts the process; when all checks pass the function
   returns 1 (which lets callers wrap the call itself in assert()).

   If check_content is non-zero and the string is not in the wstr-only
   state, the character data is also scanned to verify that the narrowest
   possible kind is in use and that the data is followed by a null
   character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the PyASCIIObject header is present. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: data follows the compact header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Non-compact object: data lives in a separate pointer. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer may be set. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* A non-compact ASCII string shares utf8 with its data;
                   any other string must not. */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind's width
               matches the width of wchar_t. */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cached representation must have a zero length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* Find the largest character actually stored. */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The element one past the last character must be null. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
414
415static PyObject*
416unicode_result_wchar(PyObject *unicode)
417{
418#ifndef Py_DEBUG
419    Py_ssize_t len;
420
421    assert(Py_REFCNT(unicode) == 1);
422
423    len = _PyUnicode_WSTR_LENGTH(unicode);
424    if (len == 0) {
425        Py_INCREF(unicode_empty);
426        Py_DECREF(unicode);
427        return unicode_empty;
428    }
429
430    if (len == 1) {
431        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
432        if (ch < 256) {
433            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434            Py_DECREF(unicode);
435            return latin1_char;
436        }
437    }
438
439    if (_PyUnicode_Ready(unicode) < 0) {
440        Py_XDECREF(unicode);
441        return NULL;
442    }
443#else
444    /* don't make the result ready in debug mode to ensure that the caller
445       makes the string ready before using it */
446    assert(_PyUnicode_CheckConsistency(unicode, 1));
447#endif
448    return unicode;
449}
450
451static PyObject*
452unicode_result_ready(PyObject *unicode)
453{
454    Py_ssize_t length;
455
456    length = PyUnicode_GET_LENGTH(unicode);
457    if (length == 0) {
458        if (unicode != unicode_empty) {
459            Py_INCREF(unicode_empty);
460            Py_DECREF(unicode);
461        }
462        return unicode_empty;
463    }
464
465    if (length == 1) {
466        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
467        if (ch < 256) {
468            PyObject *latin1_char = unicode_latin1[ch];
469            if (latin1_char != NULL) {
470                if (unicode != latin1_char) {
471                    Py_INCREF(latin1_char);
472                    Py_DECREF(unicode);
473                }
474                return latin1_char;
475            }
476            else {
477                assert(_PyUnicode_CheckConsistency(unicode, 1));
478                Py_INCREF(unicode);
479                unicode_latin1[ch] = unicode;
480                return unicode;
481            }
482        }
483    }
484
485    assert(_PyUnicode_CheckConsistency(unicode, 1));
486    return unicode;
487}
488
489static PyObject*
490unicode_result(PyObject *unicode)
491{
492    assert(_PyUnicode_CHECK(unicode));
493    if (PyUnicode_IS_READY(unicode))
494        return unicode_result_ready(unicode);
495    else
496        return unicode_result_wchar(unicode);
497}
498
499static PyObject*
500unicode_result_unchanged(PyObject *unicode)
501{
502    if (PyUnicode_CheckExact(unicode)) {
503        if (PyUnicode_READY(unicode) == -1)
504            return NULL;
505        Py_INCREF(unicode);
506        return unicode;
507    }
508    else
509        /* Subtype -- return genuine unicode string with the same value. */
510        return _PyUnicode_Copy(unicode);
511}
512
/* Windows version information record; only defined when HAVE_MBCS is set.
   It is neither filled in nor read in this part of the file. */
#ifdef HAVE_MBCS
static OSVERSIONINFOEX winver;
#endif
516
517/* --- Bloom Filters ----------------------------------------------------- */
518
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of bits used in the mask: the largest of
   128, 64 or 32 that does not exceed LONG_BIT.  Builds where LONG_BIT is
   below 32 are rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
534
/* Integer type holding a bloom filter bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the mask bit selected by the low bits
   of ch (ch & (BLOOM_WIDTH - 1)).  BLOOM yields non-zero when the bit is
   set, which means "ch may be in the set"; zero means it is definitely
   not.  The mask argument is parenthesized so that compound expressions
   such as "a | b" can be passed to BLOOM; BLOOM_ADD needs an lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up directly in the
   ascii_linebreak table; others are first filtered through the bloom mask
   and then confirmed with Py_UNICODE_ISLINEBREAK().  ch is evaluated more
   than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
545
546Py_LOCAL_INLINE(BLOOM_MASK)
547make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
548{
549    /* calculate simple bloom-style bitmask for a given unicode string */
550
551    BLOOM_MASK mask;
552    Py_ssize_t i;
553
554    mask = 0;
555    for (i = 0; i < len; i++)
556        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
557
558    return mask;
559}
560
/* True if chr occurs in str.  The bloom mask is consulted first as a quick
   rejection test; only when the bit is set is the whole string searched
   with PyUnicode_FindChar() to confirm.  Both chr and str are evaluated
   more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
564
565/* Compilation of templated routines */
566
567#include "stringlib/asciilib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs1lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs2lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
597#include "stringlib/ucs4lib.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/partition.h"
600#include "stringlib/split.h"
601#include "stringlib/count.h"
602#include "stringlib/find.h"
603#include "stringlib/find_max_char.h"
604#include "stringlib/localeutil.h"
605#include "stringlib/undef.h"
606
607#include "stringlib/unicodedefs.h"
608#include "stringlib/fastsearch.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
611#include "stringlib/undef.h"
612
613/* --- Unicode Object ----------------------------------------------------- */
614
615static PyObject *
616fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
617
618Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
619                                     Py_ssize_t size, Py_UCS4 ch,
620                                     int direction)
621{
622    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
623
624    switch (kind) {
625    case PyUnicode_1BYTE_KIND:
626        {
627            Py_UCS1 ch1 = (Py_UCS1) ch;
628            if (ch1 == ch)
629                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
630            else
631                return -1;
632        }
633    case PyUnicode_2BYTE_KIND:
634        {
635            Py_UCS2 ch2 = (Py_UCS2) ch;
636            if (ch2 == ch)
637                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
638            else
639                return -1;
640        }
641    case PyUnicode_4BYTE_KIND:
642        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
643    default:
644        assert(0);
645        return -1;
646    }
647}
648
/* Resize a compact, ready, modifiable string to hold length characters.

   The object itself is reallocated, so the returned pointer may differ
   from the argument; callers must continue with the returned object.
   Returns NULL with MemoryError set if the requested size would overflow
   or the reallocation fails. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* The kind value is used as the size in bytes of one character. */
    char_size = PyUnicode_KIND(unicode);
    /* The header preceding the character data depends on the layout. */
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* Remember whether wstr aliases the data before the block can move. */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* Guard struct_size + (length + 1) * char_size against overflow. */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* Header plus the characters plus one terminating null character. */
    new_size = (struct_size + (length + 1) * char_size);

    /* NOTE(review): presumably unregisters the object from debug-build
       reference tracking while its address may change -- confirm. */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
    if (new_unicode == NULL) {
        /* The original block is still in place: register it again. */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* Re-point wstr at the (possibly moved) character data. */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    /* Write the terminating null character at the new end. */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
697
/* Resize a non-compact string with a reference count of 1 in place to
   hold length characters.

   For a ready string the data buffer is reallocated, and any wstr/utf8
   pointers that alias it are updated to follow it; a separately allocated
   UTF-8 buffer is released.  If the string is not ready, or it still owns
   a separate wstr buffer, that wstr buffer is reallocated as well.

   Returns 0 on success, -1 with MemoryError set on overflow or allocation
   failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);
        /* The kind value is used as the size in bytes of one character. */
        char_size = PyUnicode_KIND(unicode);
        /* Record the aliasing before the data buffer can move. */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* Guard (length + 1) * char_size against overflow. */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* Drop a separately allocated UTF-8 buffer. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* Only the local variable is overwritten here, so on failure the
           object still refers to its original buffer. */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* Write the terminating null character at the new end. */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        /* Done unless a separate wstr buffer still has to be resized. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    /* As above, the object keeps its old buffer if this fails. */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
770
771static PyObject*
772resize_copy(PyObject *unicode, Py_ssize_t length)
773{
774    Py_ssize_t copy_length;
775    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
776        PyObject *copy;
777
778        if (PyUnicode_READY(unicode) == -1)
779            return NULL;
780
781        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
782        if (copy == NULL)
783            return NULL;
784
785        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
786        copy_characters(copy, 0, unicode, 0, copy_length);
787        return copy;
788    }
789    else {
790        PyObject *w;
791
792        w = (PyObject*)_PyUnicode_New(length);
793        if (w == NULL)
794            return NULL;
795        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
796        copy_length = Py_MIN(copy_length, length);
797        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
798                        copy_length);
799        return w;
800    }
801}
802
803/* We allocate one more byte to make sure the string is
804   Ux0000 terminated; some code (e.g. new_identifier)
805   relies on that.
806
807   XXX This allocator could further be enhanced by assuring that the
808   free list never reduces its size below 1.
809
810*/
811
812#ifdef Py_DEBUG
813static int unicode_old_new_calls = 0;
814#endif
815
816static PyUnicodeObject *
817_PyUnicode_New(Py_ssize_t length)
818{
819    register PyUnicodeObject *unicode;
820    size_t new_size;
821
822    /* Optimization for empty strings */
823    if (length == 0 && unicode_empty != NULL) {
824        Py_INCREF(unicode_empty);
825        return (PyUnicodeObject*)unicode_empty;
826    }
827
828    /* Ensure we won't overflow the size. */
829    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
830        return (PyUnicodeObject *)PyErr_NoMemory();
831    }
832    if (length < 0) {
833        PyErr_SetString(PyExc_SystemError,
834                        "Negative size passed to _PyUnicode_New");
835        return NULL;
836    }
837
838#ifdef Py_DEBUG
839    ++unicode_old_new_calls;
840#endif
841
842    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
843    if (unicode == NULL)
844        return NULL;
845    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
846    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
847    if (!_PyUnicode_WSTR(unicode)) {
848        Py_DECREF(unicode);
849        PyErr_NoMemory();
850        return NULL;
851    }
852
853    /* Initialize the first element to guard against cases where
854     * the caller fails before initializing str -- unicode_resize()
855     * reads str[0], and the Keep-Alive optimization can keep memory
856     * allocated for str alive across a call to unicode_dealloc(unicode).
857     * We don't want unicode_resize to read uninitialized memory in
858     * that case.
859     */
860    _PyUnicode_WSTR(unicode)[0] = 0;
861    _PyUnicode_WSTR(unicode)[length] = 0;
862    _PyUnicode_WSTR_LENGTH(unicode) = length;
863    _PyUnicode_HASH(unicode) = -1;
864    _PyUnicode_STATE(unicode).interned = 0;
865    _PyUnicode_STATE(unicode).kind = 0;
866    _PyUnicode_STATE(unicode).compact = 0;
867    _PyUnicode_STATE(unicode).ready = 0;
868    _PyUnicode_STATE(unicode).ascii = 0;
869    _PyUnicode_DATA_ANY(unicode) = NULL;
870    _PyUnicode_LENGTH(unicode) = 0;
871    _PyUnicode_UTF8(unicode) = NULL;
872    _PyUnicode_UTF8_LENGTH(unicode) = 0;
873    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
874    return unicode;
875}
876
877static const char*
878unicode_kind_name(PyObject *unicode)
879{
880    /* don't check consistency: unicode_kind_name() is called from
881       _PyUnicode_Dump() */
882    if (!PyUnicode_IS_COMPACT(unicode))
883    {
884        if (!PyUnicode_IS_READY(unicode))
885            return "wstr";
886        switch (PyUnicode_KIND(unicode))
887        {
888        case PyUnicode_1BYTE_KIND:
889            if (PyUnicode_IS_ASCII(unicode))
890                return "legacy ascii";
891            else
892                return "legacy latin1";
893        case PyUnicode_2BYTE_KIND:
894            return "legacy UCS2";
895        case PyUnicode_4BYTE_KIND:
896            return "legacy UCS4";
897        default:
898            return "<legacy invalid kind>";
899        }
900    }
901    assert(PyUnicode_IS_READY(unicode));
902    switch (PyUnicode_KIND(unicode)) {
903    case PyUnicode_1BYTE_KIND:
904        if (PyUnicode_IS_ASCII(unicode))
905            return "ascii";
906        else
907            return "latin1";
908    case PyUnicode_2BYTE_KIND:
909        return "UCS2";
910    case PyUnicode_4BYTE_KIND:
911        return "UCS4";
912    default:
913        return "<invalid compact kind>";
914    }
915}
916
917#ifdef Py_DEBUG
918static int unicode_new_new_calls = 0;
919
920/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
924
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
928void *_PyUnicode_data(void *unicode){
929    printf("obj %p\n", unicode);
930    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
931    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
932    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
933    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
934    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
935    return PyUnicode_DATA(unicode);
936}
937
938void
939_PyUnicode_Dump(PyObject *op)
940{
941    PyASCIIObject *ascii = (PyASCIIObject *)op;
942    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
943    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
944    void *data;
945
946    if (ascii->state.compact)
947    {
948        if (ascii->state.ascii)
949            data = (ascii + 1);
950        else
951            data = (compact + 1);
952    }
953    else
954        data = unicode->data.any;
955    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
956
957    if (ascii->wstr == data)
958        printf("shared ");
959    printf("wstr=%p", ascii->wstr);
960
961    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
962        printf(" (%zu), ", compact->wstr_length);
963        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
964            printf("shared ");
965        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
966    }
967    printf(", data=%p\n", data);
968}
969#endif
970
971PyObject *
972PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
973{
974    PyObject *obj;
975    PyCompactUnicodeObject *unicode;
976    void *data;
977    enum PyUnicode_Kind kind;
978    int is_sharing, is_ascii;
979    Py_ssize_t char_size;
980    Py_ssize_t struct_size;
981
982    /* Optimization for empty strings */
983    if (size == 0 && unicode_empty != NULL) {
984        Py_INCREF(unicode_empty);
985        return unicode_empty;
986    }
987
988#ifdef Py_DEBUG
989    ++unicode_new_new_calls;
990#endif
991
992    is_ascii = 0;
993    is_sharing = 0;
994    struct_size = sizeof(PyCompactUnicodeObject);
995    if (maxchar < 128) {
996        kind = PyUnicode_1BYTE_KIND;
997        char_size = 1;
998        is_ascii = 1;
999        struct_size = sizeof(PyASCIIObject);
1000    }
1001    else if (maxchar < 256) {
1002        kind = PyUnicode_1BYTE_KIND;
1003        char_size = 1;
1004    }
1005    else if (maxchar < 65536) {
1006        kind = PyUnicode_2BYTE_KIND;
1007        char_size = 2;
1008        if (sizeof(wchar_t) == 2)
1009            is_sharing = 1;
1010    }
1011    else {
1012        if (maxchar > MAX_UNICODE) {
1013            PyErr_SetString(PyExc_SystemError,
1014                            "invalid maximum character passed to PyUnicode_New");
1015            return NULL;
1016        }
1017        kind = PyUnicode_4BYTE_KIND;
1018        char_size = 4;
1019        if (sizeof(wchar_t) == 4)
1020            is_sharing = 1;
1021    }
1022
1023    /* Ensure we won't overflow the size. */
1024    if (size < 0) {
1025        PyErr_SetString(PyExc_SystemError,
1026                        "Negative size passed to PyUnicode_New");
1027        return NULL;
1028    }
1029    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1030        return PyErr_NoMemory();
1031
1032    /* Duplicated allocation code from _PyObject_New() instead of a call to
1033     * PyObject_New() so we are able to allocate space for the object and
1034     * it's data buffer.
1035     */
1036    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1037    if (obj == NULL)
1038        return PyErr_NoMemory();
1039    obj = PyObject_INIT(obj, &PyUnicode_Type);
1040    if (obj == NULL)
1041        return NULL;
1042
1043    unicode = (PyCompactUnicodeObject *)obj;
1044    if (is_ascii)
1045        data = ((PyASCIIObject*)obj) + 1;
1046    else
1047        data = unicode + 1;
1048    _PyUnicode_LENGTH(unicode) = size;
1049    _PyUnicode_HASH(unicode) = -1;
1050    _PyUnicode_STATE(unicode).interned = 0;
1051    _PyUnicode_STATE(unicode).kind = kind;
1052    _PyUnicode_STATE(unicode).compact = 1;
1053    _PyUnicode_STATE(unicode).ready = 1;
1054    _PyUnicode_STATE(unicode).ascii = is_ascii;
1055    if (is_ascii) {
1056        ((char*)data)[size] = 0;
1057        _PyUnicode_WSTR(unicode) = NULL;
1058    }
1059    else if (kind == PyUnicode_1BYTE_KIND) {
1060        ((char*)data)[size] = 0;
1061        _PyUnicode_WSTR(unicode) = NULL;
1062        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063        unicode->utf8 = NULL;
1064        unicode->utf8_length = 0;
1065    }
1066    else {
1067        unicode->utf8 = NULL;
1068        unicode->utf8_length = 0;
1069        if (kind == PyUnicode_2BYTE_KIND)
1070            ((Py_UCS2*)data)[size] = 0;
1071        else /* kind == PyUnicode_4BYTE_KIND */
1072            ((Py_UCS4*)data)[size] = 0;
1073        if (is_sharing) {
1074            _PyUnicode_WSTR_LENGTH(unicode) = size;
1075            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1076        }
1077        else {
1078            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1079            _PyUnicode_WSTR(unicode) = NULL;
1080        }
1081    }
1082#ifdef Py_DEBUG
1083    /* Fill the data with invalid characters to detect bugs earlier.
1084       _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1085       at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1086       and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1087    memset(data, 0xff, size * kind);
1088#endif
1089    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1090    return obj;
1091}
1092
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing into the
   4-byte data buffer of `unicode`.  A high surrogate directly followed by a
   low surrogate is joined into one code point; every other unit is copied
   unchanged.  Other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1132
1133static int
1134unicode_check_modifiable(PyObject *unicode)
1135{
1136    if (!unicode_modifiable(unicode)) {
1137        PyErr_SetString(PyExc_SystemError,
1138                        "Cannot modify a string currently used");
1139        return -1;
1140    }
1141    return 0;
1142}
1143
1144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146                 PyObject *from, Py_ssize_t from_start,
1147                 Py_ssize_t how_many, int check_maxchar)
1148{
1149    unsigned int from_kind, to_kind;
1150    void *from_data, *to_data;
1151    int fast;
1152
1153    assert(0 <= how_many);
1154    assert(0 <= from_start);
1155    assert(0 <= to_start);
1156    assert(PyUnicode_Check(from));
1157    assert(PyUnicode_Check(to));
1158    assert(PyUnicode_IS_READY(from));
1159    assert(PyUnicode_IS_READY(to));
1160    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1161    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1162
1163    if (how_many == 0)
1164        return 0;
1165
1166    from_kind = PyUnicode_KIND(from);
1167    from_data = PyUnicode_DATA(from);
1168    to_kind = PyUnicode_KIND(to);
1169    to_data = PyUnicode_DATA(to);
1170
1171#ifdef Py_DEBUG
1172    if (!check_maxchar
1173        && (from_kind > to_kind
1174            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
1175    {
1176        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1177        Py_UCS4 ch;
1178        Py_ssize_t i;
1179        for (i=0; i < how_many; i++) {
1180            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1181            assert(ch <= to_maxchar);
1182        }
1183    }
1184#endif
1185    fast = (from_kind == to_kind);
1186    if (check_maxchar
1187        && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1188    {
1189        /* deny latin1 => ascii */
1190        fast = 0;
1191    }
1192
1193    if (fast) {
1194        Py_MEMCPY((char*)to_data + to_kind * to_start,
1195                  (char*)from_data + from_kind * from_start,
1196                  to_kind * how_many);
1197    }
1198    else if (from_kind == PyUnicode_1BYTE_KIND
1199             && to_kind == PyUnicode_2BYTE_KIND)
1200    {
1201        _PyUnicode_CONVERT_BYTES(
1202            Py_UCS1, Py_UCS2,
1203            PyUnicode_1BYTE_DATA(from) + from_start,
1204            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1205            PyUnicode_2BYTE_DATA(to) + to_start
1206            );
1207    }
1208    else if (from_kind == PyUnicode_1BYTE_KIND
1209             && to_kind == PyUnicode_4BYTE_KIND)
1210    {
1211        _PyUnicode_CONVERT_BYTES(
1212            Py_UCS1, Py_UCS4,
1213            PyUnicode_1BYTE_DATA(from) + from_start,
1214            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1215            PyUnicode_4BYTE_DATA(to) + to_start
1216            );
1217    }
1218    else if (from_kind == PyUnicode_2BYTE_KIND
1219             && to_kind == PyUnicode_4BYTE_KIND)
1220    {
1221        _PyUnicode_CONVERT_BYTES(
1222            Py_UCS2, Py_UCS4,
1223            PyUnicode_2BYTE_DATA(from) + from_start,
1224            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1225            PyUnicode_4BYTE_DATA(to) + to_start
1226            );
1227    }
1228    else {
1229        /* check if max_char(from substring) <= max_char(to) */
1230        if (from_kind > to_kind
1231                /* latin1 => ascii */
1232            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1233        {
1234            /* slow path to check for character overflow */
1235            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1236            Py_UCS4 ch;
1237            Py_ssize_t i;
1238
1239#ifdef Py_DEBUG
1240            for (i=0; i < how_many; i++) {
1241                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1242                assert(ch <= to_maxchar);
1243                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1244            }
1245#else
1246            if (!check_maxchar) {
1247                for (i=0; i < how_many; i++) {
1248                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1249                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1250                }
1251            }
1252            else {
1253                for (i=0; i < how_many; i++) {
1254                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1255                    if (ch > to_maxchar)
1256                        return 1;
1257                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1258                }
1259            }
1260#endif
1261        }
1262        else {
1263            assert(0 && "inconsistent state");
1264            return 1;
1265        }
1266    }
1267    return 0;
1268}
1269
1270static void
1271copy_characters(PyObject *to, Py_ssize_t to_start,
1272                       PyObject *from, Py_ssize_t from_start,
1273                       Py_ssize_t how_many)
1274{
1275    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1276}
1277
1278Py_ssize_t
1279PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1280                         PyObject *from, Py_ssize_t from_start,
1281                         Py_ssize_t how_many)
1282{
1283    int err;
1284
1285    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1286        PyErr_BadInternalCall();
1287        return -1;
1288    }
1289
1290    if (PyUnicode_READY(from) == -1)
1291        return -1;
1292    if (PyUnicode_READY(to) == -1)
1293        return -1;
1294
1295    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1296    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1297        PyErr_Format(PyExc_SystemError,
1298                     "Cannot write %zi characters at %zi "
1299                     "in a string of %zi characters",
1300                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1301        return -1;
1302    }
1303
1304    if (how_many == 0)
1305        return 0;
1306
1307    if (unicode_check_modifiable(to))
1308        return -1;
1309
1310    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1311    if (err) {
1312        PyErr_Format(PyExc_SystemError,
1313                     "Cannot copy %s characters "
1314                     "into a string of %s characters",
1315                     unicode_kind_name(from),
1316                     unicode_kind_name(to));
1317        return -1;
1318    }
1319    return how_many;
1320}
1321
1322/* Find the maximum code point and count the number of surrogate pairs so a
1323   correct string length can be computed before converting a string to UCS4.
1324   This function counts single surrogates as a character and not as a pair.
1325
1326   Return 0 on success, or -1 on error. */
1327static int
1328find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1329                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1330{
1331    const wchar_t *iter;
1332    Py_UCS4 ch;
1333
1334    assert(num_surrogates != NULL && maxchar != NULL);
1335    *num_surrogates = 0;
1336    *maxchar = 0;
1337
1338    for (iter = begin; iter < end; ) {
1339#if SIZEOF_WCHAR_T == 2
1340        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1341            && (iter+1) < end
1342            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1343        {
1344            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1345            ++(*num_surrogates);
1346            iter += 2;
1347        }
1348        else
1349#endif
1350        {
1351            ch = *iter;
1352            iter++;
1353        }
1354        if (ch > *maxchar) {
1355            *maxchar = ch;
1356            if (*maxchar > MAX_UNICODE) {
1357                PyErr_Format(PyExc_ValueError,
1358                             "character U+%x is not in range [U+0000; U+10ffff]",
1359                             ch);
1360                return -1;
1361            }
1362        }
1363    }
1364    return 0;
1365}
1366
1367#ifdef Py_DEBUG
1368static int unicode_ready_calls = 0;
1369#endif
1370
1371int
1372_PyUnicode_Ready(PyObject *unicode)
1373{
1374    wchar_t *end;
1375    Py_UCS4 maxchar = 0;
1376    Py_ssize_t num_surrogates;
1377#if SIZEOF_WCHAR_T == 2
1378    Py_ssize_t length_wo_surrogates;
1379#endif
1380
1381    /* _PyUnicode_Ready() is only intended for old-style API usage where
1382       strings were created using _PyObject_New() and where no canonical
1383       representation (the str field) has been set yet aka strings
1384       which are not yet ready. */
1385    assert(_PyUnicode_CHECK(unicode));
1386    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1387    assert(_PyUnicode_WSTR(unicode) != NULL);
1388    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1389    assert(_PyUnicode_UTF8(unicode) == NULL);
1390    /* Actually, it should neither be interned nor be anything else: */
1391    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1392
1393#ifdef Py_DEBUG
1394    ++unicode_ready_calls;
1395#endif
1396
1397    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1398    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1399                                &maxchar, &num_surrogates) == -1)
1400        return -1;
1401
1402    if (maxchar < 256) {
1403        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1404        if (!_PyUnicode_DATA_ANY(unicode)) {
1405            PyErr_NoMemory();
1406            return -1;
1407        }
1408        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1409                                _PyUnicode_WSTR(unicode), end,
1410                                PyUnicode_1BYTE_DATA(unicode));
1411        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1412        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1413        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1414        if (maxchar < 128) {
1415            _PyUnicode_STATE(unicode).ascii = 1;
1416            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1417            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1418        }
1419        else {
1420            _PyUnicode_STATE(unicode).ascii = 0;
1421            _PyUnicode_UTF8(unicode) = NULL;
1422            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1423        }
1424        PyObject_FREE(_PyUnicode_WSTR(unicode));
1425        _PyUnicode_WSTR(unicode) = NULL;
1426        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1427    }
1428    /* In this case we might have to convert down from 4-byte native
1429       wchar_t to 2-byte unicode. */
1430    else if (maxchar < 65536) {
1431        assert(num_surrogates == 0 &&
1432               "FindMaxCharAndNumSurrogatePairs() messed up");
1433
1434#if SIZEOF_WCHAR_T == 2
1435        /* We can share representations and are done. */
1436        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1437        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1438        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1439        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1440        _PyUnicode_UTF8(unicode) = NULL;
1441        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1442#else
1443        /* sizeof(wchar_t) == 4 */
1444        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1445            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1446        if (!_PyUnicode_DATA_ANY(unicode)) {
1447            PyErr_NoMemory();
1448            return -1;
1449        }
1450        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1451                                _PyUnicode_WSTR(unicode), end,
1452                                PyUnicode_2BYTE_DATA(unicode));
1453        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1456        _PyUnicode_UTF8(unicode) = NULL;
1457        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1458        PyObject_FREE(_PyUnicode_WSTR(unicode));
1459        _PyUnicode_WSTR(unicode) = NULL;
1460        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1461#endif
1462    }
1463    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1464    else {
1465#if SIZEOF_WCHAR_T == 2
1466        /* in case the native representation is 2-bytes, we need to allocate a
1467           new normalized 4-byte version. */
1468        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1469        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1470        if (!_PyUnicode_DATA_ANY(unicode)) {
1471            PyErr_NoMemory();
1472            return -1;
1473        }
1474        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1475        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1476        _PyUnicode_UTF8(unicode) = NULL;
1477        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1478        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1479        _PyUnicode_STATE(unicode).ready = 1;
1480        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1481        PyObject_FREE(_PyUnicode_WSTR(unicode));
1482        _PyUnicode_WSTR(unicode) = NULL;
1483        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1484#else
1485        assert(num_surrogates == 0);
1486
1487        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1488        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1489        _PyUnicode_UTF8(unicode) = NULL;
1490        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1491        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1492#endif
1493        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1494    }
1495    _PyUnicode_STATE(unicode).ready = 1;
1496    assert(_PyUnicode_CheckConsistency(unicode, 1));
1497    return 0;
1498}
1499
1500static void
1501unicode_dealloc(register PyObject *unicode)
1502{
1503    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1504    case SSTATE_NOT_INTERNED:
1505        break;
1506
1507    case SSTATE_INTERNED_MORTAL:
1508        /* revive dead object temporarily for DelItem */
1509        Py_REFCNT(unicode) = 3;
1510        if (PyDict_DelItem(interned, unicode) != 0)
1511            Py_FatalError(
1512                "deletion of interned string failed");
1513        break;
1514
1515    case SSTATE_INTERNED_IMMORTAL:
1516        Py_FatalError("Immortal interned string died.");
1517
1518    default:
1519        Py_FatalError("Inconsistent interned string state.");
1520    }
1521
1522    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1523        PyObject_DEL(_PyUnicode_WSTR(unicode));
1524    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1525        PyObject_DEL(_PyUnicode_UTF8(unicode));
1526    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1527        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1528
1529    Py_TYPE(unicode)->tp_free(unicode);
1530}
1531
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the shared empty string or one of the cached
   single-character strings in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* only a 1-character string with a real kind can be in the cache */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1548
1549static int
1550unicode_modifiable(PyObject *unicode)
1551{
1552    assert(_PyUnicode_CHECK(unicode));
1553    if (Py_REFCNT(unicode) != 1)
1554        return 0;
1555    if (_PyUnicode_HASH(unicode) != -1)
1556        return 0;
1557    if (PyUnicode_CHECK_INTERNED(unicode))
1558        return 0;
1559    if (!PyUnicode_CheckExact(unicode))
1560        return 0;
1561#ifdef Py_DEBUG
1562    /* singleton refcount is greater than 1 */
1563    assert(!unicode_is_singleton(unicode));
1564#endif
1565    return 1;
1566}
1567
1568static int
1569unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1570{
1571    PyObject *unicode;
1572    Py_ssize_t old_length;
1573
1574    assert(p_unicode != NULL);
1575    unicode = *p_unicode;
1576
1577    assert(unicode != NULL);
1578    assert(PyUnicode_Check(unicode));
1579    assert(0 <= length);
1580
1581    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1582        old_length = PyUnicode_WSTR_LENGTH(unicode);
1583    else
1584        old_length = PyUnicode_GET_LENGTH(unicode);
1585    if (old_length == length)
1586        return 0;
1587
1588    if (length == 0) {
1589        Py_DECREF(*p_unicode);
1590        *p_unicode = unicode_empty;
1591        Py_INCREF(*p_unicode);
1592        return 0;
1593    }
1594
1595    if (!unicode_modifiable(unicode)) {
1596        PyObject *copy = resize_copy(unicode, length);
1597        if (copy == NULL)
1598            return -1;
1599        Py_DECREF(*p_unicode);
1600        *p_unicode = copy;
1601        return 0;
1602    }
1603
1604    if (PyUnicode_IS_COMPACT(unicode)) {
1605        PyObject *new_unicode = resize_compact(unicode, length);
1606        if (new_unicode == NULL)
1607            return -1;
1608        *p_unicode = new_unicode;
1609        return 0;
1610    }
1611    return resize_inplace(unicode, length);
1612}
1613
1614int
1615PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1616{
1617    PyObject *unicode;
1618    if (p_unicode == NULL) {
1619        PyErr_BadInternalCall();
1620        return -1;
1621    }
1622    unicode = *p_unicode;
1623    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1624    {
1625        PyErr_BadInternalCall();
1626        return -1;
1627    }
1628    return unicode_resize(p_unicode, length);
1629}
1630
1631static int
1632unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1633              unsigned int maxchar)
1634{
1635    PyObject *result;
1636    assert(PyUnicode_IS_READY(*p_unicode));
1637    assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
1638    if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1639        return 0;
1640    result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1641                           maxchar);
1642    if (result == NULL)
1643        return -1;
1644    PyUnicode_CopyCharacters(result, 0, *p_unicode, 0, length);
1645    Py_DECREF(*p_unicode);
1646    *p_unicode = result;
1647    return 0;
1648}
1649
1650static int
1651unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1652                Py_UCS4 ch)
1653{
1654    assert(ch <= MAX_UNICODE);
1655    if (unicode_widen(p_unicode, *pos, ch) < 0)
1656        return -1;
1657    PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1658                    PyUnicode_DATA(*p_unicode),
1659                    (*pos)++, ch);
1660    return 0;
1661}
1662
1663/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1664   Return the length of the input string.
1665
1666   WARNING: The function doesn't copy the terminating null character and
1667   doesn't check the maximum character (may write a latin1 character in an
1668   ASCII string). */
1669static Py_ssize_t
1670unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1671{
1672    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1673    void *data = PyUnicode_DATA(unicode);
1674
1675    switch (kind) {
1676    case PyUnicode_1BYTE_KIND: {
1677        Py_ssize_t len = strlen(str);
1678        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1679        memcpy((char *) data + index, str, len);
1680        return len;
1681    }
1682    case PyUnicode_2BYTE_KIND: {
1683        Py_UCS2 *start = (Py_UCS2 *)data + index;
1684        Py_UCS2 *ucs2 = start;
1685        assert(index <= PyUnicode_GET_LENGTH(unicode));
1686
1687        for (; *str; ++ucs2, ++str)
1688            *ucs2 = (Py_UCS2)*str;
1689
1690        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1691        return ucs2 - start;
1692    }
1693    default: {
1694        Py_UCS4 *start = (Py_UCS4 *)data + index;
1695        Py_UCS4 *ucs4 = start;
1696        assert(kind == PyUnicode_4BYTE_KIND);
1697        assert(index <= PyUnicode_GET_LENGTH(unicode));
1698
1699        for (; *str; ++ucs4, ++str)
1700            *ucs4 = (Py_UCS4)*str;
1701
1702        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1703        return ucs4 - start;
1704    }
1705    }
1706}
1707
1708
1709static PyObject*
1710get_latin1_char(unsigned char ch)
1711{
1712    PyObject *unicode = unicode_latin1[ch];
1713    if (!unicode) {
1714        unicode = PyUnicode_New(1, ch);
1715        if (!unicode)
1716            return NULL;
1717        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1718        assert(_PyUnicode_CheckConsistency(unicode, 1));
1719        unicode_latin1[ch] = unicode;
1720    }
1721    Py_INCREF(unicode);
1722    return unicode;
1723}
1724
1725PyObject *
1726PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1727{
1728    PyObject *unicode;
1729    Py_UCS4 maxchar = 0;
1730    Py_ssize_t num_surrogates;
1731
1732    if (u == NULL)
1733        return (PyObject*)_PyUnicode_New(size);
1734
1735    /* If the Unicode data is known at construction time, we can apply
1736       some optimizations which share commonly used objects. */
1737
1738    /* Optimization for empty strings */
1739    if (size == 0 && unicode_empty != NULL) {
1740        Py_INCREF(unicode_empty);
1741        return unicode_empty;
1742    }
1743
1744    /* Single character Unicode objects in the Latin-1 range are
1745       shared when using this constructor */
1746    if (size == 1 && *u < 256)
1747        return get_latin1_char((unsigned char)*u);
1748
1749    /* If not empty and not single character, copy the Unicode data
1750       into the new object */
1751    if (find_maxchar_surrogates(u, u + size,
1752                                &maxchar, &num_surrogates) == -1)
1753        return NULL;
1754
1755    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1756    if (!unicode)
1757        return NULL;
1758
1759    switch (PyUnicode_KIND(unicode)) {
1760    case PyUnicode_1BYTE_KIND:
1761        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1762                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1763        break;
1764    case PyUnicode_2BYTE_KIND:
1765#if Py_UNICODE_SIZE == 2
1766        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1767#else
1768        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1769                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1770#endif
1771        break;
1772    case PyUnicode_4BYTE_KIND:
1773#if SIZEOF_WCHAR_T == 2
1774        /* This is the only case which has to process surrogates, thus
1775           a simple copy loop is not enough and we need a function. */
1776        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1777#else
1778        assert(num_surrogates == 0);
1779        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1780#endif
1781        break;
1782    default:
1783        assert(0 && "Impossible state");
1784    }
1785
1786    return unicode_result(unicode);
1787}
1788
1789PyObject *
1790PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1791{
1792    if (size < 0) {
1793        PyErr_SetString(PyExc_SystemError,
1794                        "Negative size passed to PyUnicode_FromStringAndSize");
1795        return NULL;
1796    }
1797    if (u != NULL)
1798        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1799    else
1800        return (PyObject *)_PyUnicode_New(size);
1801}
1802
1803PyObject *
1804PyUnicode_FromString(const char *u)
1805{
1806    size_t size = strlen(u);
1807    if (size > PY_SSIZE_T_MAX) {
1808        PyErr_SetString(PyExc_OverflowError, "input too long");
1809        return NULL;
1810    }
1811    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1812}
1813
1814PyObject *
1815_PyUnicode_FromId(_Py_Identifier *id)
1816{
1817    if (!id->object) {
1818        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1819                                                  strlen(id->string),
1820                                                  NULL, NULL);
1821        if (!id->object)
1822            return NULL;
1823        PyUnicode_InternInPlace(&id->object);
1824        assert(!id->next);
1825        id->next = static_strings;
1826        static_strings = id;
1827    }
1828    return id->object;
1829}
1830
1831void
1832_PyUnicode_ClearStaticStrings()
1833{
1834    _Py_Identifier *i;
1835    for (i = static_strings; i; i = i->next) {
1836        Py_DECREF(i->object);
1837        i->object = NULL;
1838        i->next = NULL;
1839    }
1840}
1841
1842/* Internal function, doesn't check maximum character */
1843
1844static PyObject*
1845unicode_fromascii(const unsigned char* s, Py_ssize_t size)
1846{
1847    PyObject *unicode;
1848    if (size == 1) {
1849#ifdef Py_DEBUG
1850        assert(s[0] < 128);
1851#endif
1852        return get_latin1_char(s[0]);
1853    }
1854    unicode = PyUnicode_New(size, 127);
1855    if (!unicode)
1856        return NULL;
1857    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1858    assert(_PyUnicode_CheckConsistency(unicode, 1));
1859    return unicode;
1860}
1861
1862static Py_UCS4
1863kind_maxchar_limit(unsigned int kind)
1864{
1865    switch (kind) {
1866    case PyUnicode_1BYTE_KIND:
1867        return 0x80;
1868    case PyUnicode_2BYTE_KIND:
1869        return 0x100;
1870    case PyUnicode_4BYTE_KIND:
1871        return 0x10000;
1872    default:
1873        assert(0 && "invalid kind");
1874        return MAX_UNICODE;
1875    }
1876}
1877
1878Py_LOCAL_INLINE(Py_UCS4)
1879align_maxchar(Py_UCS4 maxchar)
1880{
1881    if (maxchar <= 127)
1882        return 127;
1883    else if (maxchar <= 255)
1884        return 255;
1885    else if (maxchar <= 65535)
1886        return 65535;
1887    else
1888        return MAX_UNICODE;
1889}
1890
1891static PyObject*
1892_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1893{
1894    PyObject *res;
1895    unsigned char max_char;
1896
1897    if (size == 0) {
1898        Py_INCREF(unicode_empty);
1899        return unicode_empty;
1900    }
1901    assert(size > 0);
1902    if (size == 1)
1903        return get_latin1_char(u[0]);
1904
1905    max_char = ucs1lib_find_max_char(u, u + size);
1906    res = PyUnicode_New(size, max_char);
1907    if (!res)
1908        return NULL;
1909    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1910    assert(_PyUnicode_CheckConsistency(res, 1));
1911    return res;
1912}
1913
1914static PyObject*
1915_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1916{
1917    PyObject *res;
1918    Py_UCS2 max_char;
1919
1920    if (size == 0) {
1921        Py_INCREF(unicode_empty);
1922        return unicode_empty;
1923    }
1924    assert(size > 0);
1925    if (size == 1) {
1926        Py_UCS4 ch = u[0];
1927        if (ch < 256)
1928            return get_latin1_char((unsigned char)ch);
1929
1930        res = PyUnicode_New(1, ch);
1931        if (res == NULL)
1932            return NULL;
1933        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1934        assert(_PyUnicode_CheckConsistency(res, 1));
1935        return res;
1936    }
1937
1938    max_char = ucs2lib_find_max_char(u, u + size);
1939    res = PyUnicode_New(size, max_char);
1940    if (!res)
1941        return NULL;
1942    if (max_char >= 256)
1943        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1944    else {
1945        _PyUnicode_CONVERT_BYTES(
1946            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1947    }
1948    assert(_PyUnicode_CheckConsistency(res, 1));
1949    return res;
1950}
1951
1952static PyObject*
1953_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1954{
1955    PyObject *res;
1956    Py_UCS4 max_char;
1957
1958    if (size == 0) {
1959        Py_INCREF(unicode_empty);
1960        return unicode_empty;
1961    }
1962    assert(size > 0);
1963    if (size == 1) {
1964        Py_UCS4 ch = u[0];
1965        if (ch < 256)
1966            return get_latin1_char((unsigned char)ch);
1967
1968        res = PyUnicode_New(1, ch);
1969        if (res == NULL)
1970            return NULL;
1971        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1972        assert(_PyUnicode_CheckConsistency(res, 1));
1973        return res;
1974    }
1975
1976    max_char = ucs4lib_find_max_char(u, u + size);
1977    res = PyUnicode_New(size, max_char);
1978    if (!res)
1979        return NULL;
1980    if (max_char < 256)
1981        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1982                                 PyUnicode_1BYTE_DATA(res));
1983    else if (max_char < 0x10000)
1984        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1985                                 PyUnicode_2BYTE_DATA(res));
1986    else
1987        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1988    assert(_PyUnicode_CheckConsistency(res, 1));
1989    return res;
1990}
1991
1992PyObject*
1993PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1994{
1995    if (size < 0) {
1996        PyErr_SetString(PyExc_ValueError, "size must be positive");
1997        return NULL;
1998    }
1999    switch (kind) {
2000    case PyUnicode_1BYTE_KIND:
2001        return _PyUnicode_FromUCS1(buffer, size);
2002    case PyUnicode_2BYTE_KIND:
2003        return _PyUnicode_FromUCS2(buffer, size);
2004    case PyUnicode_4BYTE_KIND:
2005        return _PyUnicode_FromUCS4(buffer, size);
2006    default:
2007        PyErr_SetString(PyExc_SystemError, "invalid kind");
2008        return NULL;
2009    }
2010}
2011
2012Py_UCS4
2013_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2014{
2015    enum PyUnicode_Kind kind;
2016    void *startptr, *endptr;
2017
2018    assert(PyUnicode_IS_READY(unicode));
2019    assert(0 <= start);
2020    assert(end <= PyUnicode_GET_LENGTH(unicode));
2021    assert(start <= end);
2022
2023    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2024        return PyUnicode_MAX_CHAR_VALUE(unicode);
2025
2026    if (start == end)
2027        return 127;
2028
2029    if (PyUnicode_IS_ASCII(unicode))
2030        return 127;
2031
2032    kind = PyUnicode_KIND(unicode);
2033    startptr = PyUnicode_DATA(unicode);
2034    endptr = (char *)startptr + end * kind;
2035    startptr = (char *)startptr + start * kind;
2036    switch(kind) {
2037    case PyUnicode_1BYTE_KIND:
2038        return ucs1lib_find_max_char(startptr, endptr);
2039    case PyUnicode_2BYTE_KIND:
2040        return ucs2lib_find_max_char(startptr, endptr);
2041    case PyUnicode_4BYTE_KIND:
2042        return ucs4lib_find_max_char(startptr, endptr);
2043    default:
2044        assert(0);
2045        return 0;
2046    }
2047}
2048
2049/* Ensure that a string uses the most efficient storage, if it is not the
2050   case: create a new string with of the right kind. Write NULL into *p_unicode
2051   on error. */
2052static void
2053unicode_adjust_maxchar(PyObject **p_unicode)
2054{
2055    PyObject *unicode, *copy;
2056    Py_UCS4 max_char;
2057    Py_ssize_t len;
2058    unsigned int kind;
2059
2060    assert(p_unicode != NULL);
2061    unicode = *p_unicode;
2062    assert(PyUnicode_IS_READY(unicode));
2063    if (PyUnicode_IS_ASCII(unicode))
2064        return;
2065
2066    len = PyUnicode_GET_LENGTH(unicode);
2067    kind = PyUnicode_KIND(unicode);
2068    if (kind == PyUnicode_1BYTE_KIND) {
2069        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2070        max_char = ucs1lib_find_max_char(u, u + len);
2071        if (max_char >= 128)
2072            return;
2073    }
2074    else if (kind == PyUnicode_2BYTE_KIND) {
2075        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2076        max_char = ucs2lib_find_max_char(u, u + len);
2077        if (max_char >= 256)
2078            return;
2079    }
2080    else {
2081        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2082        assert(kind == PyUnicode_4BYTE_KIND);
2083        max_char = ucs4lib_find_max_char(u, u + len);
2084        if (max_char >= 0x10000)
2085            return;
2086    }
2087    copy = PyUnicode_New(len, max_char);
2088    copy_characters(copy, 0, unicode, 0, len);
2089    Py_DECREF(unicode);
2090    *p_unicode = copy;
2091}
2092
2093PyObject*
2094_PyUnicode_Copy(PyObject *unicode)
2095{
2096    Py_ssize_t length;
2097    PyObject *copy;
2098
2099    if (!PyUnicode_Check(unicode)) {
2100        PyErr_BadInternalCall();
2101        return NULL;
2102    }
2103    if (PyUnicode_READY(unicode) == -1)
2104        return NULL;
2105
2106    length = PyUnicode_GET_LENGTH(unicode);
2107    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2108    if (!copy)
2109        return NULL;
2110    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2111
2112    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2113              length * PyUnicode_KIND(unicode));
2114    assert(_PyUnicode_CheckConsistency(copy, 1));
2115    return copy;
2116}
2117
2118
2119/* Widen Unicode objects to larger buffers. Don't write terminating null
2120   character. Return NULL on error. */
2121
2122void*
2123_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2124{
2125    Py_ssize_t len;
2126    void *result;
2127    unsigned int skind;
2128
2129    if (PyUnicode_READY(s) == -1)
2130        return NULL;
2131
2132    len = PyUnicode_GET_LENGTH(s);
2133    skind = PyUnicode_KIND(s);
2134    if (skind >= kind) {
2135        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2136        return NULL;
2137    }
2138    switch (kind) {
2139    case PyUnicode_2BYTE_KIND:
2140        result = PyMem_Malloc(len * sizeof(Py_UCS2));
2141        if (!result)
2142            return PyErr_NoMemory();
2143        assert(skind == PyUnicode_1BYTE_KIND);
2144        _PyUnicode_CONVERT_BYTES(
2145            Py_UCS1, Py_UCS2,
2146            PyUnicode_1BYTE_DATA(s),
2147            PyUnicode_1BYTE_DATA(s) + len,
2148            result);
2149        return result;
2150    case PyUnicode_4BYTE_KIND:
2151        result = PyMem_Malloc(len * sizeof(Py_UCS4));
2152        if (!result)
2153            return PyErr_NoMemory();
2154        if (skind == PyUnicode_2BYTE_KIND) {
2155            _PyUnicode_CONVERT_BYTES(
2156                Py_UCS2, Py_UCS4,
2157                PyUnicode_2BYTE_DATA(s),
2158                PyUnicode_2BYTE_DATA(s) + len,
2159                result);
2160        }
2161        else {
2162            assert(skind == PyUnicode_1BYTE_KIND);
2163            _PyUnicode_CONVERT_BYTES(
2164                Py_UCS1, Py_UCS4,
2165                PyUnicode_1BYTE_DATA(s),
2166                PyUnicode_1BYTE_DATA(s) + len,
2167                result);
2168        }
2169        return result;
2170    default:
2171        break;
2172    }
2173    PyErr_SetString(PyExc_SystemError, "invalid kind");
2174    return NULL;
2175}
2176
2177static Py_UCS4*
2178as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2179        int copy_null)
2180{
2181    int kind;
2182    void *data;
2183    Py_ssize_t len, targetlen;
2184    if (PyUnicode_READY(string) == -1)
2185        return NULL;
2186    kind = PyUnicode_KIND(string);
2187    data = PyUnicode_DATA(string);
2188    len = PyUnicode_GET_LENGTH(string);
2189    targetlen = len;
2190    if (copy_null)
2191        targetlen++;
2192    if (!target) {
2193        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2194            PyErr_NoMemory();
2195            return NULL;
2196        }
2197        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2198        if (!target) {
2199            PyErr_NoMemory();
2200            return NULL;
2201        }
2202    }
2203    else {
2204        if (targetsize < targetlen) {
2205            PyErr_Format(PyExc_SystemError,
2206                         "string is longer than the buffer");
2207            if (copy_null && 0 < targetsize)
2208                target[0] = 0;
2209            return NULL;
2210        }
2211    }
2212    if (kind == PyUnicode_1BYTE_KIND) {
2213        Py_UCS1 *start = (Py_UCS1 *) data;
2214        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2215    }
2216    else if (kind == PyUnicode_2BYTE_KIND) {
2217        Py_UCS2 *start = (Py_UCS2 *) data;
2218        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2219    }
2220    else {
2221        assert(kind == PyUnicode_4BYTE_KIND);
2222        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2223    }
2224    if (copy_null)
2225        target[len] = 0;
2226    return target;
2227}
2228
2229Py_UCS4*
2230PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2231                 int copy_null)
2232{
2233    if (target == NULL || targetsize < 0) {
2234        PyErr_BadInternalCall();
2235        return NULL;
2236    }
2237    return as_ucs4(string, target, targetsize, copy_null);
2238}
2239
2240Py_UCS4*
2241PyUnicode_AsUCS4Copy(PyObject *string)
2242{
2243    return as_ucs4(string, NULL, 0, 1);
2244}
2245
2246#ifdef HAVE_WCHAR_H
2247
2248PyObject *
2249PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2250{
2251    if (w == NULL) {
2252        if (size == 0) {
2253            Py_INCREF(unicode_empty);
2254            return unicode_empty;
2255        }
2256        PyErr_BadInternalCall();
2257        return NULL;
2258    }
2259
2260    if (size == -1) {
2261        size = wcslen(w);
2262    }
2263
2264    return PyUnicode_FromUnicode(w, size);
2265}
2266
2267#endif /* HAVE_WCHAR_H */
2268
2269static void
2270makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2271        int zeropad, int width, int precision, char c)
2272{
2273    *fmt++ = '%';
2274    if (width) {
2275        if (zeropad)
2276            *fmt++ = '0';
2277        fmt += sprintf(fmt, "%d", width);
2278    }
2279    if (precision)
2280        fmt += sprintf(fmt, ".%d", precision);
2281    if (longflag)
2282        *fmt++ = 'l';
2283    else if (longlongflag) {
2284        /* longlongflag should only ever be nonzero on machines with
2285           HAVE_LONG_LONG defined */
2286#ifdef HAVE_LONG_LONG
2287        char *f = PY_FORMAT_LONG_LONG;
2288        while (*f)
2289            *fmt++ = *f++;
2290#else
2291        /* we shouldn't ever get here */
2292        assert(0);
2293        *fmt++ = 'l';
2294#endif
2295    }
2296    else if (size_tflag) {
2297        char *f = PY_FORMAT_SIZE_T;
2298        while (*f)
2299            *fmt++ = *f++;
2300    }
2301    *fmt++ = c;
2302    *fmt = '\0';
2303}
2304
/* Helper for PyUnicode_FromFormatV(): parse the optional parts of a single
   conversion specification.

   `f` must point at the '%' that starts the specification.  The function
   reads an optional width, an optional ".precision" and an optional length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z"; each is only
   recognised when directly followed by 'd', 'u' or 'i').  Every p_*
   argument may be NULL; non-NULL ones receive the parsed value (0 when the
   part is absent).

   Returns a pointer to the character following the parsed parts, e.g. the
   'd' of "%ld".  For the malformed inputs handled below the pointer is
   moved back by one character instead. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    const char *cur = f + 1;    /* step over the leading '%' */
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*cur); cur++)
        width = (width*10) + *cur - '0';

    if (*cur == '.') {
        for (cur++; Py_ISDIGIT((unsigned)*cur); cur++)
            precision = (precision*10) + *cur - '0';
        if (*cur == '%') {
            /* "%.3%s" => cur points to "3" */
            cur--;
        }
    }
    if (*cur == '\0') {
        /* bogus format "%.1" => go backward, cur points to "1" */
        cur--;
    }

    /* Length modifiers: %ld, %lu, %li, the "ll" variants and the size_t
       flag "z". */
    switch (*cur) {
    case 'l':
        if (cur[1] == 'd' || cur[1] == 'u' || cur[1] == 'i') {
            longflag = 1;
            cur += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (cur[1] == 'l' &&
                 (cur[2] == 'd' || cur[2] == 'u' || cur[2] == 'i')) {
            longlongflag = 1;
            cur += 2;
        }
#endif
        break;
    case 'z':
        if (cur[1] == 'd' || cur[1] == 'u' || cur[1] == 'i') {
            size_tflag = 1;
            cur += 1;
        }
        break;
    default:
        break;
    }

    /* Hand the results back through whichever out-parameters were given. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return cur;
}
2369
2370/* maximum number of characters required for output of %ld.  21 characters
2371   allows for 64-bit integers (in decimal) and an optional sign. */
2372#define MAX_LONG_CHARS 21
2373/* maximum number of characters required for output of %lld.
2374   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2375   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2376#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2377
2378PyObject *
2379PyUnicode_FromFormatV(const char *format, va_list vargs)
2380{
2381    va_list count;
2382    Py_ssize_t callcount = 0;
2383    PyObject **callresults = NULL;
2384    PyObject **callresult = NULL;
2385    Py_ssize_t n = 0;
2386    int width = 0;
2387    int precision = 0;
2388    int zeropad;
2389    const char* f;
2390    PyObject *string;
2391    /* used by sprintf */
2392    char fmt[61]; /* should be enough for %0width.precisionlld */
2393    Py_UCS4 maxchar = 127; /* result is ASCII by default */
2394    Py_UCS4 argmaxchar;
2395    Py_ssize_t numbersize = 0;
2396    char *numberresults = NULL;
2397    char *numberresult = NULL;
2398    Py_ssize_t i;
2399    int kind;
2400    void *data;
2401
2402    Py_VA_COPY(count, vargs);
2403    /* step 1: count the number of %S/%R/%A/%s format specifications
2404     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2405     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2406     * result in an array)
2407     * also estimate a upper bound for all the number formats in the string,
2408     * numbers will be formatted in step 3 and be kept in a '\0'-separated
2409     * buffer before putting everything together. */
2410    for (f = format; *f; f++) {
2411        if (*f == '%') {
2412            int longlongflag;
2413            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2414            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2415            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2416                ++callcount;
2417
2418            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
2419#ifdef HAVE_LONG_LONG
2420                if (longlongflag) {
2421                    if (width < MAX_LONG_LONG_CHARS)
2422                        width = MAX_LONG_LONG_CHARS;
2423                }
2424                else
2425#endif
2426                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2427                       including sign.  Decimal takes the most space.  This
2428                       isn't enough for octal.  If a width is specified we
2429                       need more (which we allocate later). */
2430                    if (width < MAX_LONG_CHARS)
2431                        width = MAX_LONG_CHARS;
2432
2433                /* account for the size + '\0' to separate numbers
2434                   inside of the numberresults buffer */
2435                numbersize += (width + 1);
2436            }
2437        }
2438        else if ((unsigned char)*f > 127) {
2439            PyErr_Format(PyExc_ValueError,
2440                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2441                "string, got a non-ASCII byte: 0x%02x",
2442                (unsigned char)*f);
2443            return NULL;
2444        }
2445    }
2446    /* step 2: allocate memory for the results of
2447     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2448    if (callcount) {
2449        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2450        if (!callresults) {
2451            PyErr_NoMemory();
2452            return NULL;
2453        }
2454        callresult = callresults;
2455    }
2456    /* step 2.5: allocate memory for the results of formating numbers */
2457    if (numbersize) {
2458        numberresults = PyObject_Malloc(numbersize);
2459        if (!numberresults) {
2460            PyErr_NoMemory();
2461            goto fail;
2462        }
2463        numberresult = numberresults;
2464    }
2465
2466    /* step 3: format numbers and figure out how large a buffer we need */
2467    for (f = format; *f; f++) {
2468        if (*f == '%') {
2469            const char* p;
2470            int longflag;
2471            int longlongflag;
2472            int size_tflag;
2473            int numprinted;
2474
2475            p = f;
2476            zeropad = (f[1] == '0');
2477            f = parse_format_flags(f, &width, &precision,
2478                                   &longflag, &longlongflag, &size_tflag);
2479            switch (*f) {
2480            case 'c':
2481            {
2482                Py_UCS4 ordinal = va_arg(count, int);
2483                maxchar = MAX_MAXCHAR(maxchar, ordinal);
2484                n++;
2485                break;
2486            }
2487            case '%':
2488                n++;
2489                break;
2490            case 'i':
2491            case 'd':
2492                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2493                        width, precision, *f);
2494                if (longflag)
2495                    numprinted = sprintf(numberresult, fmt,
2496                                         va_arg(count, long));
2497#ifdef HAVE_LONG_LONG
2498                else if (longlongflag)
2499                    numprinted = sprintf(numberresult, fmt,
2500                                         va_arg(count, PY_LONG_LONG));
2501#endif
2502                else if (size_tflag)
2503                    numprinted = sprintf(numberresult, fmt,
2504                                         va_arg(count, Py_ssize_t));
2505                else
2506                    numprinted = sprintf(numberresult, fmt,
2507                                         va_arg(count, int));
2508                n += numprinted;
2509                /* advance by +1 to skip over the '\0' */
2510                numberresult += (numprinted + 1);
2511                assert(*(numberresult - 1) == '\0');
2512                assert(*(numberresult - 2) != '\0');
2513                assert(numprinted >= 0);
2514                assert(numberresult <= numberresults + numbersize);
2515                break;
2516            case 'u':
2517                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2518                        width, precision, 'u');
2519                if (longflag)
2520                    numprinted = sprintf(numberresult, fmt,
2521                                         va_arg(count, unsigned long));
2522#ifdef HAVE_LONG_LONG
2523                else if (longlongflag)
2524                    numprinted = sprintf(numberresult, fmt,
2525                                         va_arg(count, unsigned PY_LONG_LONG));
2526#endif
2527                else if (size_tflag)
2528                    numprinted = sprintf(numberresult, fmt,
2529                                         va_arg(count, size_t));
2530                else
2531                    numprinted = sprintf(numberresult, fmt,
2532                                         va_arg(count, unsigned int));
2533                n += numprinted;
2534                numberresult += (numprinted + 1);
2535                assert(*(numberresult - 1) == '\0');
2536                assert(*(numberresult - 2) != '\0');
2537                assert(numprinted >= 0);
2538                assert(numberresult <= numberresults + numbersize);
2539                break;
2540            case 'x':
2541                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2542                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2543                n += numprinted;
2544                numberresult += (numprinted + 1);
2545                assert(*(numberresult - 1) == '\0');
2546                assert(*(numberresult - 2) != '\0');
2547                assert(numprinted >= 0);
2548                assert(numberresult <= numberresults + numbersize);
2549                break;
2550            case 'p':
2551                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2552                /* %p is ill-defined:  ensure leading 0x. */
2553                if (numberresult[1] == 'X')
2554                    numberresult[1] = 'x';
2555                else if (numberresult[1] != 'x') {
2556                    memmove(numberresult + 2, numberresult,
2557                            strlen(numberresult) + 1);
2558                    numberresult[0] = '0';
2559                    numberresult[1] = 'x';
2560                    numprinted += 2;
2561                }
2562                n += numprinted;
2563                numberresult += (numprinted + 1);
2564                assert(*(numberresult - 1) == '\0');
2565                assert(*(numberresult - 2) != '\0');
2566                assert(numprinted >= 0);
2567                assert(numberresult <= numberresults + numbersize);
2568                break;
2569            case 's':
2570            {
2571                /* UTF-8 */
2572                const char *s = va_arg(count, const char*);
2573                PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2574                if (!str)
2575                    goto fail;
2576                /* since PyUnicode_DecodeUTF8 returns already flexible
2577                   unicode objects, there is no need to call ready on them */
2578                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2579                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2580                n += PyUnicode_GET_LENGTH(str);
2581                /* Remember the str and switch to the next slot */
2582                *callresult++ = str;
2583                break;
2584            }
2585            case 'U':
2586            {
2587                PyObject *obj = va_arg(count, PyObject *);
2588                assert(obj && _PyUnicode_CHECK(obj));
2589                if (PyUnicode_READY(obj) == -1)
2590                    goto fail;
2591                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2592                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2593                n += PyUnicode_GET_LENGTH(obj);
2594                break;
2595            }
2596            case 'V':
2597            {
2598                PyObject *obj = va_arg(count, PyObject *);
2599                const char *str = va_arg(count, const char *);
2600                PyObject *str_obj;
2601                assert(obj || str);
2602                assert(!obj || _PyUnicode_CHECK(obj));
2603                if (obj) {
2604                    if (PyUnicode_READY(obj) == -1)
2605                        goto fail;
2606                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2607                    maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2608                    n += PyUnicode_GET_LENGTH(obj);
2609                    *callresult++ = NULL;
2610                }
2611                else {
2612                    str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2613                    if (!str_obj)
2614                        goto fail;
2615                    if (PyUnicode_READY(str_obj) == -1) {
2616                        Py_DECREF(str_obj);
2617                        goto fail;
2618                    }
2619                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2620                    maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2621                    n += PyUnicode_GET_LENGTH(str_obj);
2622                    *callresult++ = str_obj;
2623                }
2624                break;
2625            }
2626            case 'S':
2627            {
2628                PyObject *obj = va_arg(count, PyObject *);
2629                PyObject *str;
2630                assert(obj);
2631                str = PyObject_Str(obj);
2632                if (!str)
2633                    goto fail;
2634                if (PyUnicode_READY(str) == -1) {
2635                    Py_DECREF(str);
2636                    goto fail;
2637                }
2638                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2639                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2640                n += PyUnicode_GET_LENGTH(str);
2641                /* Remember the str and switch to the next slot */
2642                *callresult++ = str;
2643                break;
2644            }
2645            case 'R':
2646            {
2647                PyObject *obj = va_arg(count, PyObject *);
2648                PyObject *repr;
2649                assert(obj);
2650                repr = PyObject_Repr(obj);
2651                if (!repr)
2652                    goto fail;
2653                if (PyUnicode_READY(repr) == -1) {
2654                    Py_DECREF(repr);
2655                    goto fail;
2656                }
2657                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2658                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2659                n += PyUnicode_GET_LENGTH(repr);
2660                /* Remember the repr and switch to the next slot */
2661                *callresult++ = repr;
2662                break;
2663            }
2664            case 'A':
2665            {
2666                PyObject *obj = va_arg(count, PyObject *);
2667                PyObject *ascii;
2668                assert(obj);
2669                ascii = PyObject_ASCII(obj);
2670                if (!ascii)
2671                    goto fail;
2672                if (PyUnicode_READY(ascii) == -1) {
2673                    Py_DECREF(ascii);
2674                    goto fail;
2675                }
2676                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2677                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2678                n += PyUnicode_GET_LENGTH(ascii);
2679                /* Remember the repr and switch to the next slot */
2680                *callresult++ = ascii;
2681                break;
2682            }
2683            default:
2684                /* if we stumble upon an unknown
2685                   formatting code, copy the rest of
2686                   the format string to the output
2687                   string. (we cannot just skip the
2688                   code, since there's no way to know
2689                   what's in the argument list) */
2690                n += strlen(p);
2691                goto expand;
2692            }
2693        } else
2694            n++;
2695    }
2696  expand:
2697    /* step 4: fill the buffer */
2698    /* Since we've analyzed how much space we need,
2699       we don't have to resize the string.
2700       There can be no errors beyond this point. */
2701    string = PyUnicode_New(n, maxchar);
2702    if (!string)
2703        goto fail;
2704    kind = PyUnicode_KIND(string);
2705    data = PyUnicode_DATA(string);
2706    callresult = callresults;
2707    numberresult = numberresults;
2708
2709    for (i = 0, f = format; *f; f++) {
2710        if (*f == '%') {
2711            const char* p;
2712
2713            p = f;
2714            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2715            /* checking for == because the last argument could be a empty
2716               string, which causes i to point to end, the assert at the end of
2717               the loop */
2718            assert(i <= PyUnicode_GET_LENGTH(string));
2719
2720            switch (*f) {
2721            case 'c':
2722            {
2723                const int ordinal = va_arg(vargs, int);
2724                PyUnicode_WRITE(kind, data, i++, ordinal);
2725                break;
2726            }
2727            case 'i':
2728            case 'd':
2729            case 'u':
2730            case 'x':
2731            case 'p':
2732            {
2733                Py_ssize_t written;
2734                /* unused, since we already have the result */
2735                if (*f == 'p')
2736                    (void) va_arg(vargs, void *);
2737                else
2738                    (void) va_arg(vargs, int);
2739                /* extract the result from numberresults and append. */
2740                written = unicode_write_cstr(string, i, numberresult);
2741                /* skip over the separating '\0' */
2742                i += written;
2743                numberresult += written;
2744                assert(*numberresult == '\0');
2745                numberresult++;
2746                assert(numberresult <= numberresults + numbersize);
2747                break;
2748            }
2749            case 's':
2750            {
2751                /* unused, since we already have the result */
2752                Py_ssize_t size;
2753                (void) va_arg(vargs, char *);
2754                size = PyUnicode_GET_LENGTH(*callresult);
2755                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2756                copy_characters(string, i, *callresult, 0, size);
2757                i += size;
2758                /* We're done with the unicode()/repr() => forget it */
2759                Py_DECREF(*callresult);
2760                /* switch to next unicode()/repr() result */
2761                ++callresult;
2762                break;
2763            }
2764            case 'U':
2765            {
2766                PyObject *obj = va_arg(vargs, PyObject *);
2767                Py_ssize_t size;
2768                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2769                size = PyUnicode_GET_LENGTH(obj);
2770                copy_characters(string, i, obj, 0, size);
2771                i += size;
2772                break;
2773            }
2774            case 'V':
2775            {
2776                Py_ssize_t size;
2777                PyObject *obj = va_arg(vargs, PyObject *);
2778                va_arg(vargs, const char *);
2779                if (obj) {
2780                    size = PyUnicode_GET_LENGTH(obj);
2781                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2782                    copy_characters(string, i, obj, 0, size);
2783                    i += size;
2784                } else {
2785                    size = PyUnicode_GET_LENGTH(*callresult);
2786                    assert(PyUnicode_KIND(*callresult) <=
2787                           PyUnicode_KIND(string));
2788                    copy_characters(string, i, *callresult, 0, size);
2789                    i += size;
2790                    Py_DECREF(*callresult);
2791                }
2792                ++callresult;
2793                break;
2794            }
2795            case 'S':
2796            case 'R':
2797            case 'A':
2798            {
2799                Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2800                /* unused, since we already have the result */
2801                (void) va_arg(vargs, PyObject *);
2802                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2803                copy_characters(string, i, *callresult, 0,  size);
2804                i += size;
2805                /* We're done with the unicode()/repr() => forget it */
2806                Py_DECREF(*callresult);
2807                /* switch to next unicode()/repr() result */
2808                ++callresult;
2809                break;
2810            }
2811            case '%':
2812                PyUnicode_WRITE(kind, data, i++, '%');
2813                break;
2814            default:
2815                i += unicode_write_cstr(string, i, p);
2816                assert(i == PyUnicode_GET_LENGTH(string));
2817                goto end;
2818            }
2819        }
2820        else {
2821            assert(i < PyUnicode_GET_LENGTH(string));
2822            PyUnicode_WRITE(kind, data, i++, *f);
2823        }
2824    }
2825    assert(i == PyUnicode_GET_LENGTH(string));
2826
2827  end:
2828    if (callresults)
2829        PyObject_Free(callresults);
2830    if (numberresults)
2831        PyObject_Free(numberresults);
2832    return unicode_result(string);
2833  fail:
2834    if (callresults) {
2835        PyObject **callresult2 = callresults;
2836        while (callresult2 < callresult) {
2837            Py_XDECREF(*callresult2);
2838            ++callresult2;
2839        }
2840        PyObject_Free(callresults);
2841    }
2842    if (numberresults)
2843        PyObject_Free(numberresults);
2844    return NULL;
2845}
2846
2847PyObject *
2848PyUnicode_FromFormat(const char *format, ...)
2849{
2850    PyObject* ret;
2851    va_list vargs;
2852
2853#ifdef HAVE_STDARG_PROTOTYPES
2854    va_start(vargs, format);
2855#else
2856    va_start(vargs);
2857#endif
2858    ret = PyUnicode_FromFormatV(format, vargs);
2859    va_end(vargs);
2860    return ret;
2861}
2862
2863#ifdef HAVE_WCHAR_H
2864
2865/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2866   convert a Unicode object to a wide character string.
2867
2868   - If w is NULL: return the number of wide characters (including the null
2869     character) required to convert the unicode object. Ignore size argument.
2870
2871   - Otherwise: return the number of wide characters (excluding the null
2872     character) written into w. Write at most size wide characters (including
2873     the null character). */
2874static Py_ssize_t
2875unicode_aswidechar(PyObject *unicode,
2876                   wchar_t *w,
2877                   Py_ssize_t size)
2878{
2879    Py_ssize_t res;
2880    const wchar_t *wstr;
2881
2882    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2883    if (wstr == NULL)
2884        return -1;
2885
2886    if (w != NULL) {
2887        if (size > res)
2888            size = res + 1;
2889        else
2890            res = size;
2891        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2892        return res;
2893    }
2894    else
2895        return res + 1;
2896}
2897
2898Py_ssize_t
2899PyUnicode_AsWideChar(PyObject *unicode,
2900                     wchar_t *w,
2901                     Py_ssize_t size)
2902{
2903    if (unicode == NULL) {
2904        PyErr_BadInternalCall();
2905        return -1;
2906    }
2907    return unicode_aswidechar(unicode, w, size);
2908}
2909
2910wchar_t*
2911PyUnicode_AsWideCharString(PyObject *unicode,
2912                           Py_ssize_t *size)
2913{
2914    wchar_t* buffer;
2915    Py_ssize_t buflen;
2916
2917    if (unicode == NULL) {
2918        PyErr_BadInternalCall();
2919        return NULL;
2920    }
2921
2922    buflen = unicode_aswidechar(unicode, NULL, 0);
2923    if (buflen == -1)
2924        return NULL;
2925    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2926        PyErr_NoMemory();
2927        return NULL;
2928    }
2929
2930    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2931    if (buffer == NULL) {
2932        PyErr_NoMemory();
2933        return NULL;
2934    }
2935    buflen = unicode_aswidechar(unicode, buffer, buflen);
2936    if (buflen == -1)
2937        return NULL;
2938    if (size != NULL)
2939        *size = buflen;
2940    return buffer;
2941}
2942
2943#endif /* HAVE_WCHAR_H */
2944
2945PyObject *
2946PyUnicode_FromOrdinal(int ordinal)
2947{
2948    PyObject *v;
2949    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2950        PyErr_SetString(PyExc_ValueError,
2951                        "chr() arg not in range(0x110000)");
2952        return NULL;
2953    }
2954
2955    if (ordinal < 256)
2956        return get_latin1_char(ordinal);
2957
2958    v = PyUnicode_New(1, ordinal);
2959    if (v == NULL)
2960        return NULL;
2961    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2962    assert(_PyUnicode_CheckConsistency(v, 1));
2963    return v;
2964}
2965
2966PyObject *
2967PyUnicode_FromObject(register PyObject *obj)
2968{
2969    /* XXX Perhaps we should make this API an alias of
2970       PyObject_Str() instead ?! */
2971    if (PyUnicode_CheckExact(obj)) {
2972        if (PyUnicode_READY(obj) == -1)
2973            return NULL;
2974        Py_INCREF(obj);
2975        return obj;
2976    }
2977    if (PyUnicode_Check(obj)) {
2978        /* For a Unicode subtype that's not a Unicode object,
2979           return a true Unicode object with the same data. */
2980        return _PyUnicode_Copy(obj);
2981    }
2982    PyErr_Format(PyExc_TypeError,
2983                 "Can't convert '%.100s' object to str implicitly",
2984                 Py_TYPE(obj)->tp_name);
2985    return NULL;
2986}
2987
2988PyObject *
2989PyUnicode_FromEncodedObject(register PyObject *obj,
2990                            const char *encoding,
2991                            const char *errors)
2992{
2993    Py_buffer buffer;
2994    PyObject *v;
2995
2996    if (obj == NULL) {
2997        PyErr_BadInternalCall();
2998        return NULL;
2999    }
3000
3001    /* Decoding bytes objects is the most common case and should be fast */
3002    if (PyBytes_Check(obj)) {
3003        if (PyBytes_GET_SIZE(obj) == 0) {
3004            Py_INCREF(unicode_empty);
3005            v = unicode_empty;
3006        }
3007        else {
3008            v = PyUnicode_Decode(
3009                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3010                    encoding, errors);
3011        }
3012        return v;
3013    }
3014
3015    if (PyUnicode_Check(obj)) {
3016        PyErr_SetString(PyExc_TypeError,
3017                        "decoding str is not supported");
3018        return NULL;
3019    }
3020
3021    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3022    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3023        PyErr_Format(PyExc_TypeError,
3024                     "coercing to str: need bytes, bytearray "
3025                     "or buffer-like object, %.80s found",
3026                     Py_TYPE(obj)->tp_name);
3027        return NULL;
3028    }
3029
3030    if (buffer.len == 0) {
3031        Py_INCREF(unicode_empty);
3032        v = unicode_empty;
3033    }
3034    else
3035        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3036
3037    PyBuffer_Release(&buffer);
3038    return v;
3039}
3040
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower` is the output buffer and `lower_len` its capacity in bytes,
   terminating null byte included.

   Return 1 on success, 0 on error (the normalized name, including its
   terminating null byte, does not fit into lower_len bytes).  On error the
   contents of `lower` are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* An empty buffer cannot even hold the terminator; rejecting it here
       also keeps `lower_len - 1` below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Honour the buffer size for the default name as well. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3077
3078PyObject *
3079PyUnicode_Decode(const char *s,
3080                 Py_ssize_t size,
3081                 const char *encoding,
3082                 const char *errors)
3083{
3084    PyObject *buffer = NULL, *unicode;
3085    Py_buffer info;
3086    char lower[11];  /* Enough for any encoding shortcut */
3087
3088    /* Shortcuts for common default encodings */
3089    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3090        if ((strcmp(lower, "utf-8") == 0) ||
3091            (strcmp(lower, "utf8") == 0))
3092            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3093        else if ((strcmp(lower, "latin-1") == 0) ||
3094                 (strcmp(lower, "latin1") == 0) ||
3095                 (strcmp(lower, "iso-8859-1") == 0))
3096            return PyUnicode_DecodeLatin1(s, size, errors);
3097#ifdef HAVE_MBCS
3098        else if (strcmp(lower, "mbcs") == 0)
3099            return PyUnicode_DecodeMBCS(s, size, errors);
3100#endif
3101        else if (strcmp(lower, "ascii") == 0)
3102            return PyUnicode_DecodeASCII(s, size, errors);
3103        else if (strcmp(lower, "utf-16") == 0)
3104            return PyUnicode_DecodeUTF16(s, size, errors, 0);
3105        else if (strcmp(lower, "utf-32") == 0)
3106            return PyUnicode_DecodeUTF32(s, size, errors, 0);
3107    }
3108
3109    /* Decode via the codec registry */
3110    buffer = NULL;
3111    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3112        goto onError;
3113    buffer = PyMemoryView_FromBuffer(&info);
3114    if (buffer == NULL)
3115        goto onError;
3116    unicode = PyCodec_Decode(buffer, encoding, errors);
3117    if (unicode == NULL)
3118        goto onError;
3119    if (!PyUnicode_Check(unicode)) {
3120        PyErr_Format(PyExc_TypeError,
3121                     "decoder did not return a str object (type=%.400s)",
3122                     Py_TYPE(unicode)->tp_name);
3123        Py_DECREF(unicode);
3124        goto onError;
3125    }
3126    Py_DECREF(buffer);
3127    return unicode_result(unicode);
3128
3129  onError:
3130    Py_XDECREF(buffer);
3131    return NULL;
3132}
3133
3134PyObject *
3135PyUnicode_AsDecodedObject(PyObject *unicode,
3136                          const char *encoding,
3137                          const char *errors)
3138{
3139    PyObject *v;
3140
3141    if (!PyUnicode_Check(unicode)) {
3142        PyErr_BadArgument();
3143        goto onError;
3144    }
3145
3146    if (encoding == NULL)
3147        encoding = PyUnicode_GetDefaultEncoding();
3148
3149    /* Decode via the codec registry */
3150    v = PyCodec_Decode(unicode, encoding, errors);
3151    if (v == NULL)
3152        goto onError;
3153    return unicode_result(v);
3154
3155  onError:
3156    return NULL;
3157}
3158
3159PyObject *
3160PyUnicode_AsDecodedUnicode(PyObject *unicode,
3161                           const char *encoding,
3162                           const char *errors)
3163{
3164    PyObject *v;
3165
3166    if (!PyUnicode_Check(unicode)) {
3167        PyErr_BadArgument();
3168        goto onError;
3169    }
3170
3171    if (encoding == NULL)
3172        encoding = PyUnicode_GetDefaultEncoding();
3173
3174    /* Decode via the codec registry */
3175    v = PyCodec_Decode(unicode, encoding, errors);
3176    if (v == NULL)
3177        goto onError;
3178    if (!PyUnicode_Check(v)) {
3179        PyErr_Format(PyExc_TypeError,
3180                     "decoder did not return a str object (type=%.400s)",
3181                     Py_TYPE(v)->tp_name);
3182        Py_DECREF(v);
3183        goto onError;
3184    }
3185    return unicode_result(v);
3186
3187  onError:
3188    return NULL;
3189}
3190
3191PyObject *
3192PyUnicode_Encode(const Py_UNICODE *s,
3193                 Py_ssize_t size,
3194                 const char *encoding,
3195                 const char *errors)
3196{
3197    PyObject *v, *unicode;
3198
3199    unicode = PyUnicode_FromUnicode(s, size);
3200    if (unicode == NULL)
3201        return NULL;
3202    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3203    Py_DECREF(unicode);
3204    return v;
3205}
3206
3207PyObject *
3208PyUnicode_AsEncodedObject(PyObject *unicode,
3209                          const char *encoding,
3210                          const char *errors)
3211{
3212    PyObject *v;
3213
3214    if (!PyUnicode_Check(unicode)) {
3215        PyErr_BadArgument();
3216        goto onError;
3217    }
3218
3219    if (encoding == NULL)
3220        encoding = PyUnicode_GetDefaultEncoding();
3221
3222    /* Encode via the codec registry */
3223    v = PyCodec_Encode(unicode, encoding, errors);
3224    if (v == NULL)
3225        goto onError;
3226    return v;
3227
3228  onError:
3229    return NULL;
3230}
3231
/* Locate the first wide character of the null-terminated string `wstr` that
   wcstombs() refuses to convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  The return value is the
   index, in wchar_t units, of the first character for which wcstombs()
   reports an error, or 0 when every character converts.  Note that 0 is
   therefore also returned when the very first character is the culprit. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor = wstr;
    char scratch[MB_LEN_MAX];
#if SIZEOF_WCHAR_T == 2
    /* Room for a surrogate pair plus the terminator. */
    wchar_t unit[3] = {0, 0, 0};
#else
    /* Room for one character plus the terminator. */
    wchar_t unit[2] = {0, 0};
#endif

    while (*cursor != L'\0') {
        /* Remember where the current character starts. */
        const wchar_t *candidate = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor++;
            unit[1] = 0;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return candidate - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3278
3279static int
3280locale_error_handler(const char *errors, int *surrogateescape)
3281{
3282    if (errors == NULL) {
3283        *surrogateescape = 0;
3284        return 0;
3285    }
3286
3287    if (strcmp(errors, "strict") == 0) {
3288        *surrogateescape = 0;
3289        return 0;
3290    }
3291    if (strcmp(errors, "surrogateescape") == 0) {
3292        *surrogateescape = 1;
3293        return 0;
3294    }
3295    PyErr_Format(PyExc_ValueError,
3296                 "only 'strict' and 'surrogateescape' error handlers "
3297                 "are supported, not '%s'",
3298                 errors);
3299    return -1;
3300}
3301
3302PyObject *
3303PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3304{
3305    Py_ssize_t wlen, wlen2;
3306    wchar_t *wstr;
3307    PyObject *bytes = NULL;
3308    char *errmsg;
3309    PyObject *reason;
3310    PyObject *exc;
3311    size_t error_pos;
3312    int surrogateescape;
3313
3314    if (locale_error_handler(errors, &surrogateescape) < 0)
3315        return NULL;
3316
3317    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3318    if (wstr == NULL)
3319        return NULL;
3320
3321    wlen2 = wcslen(wstr);
3322    if (wlen2 != wlen) {
3323        PyMem_Free(wstr);
3324        PyErr_SetString(PyExc_TypeError, "embedded null character");
3325        return NULL;
3326    }
3327
3328    if (surrogateescape) {
3329        /* locale encoding with surrogateescape */
3330        char *str;
3331
3332        str = _Py_wchar2char(wstr, &error_pos);
3333        if (str == NULL) {
3334            if (error_pos == (size_t)-1) {
3335                PyErr_NoMemory();
3336                PyMem_Free(wstr);
3337                return NULL;
3338            }
3339            else {
3340                goto encode_error;
3341            }
3342        }
3343        PyMem_Free(wstr);
3344
3345        bytes = PyBytes_FromString(str);
3346        PyMem_Free(str);
3347    }
3348    else {
3349        size_t len, len2;
3350
3351        len = wcstombs(NULL, wstr, 0);
3352        if (len == (size_t)-1) {
3353            error_pos = (size_t)-1;
3354            goto encode_error;
3355        }
3356
3357        bytes = PyBytes_FromStringAndSize(NULL, len);
3358        if (bytes == NULL) {
3359            PyMem_Free(wstr);
3360            return NULL;
3361        }
3362
3363        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3364        if (len2 == (size_t)-1 || len2 > len) {
3365            error_pos = (size_t)-1;
3366            goto encode_error;
3367        }
3368        PyMem_Free(wstr);
3369    }
3370    return bytes;
3371
3372encode_error:
3373    errmsg = strerror(errno);
3374    assert(errmsg != NULL);
3375
3376    if (error_pos == (size_t)-1)
3377        error_pos = wcstombs_errorpos(wstr);
3378
3379    PyMem_Free(wstr);
3380    Py_XDECREF(bytes);
3381
3382    if (errmsg != NULL) {
3383        size_t errlen;
3384        wstr = _Py_char2wchar(errmsg, &errlen);
3385        if (wstr != NULL) {
3386            reason = PyUnicode_FromWideChar(wstr, errlen);
3387            PyMem_Free(wstr);
3388        } else
3389            errmsg = NULL;
3390    }
3391    if (errmsg == NULL)
3392        reason = PyUnicode_FromString(
3393            "wcstombs() encountered an unencodable "
3394            "wide character");
3395    if (reason == NULL)
3396        return NULL;
3397
3398    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3399                                "locale", unicode,
3400                                (Py_ssize_t)error_pos,
3401                                (Py_ssize_t)(error_pos+1),
3402                                reason);
3403    Py_DECREF(reason);
3404    if (exc != NULL) {
3405        PyCodec_StrictErrors(exc);
3406        Py_XDECREF(exc);
3407    }
3408    return NULL;
3409}
3410
3411PyObject *
3412PyUnicode_EncodeFSDefault(PyObject *unicode)
3413{
3414#ifdef HAVE_MBCS
3415    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3416#elif defined(__APPLE__)
3417    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3418#else
3419    PyInterpreterState *interp = PyThreadState_GET()->interp;
3420    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3421       cannot use it to encode and decode filenames before it is loaded. Load
3422       the Python codec requires to encode at least its own filename. Use the C
3423       version of the locale codec until the codec registry is initialized and
3424       the Python codec is loaded.
3425
3426       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3427       cannot only rely on it: check also interp->fscodec_initialized for
3428       subinterpreters. */
3429    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3430        return PyUnicode_AsEncodedString(unicode,
3431                                         Py_FileSystemDefaultEncoding,
3432                                         "surrogateescape");
3433    }
3434    else {
3435        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3436    }
3437#endif
3438}
3439
3440PyObject *
3441PyUnicode_AsEncodedString(PyObject *unicode,
3442                          const char *encoding,
3443                          const char *errors)
3444{
3445    PyObject *v;
3446    char lower[11];  /* Enough for any encoding shortcut */
3447
3448    if (!PyUnicode_Check(unicode)) {
3449        PyErr_BadArgument();
3450        return NULL;
3451    }
3452
3453    /* Shortcuts for common default encodings */
3454    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3455        if ((strcmp(lower, "utf-8") == 0) ||
3456            (strcmp(lower, "utf8") == 0))
3457        {
3458            if (errors == NULL || strcmp(errors, "strict") == 0)
3459                return _PyUnicode_AsUTF8String(unicode, NULL);
3460            else
3461                return _PyUnicode_AsUTF8String(unicode, errors);
3462        }
3463        else if ((strcmp(lower, "latin-1") == 0) ||
3464                 (strcmp(lower, "latin1") == 0) ||
3465                 (strcmp(lower, "iso-8859-1") == 0))
3466            return _PyUnicode_AsLatin1String(unicode, errors);
3467#ifdef HAVE_MBCS
3468        else if (strcmp(lower, "mbcs") == 0)
3469            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3470#endif
3471        else if (strcmp(lower, "ascii") == 0)
3472            return _PyUnicode_AsASCIIString(unicode, errors);
3473    }
3474
3475    /* Encode via the codec registry */
3476    v = PyCodec_Encode(unicode, encoding, errors);
3477    if (v == NULL)
3478        return NULL;
3479
3480    /* The normal path */
3481    if (PyBytes_Check(v))
3482        return v;
3483
3484    /* If the codec returns a buffer, raise a warning and convert to bytes */
3485    if (PyByteArray_Check(v)) {
3486        int error;
3487        PyObject *b;
3488
3489        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3490            "encoder %s returned bytearray instead of bytes",
3491            encoding);
3492        if (error) {
3493            Py_DECREF(v);
3494            return NULL;
3495        }
3496
3497        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3498        Py_DECREF(v);
3499        return b;
3500    }
3501
3502    PyErr_Format(PyExc_TypeError,
3503                 "encoder did not return a bytes object (type=%.400s)",
3504                 Py_TYPE(v)->tp_name);
3505    Py_DECREF(v);
3506    return NULL;
3507}
3508
3509PyObject *
3510PyUnicode_AsEncodedUnicode(PyObject *unicode,
3511                           const char *encoding,
3512                           const char *errors)
3513{
3514    PyObject *v;
3515
3516    if (!PyUnicode_Check(unicode)) {
3517        PyErr_BadArgument();
3518        goto onError;
3519    }
3520
3521    if (encoding == NULL)
3522        encoding = PyUnicode_GetDefaultEncoding();
3523
3524    /* Encode via the codec registry */
3525    v = PyCodec_Encode(unicode, encoding, errors);
3526    if (v == NULL)
3527        goto onError;
3528    if (!PyUnicode_Check(v)) {
3529        PyErr_Format(PyExc_TypeError,
3530                     "encoder did not return an str object (type=%.400s)",
3531                     Py_TYPE(v)->tp_name);
3532        Py_DECREF(v);
3533        goto onError;
3534    }
3535    return v;
3536
3537  onError:
3538    return NULL;
3539}
3540
/* Locate the first byte sequence in the `len` bytes at `str` that cannot be
   decoded as a multibyte character.

   When HAVE_MBRTOWC is defined the input is decoded one character at a time
   with mbrtowc(); the return value is the byte offset at which mbrtowc()
   reports an invalid or incomplete sequence, or 0 when none is found.
   Without HAVE_MBRTOWC the position cannot be determined and 0 is always
   returned.  Note that 0 is also the answer when the very first byte is the
   culprit. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, (char*)cursor, remaining, &state);

        /* Reached end of string */
        if (consumed == 0)
            break;
        /* Conversion error or incomplete character */
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            return cursor - str;

        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3571
3572PyObject*
3573PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3574                              const char *errors)
3575{
3576    wchar_t smallbuf[256];
3577    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3578    wchar_t *wstr;
3579    size_t wlen, wlen2;
3580    PyObject *unicode;
3581    int surrogateescape;
3582    size_t error_pos;
3583    char *errmsg;
3584    PyObject *reason, *exc;
3585
3586    if (locale_error_handler(errors, &surrogateescape) < 0)
3587        return NULL;
3588
3589    if (str[len] != '\0' || len != strlen(str)) {
3590        PyErr_SetString(PyExc_TypeError, "embedded null character");
3591        return NULL;
3592    }
3593
3594    if (surrogateescape)
3595    {
3596        wstr = _Py_char2wchar(str, &wlen);
3597        if (wstr == NULL) {
3598            if (wlen == (size_t)-1)
3599                PyErr_NoMemory();
3600            else
3601                PyErr_SetFromErrno(PyExc_OSError);
3602            return NULL;
3603        }
3604
3605        unicode = PyUnicode_FromWideChar(wstr, wlen);
3606        PyMem_Free(wstr);
3607    }
3608    else {
3609#ifndef HAVE_BROKEN_MBSTOWCS
3610        wlen = mbstowcs(NULL, str, 0);
3611#else
3612        wlen = len;
3613#endif
3614        if (wlen == (size_t)-1)
3615            goto decode_error;
3616        if (wlen+1 <= smallbuf_len) {
3617            wstr = smallbuf;
3618        }
3619        else {
3620            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3621                return PyErr_NoMemory();
3622
3623            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3624            if (!wstr)
3625                return PyErr_NoMemory();
3626        }
3627
3628        /* This shouldn't fail now */
3629        wlen2 = mbstowcs(wstr, str, wlen+1);
3630        if (wlen2 == (size_t)-1) {
3631            if (wstr != smallbuf)
3632                PyMem_Free(wstr);
3633            goto decode_error;
3634        }
3635#ifdef HAVE_BROKEN_MBSTOWCS
3636        assert(wlen2 == wlen);
3637#endif
3638        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3639        if (wstr != smallbuf)
3640            PyMem_Free(wstr);
3641    }
3642    return unicode;
3643
3644decode_error:
3645    errmsg = strerror(errno);
3646    assert(errmsg != NULL);
3647
3648    error_pos = mbstowcs_errorpos(str, len);
3649    if (errmsg != NULL) {
3650        size_t errlen;
3651        wstr = _Py_char2wchar(errmsg, &errlen);
3652        if (wstr != NULL) {
3653            reason = PyUnicode_FromWideChar(wstr, errlen);
3654            PyMem_Free(wstr);
3655        } else
3656            errmsg = NULL;
3657    }
3658    if (errmsg == NULL)
3659        reason = PyUnicode_FromString(
3660            "mbstowcs() encountered an invalid multibyte sequence");
3661    if (reason == NULL)
3662        return NULL;
3663
3664    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3665                                "locale", str, len,
3666                                (Py_ssize_t)error_pos,
3667                                (Py_ssize_t)(error_pos+1),
3668                                reason);
3669    Py_DECREF(reason);
3670    if (exc != NULL) {
3671        PyCodec_StrictErrors(exc);
3672        Py_XDECREF(exc);
3673    }
3674    return NULL;
3675}
3676
3677PyObject*
3678PyUnicode_DecodeLocale(const char *str, const char *errors)
3679{
3680    Py_ssize_t size = (Py_ssize_t)strlen(str);
3681    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3682}
3683
3684
3685PyObject*
3686PyUnicode_DecodeFSDefault(const char *s) {
3687    Py_ssize_t size = (Py_ssize_t)strlen(s);
3688    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3689}
3690
3691PyObject*
3692PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3693{
3694#ifdef HAVE_MBCS
3695    return PyUnicode_DecodeMBCS(s, size, NULL);
3696#elif defined(__APPLE__)
3697    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3698#else
3699    PyInterpreterState *interp = PyThreadState_GET()->interp;
3700    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3701       cannot use it to encode and decode filenames before it is loaded. Load
3702       the Python codec requires to encode at least its own filename. Use the C
3703       version of the locale codec until the codec registry is initialized and
3704       the Python codec is loaded.
3705
3706       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3707       cannot only rely on it: check also interp->fscodec_initialized for
3708       subinterpreters. */
3709    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3710        return PyUnicode_Decode(s, size,
3711                                Py_FileSystemDefaultEncoding,
3712                                "surrogateescape");
3713    }
3714    else {
3715        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3716    }
3717#endif
3718}
3719
3720
3721int
3722_PyUnicode_HasNULChars(PyObject* s)
3723{
3724    static PyObject *nul = NULL;
3725
3726    if (nul == NULL)
3727        nul = PyUnicode_FromStringAndSize("\0", 1);
3728    if (nul == NULL)
3729        return -1;
3730    return PyUnicode_Contains(s, nul);
3731}
3732
3733
3734int
3735PyUnicode_FSConverter(PyObject* arg, void* addr)
3736{
3737    PyObject *output = NULL;
3738    Py_ssize_t size;
3739    void *data;
3740    if (arg == NULL) {
3741        Py_DECREF(*(PyObject**)addr);
3742        return 1;
3743    }
3744    if (PyBytes_Check(arg)) {
3745        output = arg;
3746        Py_INCREF(output);
3747    }
3748    else {
3749        arg = PyUnicode_FromObject(arg);
3750        if (!arg)
3751            return 0;
3752        output = PyUnicode_EncodeFSDefault(arg);
3753        Py_DECREF(arg);
3754        if (!output)
3755            return 0;
3756        if (!PyBytes_Check(output)) {
3757            Py_DECREF(output);
3758            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3759            return 0;
3760        }
3761    }
3762    size = PyBytes_GET_SIZE(output);
3763    data = PyBytes_AS_STRING(output);
3764    if (size != strlen(data)) {
3765        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3766        Py_DECREF(output);
3767        return 0;
3768    }
3769    *(PyObject**)addr = output;
3770    return Py_CLEANUP_SUPPORTED;
3771}
3772
3773
3774int
3775PyUnicode_FSDecoder(PyObject* arg, void* addr)
3776{
3777    PyObject *output = NULL;
3778    if (arg == NULL) {
3779        Py_DECREF(*(PyObject**)addr);
3780        return 1;
3781    }
3782    if (PyUnicode_Check(arg)) {
3783        if (PyUnicode_READY(arg) == -1)
3784            return 0;
3785        output = arg;
3786        Py_INCREF(output);
3787    }
3788    else {
3789        arg = PyBytes_FromObject(arg);
3790        if (!arg)
3791            return 0;
3792        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3793                                                  PyBytes_GET_SIZE(arg));
3794        Py_DECREF(arg);
3795        if (!output)
3796            return 0;
3797        if (!PyUnicode_Check(output)) {
3798            Py_DECREF(output);
3799            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3800            return 0;
3801        }
3802    }
3803    if (PyUnicode_READY(output) == -1) {
3804        Py_DECREF(output);
3805        return 0;
3806    }
3807    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3808                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3809        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3810        Py_DECREF(output);
3811        return 0;
3812    }
3813    *(PyObject**)addr = output;
3814    return Py_CLEANUP_SUPPORTED;
3815}
3816
3817
3818char*
3819PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3820{
3821    PyObject *bytes;
3822
3823    if (!PyUnicode_Check(unicode)) {
3824        PyErr_BadArgument();
3825        return NULL;
3826    }
3827    if (PyUnicode_READY(unicode) == -1)
3828        return NULL;
3829
3830    if (PyUnicode_UTF8(unicode) == NULL) {
3831        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3832        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3833        if (bytes == NULL)
3834            return NULL;
3835        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3836        if (_PyUnicode_UTF8(unicode) == NULL) {
3837            Py_DECREF(bytes);
3838            return NULL;
3839        }
3840        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3841        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3842                  PyBytes_AS_STRING(bytes),
3843                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3844        Py_DECREF(bytes);
3845    }
3846
3847    if (psize)
3848        *psize = PyUnicode_UTF8_LENGTH(unicode);
3849    return PyUnicode_UTF8(unicode);
3850}
3851
3852char*
3853PyUnicode_AsUTF8(PyObject *unicode)
3854{
3855    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3856}
3857
3858#ifdef Py_DEBUG
3859static int unicode_as_unicode_calls = 0;
3860#endif
3861
3862
3863Py_UNICODE *
3864PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3865{
3866    const unsigned char *one_byte;
3867#if SIZEOF_WCHAR_T == 4
3868    const Py_UCS2 *two_bytes;
3869#else
3870    const Py_UCS4 *four_bytes;
3871    const Py_UCS4 *ucs4_end;
3872    Py_ssize_t num_surrogates;
3873#endif
3874    wchar_t *w;
3875    wchar_t *wchar_end;
3876
3877    if (!PyUnicode_Check(unicode)) {
3878        PyErr_BadArgument();
3879        return NULL;
3880    }
3881    if (_PyUnicode_WSTR(unicode) == NULL) {
3882        /* Non-ASCII compact unicode object */
3883        assert(_PyUnicode_KIND(unicode) != 0);
3884        assert(PyUnicode_IS_READY(unicode));
3885
3886#ifdef Py_DEBUG
3887        ++unicode_as_unicode_calls;
3888#endif
3889
3890        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3891#if SIZEOF_WCHAR_T == 2
3892            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3893            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3894            num_surrogates = 0;
3895
3896            for (; four_bytes < ucs4_end; ++four_bytes) {
3897                if (*four_bytes > 0xFFFF)
3898                    ++num_surrogates;
3899            }
3900
3901            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3902                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3903            if (!_PyUnicode_WSTR(unicode)) {
3904                PyErr_NoMemory();
3905                return NULL;
3906            }
3907            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3908
3909            w = _PyUnicode_WSTR(unicode);
3910            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3911            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3912            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3913                if (*four_bytes > 0xFFFF) {
3914                    assert(*four_bytes <= MAX_UNICODE);
3915                    /* encode surrogate pair in this case */
3916                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3917                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3918                }
3919                else
3920                    *w = *four_bytes;
3921
3922                if (w > wchar_end) {
3923                    assert(0 && "Miscalculated string end");
3924                }
3925            }
3926            *w = 0;
3927#else
3928            /* sizeof(wchar_t) == 4 */
3929            Py_FatalError("Impossible unicode object state, wstr and str "
3930                          "should share memory already.");
3931            return NULL;
3932#endif
3933        }
3934        else {
3935            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3936                                                  (_PyUnicode_LENGTH(unicode) + 1));
3937            if (!_PyUnicode_WSTR(unicode)) {
3938                PyErr_NoMemory();
3939                return NULL;
3940            }
3941            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3942                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3943            w = _PyUnicode_WSTR(unicode);
3944            wchar_end = w + _PyUnicode_LENGTH(unicode);
3945
3946            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3947                one_byte = PyUnicode_1BYTE_DATA(unicode);
3948                for (; w < wchar_end; ++one_byte, ++w)
3949                    *w = *one_byte;
3950                /* null-terminate the wstr */
3951                *w = 0;
3952            }
3953            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3954#if SIZEOF_WCHAR_T == 4
3955                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3956                for (; w < wchar_end; ++two_bytes, ++w)
3957                    *w = *two_bytes;
3958                /* null-terminate the wstr */
3959                *w = 0;
3960#else
3961                /* sizeof(wchar_t) == 2 */
3962                PyObject_FREE(_PyUnicode_WSTR(unicode));
3963                _PyUnicode_WSTR(unicode) = NULL;
3964                Py_FatalError("Impossible unicode object state, wstr "
3965                              "and str should share memory already.");
3966                return NULL;
3967#endif
3968            }
3969            else {
3970                assert(0 && "This should never happen.");
3971            }
3972        }
3973    }
3974    if (size != NULL)
3975        *size = PyUnicode_WSTR_LENGTH(unicode);
3976    return _PyUnicode_WSTR(unicode);
3977}
3978
3979Py_UNICODE *
3980PyUnicode_AsUnicode(PyObject *unicode)
3981{
3982    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3983}
3984
3985
3986Py_ssize_t
3987PyUnicode_GetSize(PyObject *unicode)
3988{
3989    if (!PyUnicode_Check(unicode)) {
3990        PyErr_BadArgument();
3991        goto onError;
3992    }
3993    return PyUnicode_GET_SIZE(unicode);
3994
3995  onError:
3996    return -1;
3997}
3998
3999Py_ssize_t
4000PyUnicode_GetLength(PyObject *unicode)
4001{
4002    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4003        PyErr_BadArgument();
4004        return -1;
4005    }
4006
4007    return PyUnicode_GET_LENGTH(unicode);
4008}
4009
4010Py_UCS4
4011PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4012{
4013    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4014        PyErr_BadArgument();
4015        return (Py_UCS4)-1;
4016    }
4017    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4018        PyErr_SetString(PyExc_IndexError, "string index out of range");
4019        return (Py_UCS4)-1;
4020    }
4021    return PyUnicode_READ_CHAR(unicode, index);
4022}
4023
4024int
4025PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4026{
4027    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4028        PyErr_BadArgument();
4029        return -1;
4030    }
4031    assert(PyUnicode_IS_READY(unicode));
4032    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4033        PyErr_SetString(PyExc_IndexError, "string index out of range");
4034        return -1;
4035    }
4036    if (unicode_check_modifiable(unicode))
4037        return -1;
4038    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4039        PyErr_SetString(PyExc_ValueError, "character out of range");
4040        return -1;
4041    }
4042    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4043                    index, ch);
4044    return 0;
4045}
4046
/* Return the name of the default encoding: always the constant string
   "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4052
4053/* create or adjust a UnicodeDecodeError */
4054static void
4055make_decode_exception(PyObject **exceptionObject,
4056                      const char *encoding,
4057                      const char *input, Py_ssize_t length,
4058                      Py_ssize_t startpos, Py_ssize_t endpos,
4059                      const char *reason)
4060{
4061    if (*exceptionObject == NULL) {
4062        *exceptionObject = PyUnicodeDecodeError_Create(
4063            encoding, input, length, startpos, endpos, reason);
4064    }
4065    else {
4066        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4067            goto onError;
4068        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4069            goto onError;
4070        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4071            goto onError;
4072    }
4073    return;
4074
4075onError:
4076    Py_DECREF(*exceptionObject);
4077    *exceptionObject = NULL;
4078}
4079
4080/* error handling callback helper:
4081   build arguments, call the callback and check the arguments,
4082   if no exception occurred, copy the replacement to the output
4083   and adjust various state variables.
4084   return 0 on success, -1 on error
4085*/
4086
4087static int
4088unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
4089                                 const char *encoding, const char *reason,
4090                                 const char **input, const char **inend, Py_ssize_t *startinpos,
4091                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4092                                 PyObject **output, Py_ssize_t *outpos)
4093{
4094    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4095
4096    PyObject *restuple = NULL;
4097    PyObject *repunicode = NULL;
4098    Py_ssize_t outsize;
4099    Py_ssize_t insize;
4100    Py_ssize_t requiredsize;
4101    Py_ssize_t newpos;
4102    PyObject *inputobj = NULL;
4103    int res = -1;
4104
4105    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4106        outsize = PyUnicode_GET_LENGTH(*output);
4107    else
4108        outsize = _PyUnicode_WSTR_LENGTH(*output);
4109
4110    if (*errorHandler == NULL) {
4111        *errorHandler = PyCodec_LookupError(errors);
4112        if (*errorHandler == NULL)
4113            goto onError;
4114    }
4115
4116    make_decode_exception(exceptionObject,
4117        encoding,
4118        *input, *inend - *input,
4119        *startinpos, *endinpos,
4120        reason);
4121    if (*exceptionObject == NULL)
4122        goto onError;
4123
4124    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4125    if (restuple == NULL)
4126        goto onError;
4127    if (!PyTuple_Check(restuple)) {
4128        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4129        goto onError;
4130    }
4131    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4132        goto onError;
4133    if (PyUnicode_READY(repunicode) == -1)
4134        goto onError;
4135
4136    /* Copy back the bytes variables, which might have been modified by the
4137       callback */
4138    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4139    if (!inputobj)
4140        goto onError;
4141    if (!PyBytes_Check(inputobj)) {
4142        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4143    }
4144    *input = PyBytes_AS_STRING(inputobj);
4145    insize = PyBytes_GET_SIZE(inputobj);
4146    *inend = *input + insize;
4147    /* we can DECREF safely, as the exception has another reference,
4148       so the object won't go away. */
4149    Py_DECREF(inputobj);
4150
4151    if (newpos<0)
4152        newpos = insize+newpos;
4153    if (newpos<0 || newpos>insize) {
4154        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4155        goto onError;
4156    }
4157
4158    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4159        /* need more space? (at least enough for what we
4160           have+the replacement+the rest of the string (starting
4161           at the new input position), so we won't have to check space
4162           when there are no errors in the rest of the string) */
4163        Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4164        requiredsize = *outpos + replen + insize-newpos;
4165        if (requiredsize > outsize) {
4166            if (requiredsize<2*outsize)
4167                requiredsize = 2*outsize;
4168            if (unicode_resize(output, requiredsize) < 0)
4169                goto onError;
4170        }
4171        if (unicode_widen(output, *outpos,
4172                          PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
4173            goto onError;
4174        copy_characters(*output, *outpos, repunicode, 0, replen);
4175        *outpos += replen;
4176    }
4177    else {
4178        wchar_t *repwstr;
4179        Py_ssize_t repwlen;
4180        repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4181        if (repwstr == NULL)
4182            goto onError;
4183        /* need more space? (at least enough for what we
4184           have+the replacement+the rest of the string (starting
4185           at the new input position), so we won't have to check space
4186           when there are no errors in the rest of the string) */
4187        requiredsize = *outpos + repwlen + insize-newpos;
4188        if (requiredsize > outsize) {
4189            if (requiredsize < 2*outsize)
4190                requiredsize = 2*outsize;
4191            if (unicode_resize(output, requiredsize) < 0)
4192                goto onError;
4193        }
4194        wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4195        *outpos += repwlen;
4196    }
4197    *endinpos = newpos;
4198    *inptr = *input + newpos;
4199
4200    /* we made it! */
4201    res = 0;
4202
4203  onError:
4204    Py_XDECREF(restuple);
4205    return res;
4206}
4207
4208/* --- UTF-7 Codec -------------------------------------------------------- */
4209
4210/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4211
/* Three simple macros defining base-64.
 *
 * All macros in this section are function-like and evaluate their
 * argument more than once: only pass expressions without side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * (any c that is not a letter, a digit or '+' maps to 63, so the caller
 * is expected to have checked IS_BASE64(c) first) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The (c) < 128 test comes first so that utf7_category is never indexed
 * out of bounds; NUL (c == 0) is never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4288
4289PyObject *
4290PyUnicode_DecodeUTF7(const char *s,
4291                     Py_ssize_t size,
4292                     const char *errors)
4293{
4294    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4295}
4296
4297/* The decoder.  The only state we preserve is our read position,
4298 * i.e. how many characters we have consumed.  So if we end in the
4299 * middle of a shift sequence we have to back off the read position
4300 * and the output to the beginning of the sequence, otherwise we lose
4301 * all the shift state (seen bits, number of bits seen, high
4302 * surrogate). */
4303
4304PyObject *
4305PyUnicode_DecodeUTF7Stateful(const char *s,
4306                             Py_ssize_t size,
4307                             const char *errors,
4308                             Py_ssize_t *consumed)
4309{
4310    const char *starts = s;
4311    Py_ssize_t startinpos;
4312    Py_ssize_t endinpos;
4313    Py_ssize_t outpos;
4314    const char *e;
4315    PyObject *unicode;
4316    const char *errmsg = "";
4317    int inShift = 0;
4318    Py_ssize_t shiftOutStart;
4319    unsigned int base64bits = 0;
4320    unsigned long base64buffer = 0;
4321    Py_UCS4 surrogate = 0;
4322    PyObject *errorHandler = NULL;
4323    PyObject *exc = NULL;
4324
4325    /* Start off assuming it's all ASCII. Widen later as necessary. */
4326    unicode = PyUnicode_New(size, 127);
4327    if (!unicode)
4328        return NULL;
4329    if (size == 0) {
4330        if (consumed)
4331            *consumed = 0;
4332        return unicode;
4333    }
4334
4335    shiftOutStart = outpos = 0;
4336    e = s + size;
4337
4338    while (s < e) {
4339        Py_UCS4 ch;
4340      restart:
4341        ch = (unsigned char) *s;
4342
4343        if (inShift) { /* in a base-64 section */
4344            if (IS_BASE64(ch)) { /* consume a base-64 character */
4345                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4346                base64bits += 6;
4347                s++;
4348                if (base64bits >= 16) {
4349                    /* we have enough bits for a UTF-16 value */
4350                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4351                    base64bits -= 16;
4352                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4353                    if (surrogate) {
4354                        /* expecting a second surrogate */
4355                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4356                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4357                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4358                                goto onError;
4359                            surrogate = 0;
4360                            continue;
4361                        }
4362                        else {
4363                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4364                                goto onError;
4365                            surrogate = 0;
4366                        }
4367                    }
4368                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4369                        /* first surrogate */
4370                        surrogate = outCh;
4371                    }
4372                    else {
4373                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4374                            goto onError;
4375                    }
4376                }
4377            }
4378            else { /* now leaving a base-64 section */
4379                inShift = 0;
4380                s++;
4381                if (surrogate) {
4382                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4383                        goto onError;
4384                    surrogate = 0;
4385                }
4386                if (base64bits > 0) { /* left-over bits */
4387                    if (base64bits >= 6) {
4388                        /* We've seen at least one base-64 character */
4389                        errmsg = "partial character in shift sequence";
4390                        goto utf7Error;
4391                    }
4392                    else {
4393                        /* Some bits remain; they should be zero */
4394                        if (base64buffer != 0) {
4395                            errmsg = "non-zero padding bits in shift sequence";
4396                            goto utf7Error;
4397                        }
4398                    }
4399                }
4400                if (ch != '-') {
4401                    /* '-' is absorbed; other terminating
4402                       characters are preserved */
4403                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
4404                        goto onError;
4405                }
4406            }
4407        }
4408        else if ( ch == '+' ) {
4409            startinpos = s-starts;
4410            s++; /* consume '+' */
4411            if (s < e && *s == '-') { /* '+-' encodes '+' */
4412                s++;
4413                if (unicode_putchar(&unicode, &outpos, '+') < 0)
4414                    goto onError;
4415            }
4416            else { /* begin base64-encoded section */
4417                inShift = 1;
4418                shiftOutStart = outpos;
4419                base64bits = 0;
4420            }
4421        }
4422        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4423            if (unicode_putchar(&unicode, &outpos, ch) < 0)
4424                goto onError;
4425            s++;
4426        }
4427        else {
4428            startinpos = s-starts;
4429            s++;
4430            errmsg = "unexpected special character";
4431            goto utf7Error;
4432        }
4433        continue;
4434utf7Error:
4435        endinpos = s-starts;
4436        if (unicode_decode_call_errorhandler(
4437                errors, &errorHandler,
4438                "utf7", errmsg,
4439                &starts, &e, &startinpos, &endinpos, &exc, &s,
4440                &unicode, &outpos))
4441            goto onError;
4442    }
4443
4444    /* end of string */
4445
4446    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4447        /* if we're in an inconsistent state, that's an error */
4448        if (surrogate ||
4449                (base64bits >= 6) ||
4450                (base64bits > 0 && base64buffer != 0)) {
4451            endinpos = size;
4452            if (unicode_decode_call_errorhandler(
4453                    errors, &errorHandler,
4454                    "utf7", "unterminated shift sequence",
4455                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4456                    &unicode, &outpos))
4457                goto onError;
4458            if (s < e)
4459                goto restart;
4460        }
4461    }
4462
4463    /* return state */
4464    if (consumed) {
4465        if (inShift) {
4466            outpos = shiftOutStart; /* back off output */
4467            *consumed = startinpos;
4468        }
4469        else {
4470            *consumed = s-starts;
4471        }
4472    }
4473
4474    if (unicode_resize(&unicode, outpos) < 0)
4475        goto onError;
4476
4477    Py_XDECREF(errorHandler);
4478    Py_XDECREF(exc);
4479    return unicode_result(unicode);
4480
4481  onError:
4482    Py_XDECREF(errorHandler);
4483    Py_XDECREF(exc);
4484    Py_DECREF(unicode);
4485    return NULL;
4486}
4487
4488
4489PyObject *
4490_PyUnicode_EncodeUTF7(PyObject *str,
4491                      int base64SetO,
4492                      int base64WhiteSpace,
4493                      const char *errors)
4494{
4495    int kind;
4496    void *data;
4497    Py_ssize_t len;
4498    PyObject *v;
4499    Py_ssize_t allocated;
4500    int inShift = 0;
4501    Py_ssize_t i;
4502    unsigned int base64bits = 0;
4503    unsigned long base64buffer = 0;
4504    char * out;
4505    char * start;
4506
4507    if (PyUnicode_READY(str) == -1)
4508        return NULL;
4509    kind = PyUnicode_KIND(str);
4510    data = PyUnicode_DATA(str);
4511    len = PyUnicode_GET_LENGTH(str);
4512
4513    if (len == 0)
4514        return PyBytes_FromStringAndSize(NULL, 0);
4515
4516    /* It might be possible to tighten this worst case */
4517    allocated = 8 * len;
4518    if (allocated / 8 != len)
4519        return PyErr_NoMemory();
4520
4521    v = PyBytes_FromStringAndSize(NULL, allocated);
4522    if (v == NULL)
4523        return NULL;
4524
4525    start = out = PyBytes_AS_STRING(v);
4526    for (i = 0; i < len; ++i) {
4527        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4528
4529        if (inShift) {
4530            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4531                /* shifting out */
4532                if (base64bits) { /* output remaining bits */
4533                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4534                    base64buffer = 0;
4535                    base64bits = 0;
4536                }
4537                inShift = 0;
4538                /* Characters not in the BASE64 set implicitly unshift the sequence
4539                   so no '-' is required, except if the character is itself a '-' */
4540                if (IS_BASE64(ch) || ch == '-') {
4541                    *out++ = '-';
4542                }
4543                *out++ = (char) ch;
4544            }
4545            else {
4546                goto encode_char;
4547            }
4548        }
4549        else { /* not in a shift sequence */
4550            if (ch == '+') {
4551                *out++ = '+';
4552                        *out++ = '-';
4553            }
4554            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4555                *out++ = (char) ch;
4556            }
4557            else {
4558                *out++ = '+';
4559                inShift = 1;
4560                goto encode_char;
4561            }
4562        }
4563        continue;
4564encode_char:
4565        if (ch >= 0x10000) {
4566            assert(ch <= MAX_UNICODE);
4567
4568            /* code first surrogate */
4569            base64bits += 16;
4570            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4571            while (base64bits >= 6) {
4572                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4573                base64bits -= 6;
4574            }
4575            /* prepare second surrogate */
4576            ch = Py_UNICODE_LOW_SURROGATE(ch);
4577        }
4578        base64bits += 16;
4579        base64buffer = (base64buffer << 16) | ch;
4580        while (base64bits >= 6) {
4581            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4582            base64bits -= 6;
4583        }
4584    }
4585    if (base64bits)
4586        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4587    if (inShift)
4588        *out++ = '-';
4589    if (_PyBytes_Resize(&v, out - start) < 0)
4590        return NULL;
4591    return v;
4592}
4593PyObject *
4594PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4595                     Py_ssize_t size,
4596                     int base64SetO,
4597                     int base64WhiteSpace,
4598                     const char *errors)
4599{
4600    PyObject *result;
4601    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4602    if (tmp == NULL)
4603        return NULL;
4604    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4605                                   base64WhiteSpace, errors);
4606    Py_DECREF(tmp);
4607    return result;
4608}
4609
4610#undef IS_BASE64
4611#undef FROM_BASE64
4612#undef TO_BASE64
4613#undef DECODE_DIRECT
4614#undef ENCODE_DIRECT
4615
4616/* --- UTF-8 Codec -------------------------------------------------------- */
4617
4618PyObject *
4619PyUnicode_DecodeUTF8(const char *s,
4620                     Py_ssize_t size,
4621                     const char *errors)
4622{
4623    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4624}
4625
4626#include "stringlib/asciilib.h"
4627#include "stringlib/codecs.h"
4628#include "stringlib/undef.h"
4629
4630#include "stringlib/ucs1lib.h"
4631#include "stringlib/codecs.h"
4632#include "stringlib/undef.h"
4633
4634#include "stringlib/ucs2lib.h"
4635#include "stringlib/codecs.h"
4636#include "stringlib/undef.h"
4637
4638#include "stringlib/ucs4lib.h"
4639#include "stringlib/codecs.h"
4640#include "stringlib/undef.h"
4641
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   `(size_t) ptr & LONG_PTR_MASK` is zero exactly when ptr is aligned on a
   SIZEOF_LONG boundary; `(size_t) ptr & ~LONG_PTR_MASK` rounds ptr down to
   such a boundary.
   NOTE(review): the expansion is not wrapped in outer parentheses, so it is
   only safe next to operators that bind more loosely than a cast (as with
   the `&` and `~` uses in this file). */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of every
   byte set, so `value & ASCII_CHAR_MASK` is non-zero as soon as any byte
   of the long is >= 0x80. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080L
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080L
#else
# error C 'long' size should be either 4 or 8!
#endif
4654
4655static Py_ssize_t
4656ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4657{
4658    const char *p = start;
4659    const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
4660
4661#if SIZEOF_LONG <= SIZEOF_VOID_P
4662    assert(!((size_t) dest & LONG_PTR_MASK));
4663    if (!((size_t) p & LONG_PTR_MASK)) {
4664        /* Fast path, see in STRINGLIB(utf8_decode) for
4665           an explanation. */
4666        /* Help register allocation */
4667        register const char *_p = p;
4668        register Py_UCS1 * q = dest;
4669        while (_p < aligned_end) {
4670            unsigned long value = *(const unsigned long *) _p;
4671            if (value & ASCII_CHAR_MASK)
4672                break;
4673            *((unsigned long *)q) = value;
4674            _p += SIZEOF_LONG;
4675            q += SIZEOF_LONG;
4676        }
4677        p = _p;
4678        while (p < end) {
4679            if ((unsigned char)*p & 0x80)
4680                break;
4681            *q++ = *p++;
4682        }
4683        return p - start;
4684    }
4685#endif
4686    while (p < end) {
4687        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4688           for an explanation. */
4689        if (!((size_t) p & LONG_PTR_MASK)) {
4690            /* Help register allocation */
4691            register const char *_p = p;
4692            while (_p < aligned_end) {
4693                unsigned long value = *(unsigned long *) _p;
4694                if (value & ASCII_CHAR_MASK)
4695                    break;
4696                _p += SIZEOF_LONG;
4697            }
4698            p = _p;
4699            if (_p == end)
4700                break;
4701        }
4702        if ((unsigned char)*p & 0x80)
4703            break;
4704        ++p;
4705    }
4706    memcpy(dest, start, p - start);
4707    return p - start;
4708}
4709
4710PyObject *
4711PyUnicode_DecodeUTF8Stateful(const char *s,
4712                             Py_ssize_t size,
4713                             const char *errors,
4714                             Py_ssize_t *consumed)
4715{
4716    PyObject *unicode;
4717    const char *starts = s;
4718    const char *end = s + size;
4719    Py_ssize_t outpos;
4720
4721    Py_ssize_t startinpos;
4722    Py_ssize_t endinpos;
4723    const char *errmsg = "";
4724    PyObject *errorHandler = NULL;
4725    PyObject *exc = NULL;
4726
4727    if (size == 0) {
4728        if (consumed)
4729            *consumed = 0;
4730        Py_INCREF(unicode_empty);
4731        return unicode_empty;
4732    }
4733
4734    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4735    if (size == 1 && (unsigned char)s[0] < 128) {
4736        if (consumed)
4737            *consumed = 1;
4738        return get_latin1_char((unsigned char)s[0]);
4739    }
4740
4741    unicode = PyUnicode_New(size, 127);
4742    if (!unicode)
4743        return NULL;
4744
4745    outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4746    s += outpos;
4747    while (s < end) {
4748        Py_UCS4 ch;
4749        int kind = PyUnicode_KIND(unicode);
4750        if (kind == PyUnicode_1BYTE_KIND) {
4751            if (PyUnicode_IS_ASCII(unicode))
4752                ch = asciilib_utf8_decode(&s, end,
4753                        PyUnicode_1BYTE_DATA(unicode), &outpos);
4754            else
4755                ch = ucs1lib_utf8_decode(&s, end,
4756                        PyUnicode_1BYTE_DATA(unicode), &outpos);
4757        } else if (kind == PyUnicode_2BYTE_KIND) {
4758            ch = ucs2lib_utf8_decode(&s, end,
4759                    PyUnicode_2BYTE_DATA(unicode), &outpos);
4760        } else {
4761            assert(kind == PyUnicode_4BYTE_KIND);
4762            ch = ucs4lib_utf8_decode(&s, end,
4763                    PyUnicode_4BYTE_DATA(unicode), &outpos);
4764        }
4765
4766        switch (ch) {
4767        case 0:
4768            if (s == end || consumed)
4769                goto End;
4770            errmsg = "unexpected end of data";
4771            startinpos = s - starts;
4772            endinpos = startinpos + 1;
4773            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4774                endinpos++;
4775            break;
4776        case 1:
4777            errmsg = "invalid start byte";
4778            startinpos = s - starts;
4779            endinpos = startinpos + 1;
4780            break;
4781        case 2:
4782            errmsg = "invalid continuation byte";
4783            startinpos = s - starts;
4784            endinpos = startinpos + 1;
4785            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4786                endinpos++;
4787            break;
4788        default:
4789            if (unicode_putchar(&unicode, &outpos, ch) < 0)
4790                goto onError;
4791            continue;
4792        }
4793
4794        if (unicode_decode_call_errorhandler(
4795                errors, &errorHandler,
4796                "utf-8", errmsg,
4797                &starts, &end, &startinpos, &endinpos, &exc, &s,
4798                &unicode, &outpos))
4799            goto onError;
4800    }
4801
4802End:
4803    if (unicode_resize(&unicode, outpos) < 0)
4804        goto onError;
4805
4806    if (consumed)
4807        *consumed = s - starts;
4808
4809    Py_XDECREF(errorHandler);
4810    Py_XDECREF(exc);
4811    assert(_PyUnicode_CheckConsistency(unicode, 1));
4812    return unicode;
4813
4814onError:
4815    Py_XDECREF(errorHandler);
4816    Py_XDECREF(exc);
4817    Py_XDECREF(unicode);
4818    return NULL;
4819}
4820
4821#ifdef __APPLE__
4822
4823/* Simplified UTF-8 decoder using surrogateescape error handler,
4824   used to decode the command line arguments on Mac OS X. */
4825
4826wchar_t*
4827_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4828{
4829    const char *e;
4830    wchar_t *unicode;
4831    Py_ssize_t outpos;
4832
4833    /* Note: size will always be longer than the resulting Unicode
4834       character count */
4835    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4836        PyErr_NoMemory();
4837        return NULL;
4838    }
4839    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4840    if (!unicode)
4841        return NULL;
4842
4843    /* Unpack UTF-8 encoded data */
4844    e = s + size;
4845    outpos = 0;
4846    while (s < e) {
4847        Py_UCS4 ch;
4848#if SIZEOF_WCHAR_T == 4
4849        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4850#else
4851        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4852#endif
4853        if (ch > 0xFF) {
4854#if SIZEOF_WCHAR_T == 4
4855            assert(0);
4856#else
4857            assert(Py_UNICODE_IS_SURROGATE(ch));
4858            /*  compute and append the two surrogates: */
4859            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4860            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4861#endif
4862        }
4863        else {
4864            if (!ch && s == e)
4865                break;
4866            /* surrogateescape */
4867            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4868        }
4869    }
4870    unicode[outpos] = L'\0';
4871    return unicode;
4872}
4873
4874#endif /* __APPLE__ */
4875
4876/* Primary internal function which creates utf8 encoded bytes objects.
4877
4878   Allocation strategy:  if the string is short, convert into a stack buffer
4879   and allocate exactly as much space needed at the end.  Else allocate the
4880   maximum possible needed (4 result bytes per Unicode character), and return
4881   the excess memory at the end.
4882*/
4883PyObject *
4884_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4885{
4886    enum PyUnicode_Kind kind;
4887    void *data;
4888    Py_ssize_t size;
4889
4890    if (!PyUnicode_Check(unicode)) {
4891        PyErr_BadArgument();
4892        return NULL;
4893    }
4894
4895    if (PyUnicode_READY(unicode) == -1)
4896        return NULL;
4897
4898    if (PyUnicode_UTF8(unicode))
4899        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4900                                         PyUnicode_UTF8_LENGTH(unicode));
4901
4902    kind = PyUnicode_KIND(unicode);
4903    data = PyUnicode_DATA(unicode);
4904    size = PyUnicode_GET_LENGTH(unicode);
4905
4906    switch (kind) {
4907    default:
4908        assert(0);
4909    case PyUnicode_1BYTE_KIND:
4910        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4911        assert(!PyUnicode_IS_ASCII(unicode));
4912        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4913    case PyUnicode_2BYTE_KIND:
4914        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4915    case PyUnicode_4BYTE_KIND:
4916        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4917    }
4918}
4919
4920PyObject *
4921PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4922                     Py_ssize_t size,
4923                     const char *errors)
4924{
4925    PyObject *v, *unicode;
4926
4927    unicode = PyUnicode_FromUnicode(s, size);
4928    if (unicode == NULL)
4929        return NULL;
4930    v = _PyUnicode_AsUTF8String(unicode, errors);
4931    Py_DECREF(unicode);
4932    return v;
4933}
4934
4935PyObject *
4936PyUnicode_AsUTF8String(PyObject *unicode)
4937{
4938    return _PyUnicode_AsUTF8String(unicode, NULL);
4939}
4940
4941/* --- UTF-32 Codec ------------------------------------------------------- */
4942
4943PyObject *
4944PyUnicode_DecodeUTF32(const char *s,
4945                      Py_ssize_t size,
4946                      const char *errors,
4947                      int *byteorder)
4948{
4949    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4950}
4951
4952PyObject *
4953PyUnicode_DecodeUTF32Stateful(const char *s,
4954                              Py_ssize_t size,
4955                              const char *errors,
4956                              int *byteorder,
4957                              Py_ssize_t *consumed)
4958{
4959    const char *starts = s;
4960    Py_ssize_t startinpos;
4961    Py_ssize_t endinpos;
4962    Py_ssize_t outpos;
4963    PyObject *unicode;
4964    const unsigned char *q, *e;
4965    int bo = 0;       /* assume native ordering by default */
4966    const char *errmsg = "";
4967    /* Offsets from q for retrieving bytes in the right order. */
4968#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4969    int iorder[] = {0, 1, 2, 3};
4970#else
4971    int iorder[] = {3, 2, 1, 0};
4972#endif
4973    PyObject *errorHandler = NULL;
4974    PyObject *exc = NULL;
4975
4976    q = (unsigned char *)s;
4977    e = q + size;
4978
4979    if (byteorder)
4980        bo = *byteorder;
4981
4982    /* Check for BOM marks (U+FEFF) in the input and adjust current
4983       byte order setting accordingly. In native mode, the leading BOM
4984       mark is skipped, in all other modes, it is copied to the output
4985       stream as-is (giving a ZWNBSP character). */
4986    if (bo == 0) {
4987        if (size >= 4) {
4988            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4989                (q[iorder[1]] << 8) | q[iorder[0]];
4990#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4991            if (bom == 0x0000FEFF) {
4992                q += 4;
4993                bo = -1;
4994            }
4995            else if (bom == 0xFFFE0000) {
4996                q += 4;
4997                bo = 1;
4998            }
4999#else
5000            if (bom == 0x0000FEFF) {
5001                q += 4;
5002                bo = 1;
5003            }
5004            else if (bom == 0xFFFE0000) {
5005                q += 4;
5006                bo = -1;
5007            }
5008#endif
5009        }
5010    }
5011
5012    if (bo == -1) {
5013        /* force LE */
5014        iorder[0] = 0;
5015        iorder[1] = 1;
5016        iorder[2] = 2;
5017        iorder[3] = 3;
5018    }
5019    else if (bo == 1) {
5020        /* force BE */
5021        iorder[0] = 3;
5022        iorder[1] = 2;
5023        iorder[2] = 1;
5024        iorder[3] = 0;
5025    }
5026
5027    /* This might be one to much, because of a BOM */
5028    unicode = PyUnicode_New((size+3)/4, 127);
5029    if (!unicode)
5030        return NULL;
5031    if (size == 0)
5032        return unicode;
5033    outpos = 0;
5034
5035    while (q < e) {
5036        Py_UCS4 ch;
5037        /* remaining bytes at the end? (size should be divisible by 4) */
5038        if (e-q<4) {
5039            if (consumed)
5040                break;
5041            errmsg = "truncated data";
5042            startinpos = ((const char *)q)-starts;
5043            endinpos = ((const char *)e)-starts;
5044            goto utf32Error;
5045            /* The remaining input chars are ignored if the callback
5046               chooses to skip the input */
5047        }
5048        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5049            (q[iorder[1]] << 8) | q[iorder[0]];
5050
5051        if (ch >= 0x110000)
5052        {
5053            errmsg = "codepoint not in range(0x110000)";
5054            startinpos = ((const char *)q)-starts;
5055            endinpos = startinpos+4;
5056            goto utf32Error;
5057        }
5058        if (unicode_putchar(&unicode, &outpos, ch) < 0)
5059            goto onError;
5060        q += 4;
5061        continue;
5062      utf32Error:
5063        if (unicode_decode_call_errorhandler(
5064                errors, &errorHandler,
5065                "utf32", errmsg,
5066                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5067                &unicode, &outpos))
5068            goto onError;
5069    }
5070
5071    if (byteorder)
5072        *byteorder = bo;
5073
5074    if (consumed)
5075        *consumed = (const char *)q-starts;
5076
5077    /* Adjust length */
5078    if (unicode_resize(&unicode, outpos) < 0)
5079        goto onError;
5080
5081    Py_XDECREF(errorHandler);
5082    Py_XDECREF(exc);
5083    return unicode_result(unicode);
5084
5085  onError:
5086    Py_DECREF(unicode);
5087    Py_XDECREF(errorHandler);
5088    Py_XDECREF(exc);
5089    return NULL;
5090}
5091
5092PyObject *
5093_PyUnicode_EncodeUTF32(PyObject *str,
5094                       const char *errors,
5095                       int byteorder)
5096{
5097    int kind;
5098    void *data;
5099    Py_ssize_t len;
5100    PyObject *v;
5101    unsigned char *p;
5102    Py_ssize_t nsize, bytesize, i;
5103    /* Offsets from p for storing byte pairs in the right order. */
5104#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5105    int iorder[] = {0, 1, 2, 3};
5106#else
5107    int iorder[] = {3, 2, 1, 0};
5108#endif
5109
5110#define STORECHAR(CH)                           \
5111    do {                                        \
5112        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5113        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5114        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5115        p[iorder[0]] = (CH) & 0xff;             \
5116        p += 4;                                 \
5117    } while(0)
5118
5119    if (!PyUnicode_Check(str)) {
5120        PyErr_BadArgument();
5121        return NULL;
5122    }
5123    if (PyUnicode_READY(str) == -1)
5124        return NULL;
5125    kind = PyUnicode_KIND(str);
5126    data = PyUnicode_DATA(str);
5127    len = PyUnicode_GET_LENGTH(str);
5128
5129    nsize = len + (byteorder == 0);
5130    bytesize = nsize * 4;
5131    if (bytesize / 4 != nsize)
5132        return PyErr_NoMemory();
5133    v = PyBytes_FromStringAndSize(NULL, bytesize);
5134    if (v == NULL)
5135        return NULL;
5136
5137    p = (unsigned char *)PyBytes_AS_STRING(v);
5138    if (byteorder == 0)
5139        STORECHAR(0xFEFF);
5140    if (len == 0)
5141        goto done;
5142
5143    if (byteorder == -1) {
5144        /* force LE */
5145        iorder[0] = 0;
5146        iorder[1] = 1;
5147        iorder[2] = 2;
5148        iorder[3] = 3;
5149    }
5150    else if (byteorder == 1) {
5151        /* force BE */
5152        iorder[0] = 3;
5153        iorder[1] = 2;
5154        iorder[2] = 1;
5155        iorder[3] = 0;
5156    }
5157
5158    for (i = 0; i < len; i++)
5159        STORECHAR(PyUnicode_READ(kind, data, i));
5160
5161  done:
5162    return v;
5163#undef STORECHAR
5164}
5165
5166PyObject *
5167PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5168                      Py_ssize_t size,
5169                      const char *errors,
5170                      int byteorder)
5171{
5172    PyObject *result;
5173    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5174    if (tmp == NULL)
5175        return NULL;
5176    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5177    Py_DECREF(tmp);
5178    return result;
5179}
5180
5181PyObject *
5182PyUnicode_AsUTF32String(PyObject *unicode)
5183{
5184    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5185}
5186
5187/* --- UTF-16 Codec ------------------------------------------------------- */
5188
5189PyObject *
5190PyUnicode_DecodeUTF16(const char *s,
5191                      Py_ssize_t size,
5192                      const char *errors,
5193                      int *byteorder)
5194{
5195    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5196}
5197
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
   Both test the top bit of every 16-bit unit in the long (bit 15 in
   native order, bit 7 when the two bytes of each unit are swapped).
   STRIPPED_MASK selects the low byte of every 16-bit unit; the decoder
   uses it to swap the two bytes of all units in a long at once.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
# define STRIPPED_MASK          0x00FF00FF00FF00FFL
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
# define STRIPPED_MASK          0x00FF00FFL
#else
# error C 'long' size should be either 4 or 8!
#endif
5216
5217PyObject *
5218PyUnicode_DecodeUTF16Stateful(const char *s,
5219                              Py_ssize_t size,
5220                              const char *errors,
5221                              int *byteorder,
5222                              Py_ssize_t *consumed)
5223{
5224    const char *starts = s;
5225    Py_ssize_t startinpos;
5226    Py_ssize_t endinpos;
5227    Py_ssize_t outpos;
5228    PyObject *unicode;
5229    const unsigned char *q, *e, *aligned_end;
5230    int bo = 0;       /* assume native ordering by default */
5231    int native_ordering = 0;
5232    const char *errmsg = "";
5233    /* Offsets from q for retrieving byte pairs in the right order. */
5234#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5235    int ihi = 1, ilo = 0;
5236#else
5237    int ihi = 0, ilo = 1;
5238#endif
5239    PyObject *errorHandler = NULL;
5240    PyObject *exc = NULL;
5241
5242    /* Note: size will always be longer than the resulting Unicode
5243       character count */
5244    unicode = PyUnicode_New(size, 127);
5245    if (!unicode)
5246        return NULL;
5247    if (size == 0)
5248        return unicode;
5249    outpos = 0;
5250
5251    q = (unsigned char *)s;
5252    e = q + size - 1;
5253
5254    if (byteorder)
5255        bo = *byteorder;
5256
5257    /* Check for BOM marks (U+FEFF) in the input and adjust current
5258       byte order setting accordingly. In native mode, the leading BOM
5259       mark is skipped, in all other modes, it is copied to the output
5260       stream as-is (giving a ZWNBSP character). */
5261    if (bo == 0) {
5262        if (size >= 2) {
5263            const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
5264#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5265            if (bom == 0xFEFF) {
5266                q += 2;
5267                bo = -1;
5268            }
5269            else if (bom == 0xFFFE) {
5270                q += 2;
5271                bo = 1;
5272            }
5273#else
5274            if (bom == 0xFEFF) {
5275                q += 2;
5276                bo = 1;
5277            }
5278            else if (bom == 0xFFFE) {
5279                q += 2;
5280                bo = -1;
5281            }
5282#endif
5283        }
5284    }
5285
5286    if (bo == -1) {
5287        /* force LE */
5288        ihi = 1;
5289        ilo = 0;
5290    }
5291    else if (bo == 1) {
5292        /* force BE */
5293        ihi = 0;
5294        ilo = 1;
5295    }
5296#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5297    native_ordering = ilo < ihi;
5298#else
5299    native_ordering = ilo > ihi;
5300#endif
5301
5302    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
5303    while (q < e) {
5304        Py_UCS4 ch;
5305        /* First check for possible aligned read of a C 'long'. Unaligned
5306           reads are more expensive, better to defer to another iteration. */
5307        if (!((size_t) q & LONG_PTR_MASK)) {
5308            /* Fast path for runs of non-surrogate chars. */
5309            register const unsigned char *_q = q;
5310            int kind = PyUnicode_KIND(unicode);
5311            void *data = PyUnicode_DATA(unicode);
5312            while (_q < aligned_end) {
5313                unsigned long block = * (unsigned long *) _q;
5314                Py_UCS4 maxch;
5315                if (native_ordering) {
5316                    /* Can use buffer directly */
5317                    if (block & FAST_CHAR_MASK)
5318                        break;
5319                }
5320                else {
5321                    /* Need to byte-swap */
5322                    if (block & SWAPPED_FAST_CHAR_MASK)
5323                        break;
5324                    block = ((block >> 8) & STRIPPED_MASK) |
5325                            ((block & STRIPPED_MASK) << 8);
5326                }
5327                maxch = (Py_UCS2)(block & 0xFFFF);
5328#if SIZEOF_LONG == 8
5329                ch = (Py_UCS2)((block >> 16) & 0xFFFF);
5330                maxch = MAX_MAXCHAR(maxch, ch);
5331                ch = (Py_UCS2)((block >> 32) & 0xFFFF);
5332                maxch = MAX_MAXCHAR(maxch, ch);
5333                ch = (Py_UCS2)(block >> 48);
5334                maxch = MAX_MAXCHAR(maxch, ch);
5335#else
5336                ch = (Py_UCS2)(block >> 16);
5337                maxch = MAX_MAXCHAR(maxch, ch);
5338#endif
5339                if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5340                    if (unicode_widen(&unicode, outpos, maxch) < 0)
5341                        goto onError;
5342                    kind = PyUnicode_KIND(unicode);
5343                    data = PyUnicode_DATA(unicode);
5344                }
5345#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5346                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
5347#if SIZEOF_LONG == 8
5348                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5349                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5350                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5351#else
5352                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5353#endif
5354#else
5355#if SIZEOF_LONG == 8
5356                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5357                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5358                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5359#else
5360                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5361#endif
5362                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
5363#endif
5364                _q += SIZEOF_LONG;
5365            }
5366            q = _q;
5367            if (q >= e)
5368                break;
5369        }
5370        ch = (q[ihi] << 8) | q[ilo];
5371
5372        q += 2;
5373
5374        if (!Py_UNICODE_IS_SURROGATE(ch)) {
5375            if (unicode_putchar(&unicode, &outpos, ch) < 0)
5376                goto onError;
5377            continue;
5378        }
5379
5380        /* UTF-16 code pair: */
5381        if (q > e) {
5382            errmsg = "unexpected end of data";
5383            startinpos = (((const char *)q) - 2) - starts;
5384            endinpos = ((const char *)e) + 1 - starts;
5385            goto utf16Error;
5386        }
5387        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5388            Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
5389            q += 2;
5390            if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
5391                if (unicode_putchar(&unicode, &outpos,
5392                                    Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
5393                    goto onError;
5394                continue;
5395            }
5396            else {
5397                errmsg = "illegal UTF-16 surrogate";
5398                startinpos = (((const char *)q)-4)-starts;
5399                endinpos = startinpos+2;
5400                goto utf16Error;
5401            }
5402
5403        }
5404        errmsg = "illegal encoding";
5405        startinpos = (((const char *)q)-2)-starts;
5406        endinpos = startinpos+2;
5407        /* Fall through to report the error */
5408
5409      utf16Error:
5410        if (unicode_decode_call_errorhandler(
5411                errors,
5412                &errorHandler,
5413                "utf16", errmsg,
5414                &starts,
5415                (const char **)&e,
5416                &startinpos,
5417                &endinpos,
5418                &exc,
5419                (const char **)&q,
5420                &unicode,
5421                &outpos))
5422            goto onError;
5423    }
5424    /* remaining byte at the end? (size should be even) */
5425    if (e == q) {
5426        if (!consumed) {
5427            errmsg = "truncated data";
5428            startinpos = ((const char *)q) - starts;
5429            endinpos = ((const char *)e) + 1 - starts;
5430            if (unicode_decode_call_errorhandler(
5431                    errors,
5432                    &errorHandler,
5433                    "utf16", errmsg,
5434                    &starts,
5435                    (const char **)&e,
5436                    &startinpos,
5437                    &endinpos,
5438                    &exc,
5439                    (const char **)&q,
5440                    &unicode,
5441                    &outpos))
5442                goto onError;
5443            /* The remaining input chars are ignored if the callback
5444               chooses to skip the input */
5445        }
5446    }
5447
5448    if (byteorder)
5449        *byteorder = bo;
5450
5451    if (consumed)
5452        *consumed = (const char *)q-starts;
5453
5454    /* Adjust length */
5455    if (unicode_resize(&unicode, outpos) < 0)
5456        goto onError;
5457
5458    Py_XDECREF(errorHandler);
5459    Py_XDECREF(exc);
5460    return unicode_result(unicode);
5461
5462  onError:
5463    Py_DECREF(unicode);
5464    Py_XDECREF(errorHandler);
5465    Py_XDECREF(exc);
5466    return NULL;
5467}
5468
5469#undef FAST_CHAR_MASK
5470#undef SWAPPED_FAST_CHAR_MASK
5471
5472PyObject *
5473_PyUnicode_EncodeUTF16(PyObject *str,
5474                       const char *errors,
5475                       int byteorder)
5476{
5477    int kind;
5478    void *data;
5479    Py_ssize_t len;
5480    PyObject *v;
5481    unsigned char *p;
5482    Py_ssize_t nsize, bytesize;
5483    Py_ssize_t i, pairs;
5484    /* Offsets from p for storing byte pairs in the right order. */
5485#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5486    int ihi = 1, ilo = 0;
5487#else
5488    int ihi = 0, ilo = 1;
5489#endif
5490
5491#define STORECHAR(CH)                           \
5492    do {                                        \
5493        p[ihi] = ((CH) >> 8) & 0xff;            \
5494        p[ilo] = (CH) & 0xff;                   \
5495        p += 2;                                 \
5496    } while(0)
5497
5498    if (!PyUnicode_Check(str)) {
5499        PyErr_BadArgument();
5500        return NULL;
5501    }
5502    if (PyUnicode_READY(str) == -1)
5503        return NULL;
5504    kind = PyUnicode_KIND(str);
5505    data = PyUnicode_DATA(str);
5506    len = PyUnicode_GET_LENGTH(str);
5507
5508    pairs = 0;
5509    if (kind == PyUnicode_4BYTE_KIND)
5510        for (i = 0; i < len; i++)
5511            if (PyUnicode_READ(kind, data, i) >= 0x10000)
5512                pairs++;
5513    /* 2 * (len + pairs + (byteorder == 0)) */
5514    if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5515        return PyErr_NoMemory();
5516    nsize = len + pairs + (byteorder == 0);
5517    bytesize = nsize * 2;
5518    if (bytesize / 2 != nsize)
5519        return PyErr_NoMemory();
5520    v = PyBytes_FromStringAndSize(NULL, bytesize);
5521    if (v == NULL)
5522        return NULL;
5523
5524    p = (unsigned char *)PyBytes_AS_STRING(v);
5525    if (byteorder == 0)
5526        STORECHAR(0xFEFF);
5527    if (len == 0)
5528        goto done;
5529
5530    if (byteorder == -1) {
5531        /* force LE */
5532        ihi = 1;
5533        ilo = 0;
5534    }
5535    else if (byteorder == 1) {
5536        /* force BE */
5537        ihi = 0;
5538        ilo = 1;
5539    }
5540
5541    for (i = 0; i < len; i++) {
5542        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5543        Py_UCS4 ch2 = 0;
5544        if (ch >= 0x10000) {
5545            ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5546            ch  = Py_UNICODE_HIGH_SURROGATE(ch);
5547        }
5548        STORECHAR(ch);
5549        if (ch2)
5550            STORECHAR(ch2);
5551    }
5552
5553  done:
5554    return v;
5555#undef STORECHAR
5556}
5557
5558PyObject *
5559PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5560                      Py_ssize_t size,
5561                      const char *errors,
5562                      int byteorder)
5563{
5564    PyObject *result;
5565    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5566    if (tmp == NULL)
5567        return NULL;
5568    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5569    Py_DECREF(tmp);
5570    return result;
5571}
5572
5573PyObject *
5574PyUnicode_AsUTF16String(PyObject *unicode)
5575{
5576    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5577}
5578
5579/* --- Unicode Escape Codec ----------------------------------------------- */
5580
5581/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5582   if all the escapes in the string make it still a valid ASCII string.
5583   Returns -1 if any escapes were found which cause the string to
5584   pop out of ASCII range.  Otherwise returns the length of the
5585   required buffer to hold the string.
5586   */
5587static Py_ssize_t
5588length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5589{
5590    const unsigned char *p = (const unsigned char *)s;
5591    const unsigned char *end = p + size;
5592    Py_ssize_t length = 0;
5593
5594    if (size < 0)
5595        return -1;
5596
5597    for (; p < end; ++p) {
5598        if (*p > 127) {
5599            /* Non-ASCII */
5600            return -1;
5601        }
5602        else if (*p != '\\') {
5603            /* Normal character */
5604            ++length;
5605        }
5606        else {
5607            /* Backslash-escape, check next char */
5608            ++p;
5609            /* Escape sequence reaches till end of string or
5610               non-ASCII follow-up. */
5611            if (p >= end || *p > 127)
5612                return -1;
5613            switch (*p) {
5614            case '\n':
5615                /* backslash + \n result in zero characters */
5616                break;
5617            case '\\': case '\'': case '\"':
5618            case 'b': case 'f': case 't':
5619            case 'n': case 'r': case 'v': case 'a':
5620                ++length;
5621                break;
5622            case '0': case '1': case '2': case '3':
5623            case '4': case '5': case '6': case '7':
5624            case 'x': case 'u': case 'U': case 'N':
5625                /* these do not guarantee ASCII characters */
5626                return -1;
5627            default:
5628                /* count the backslash + the other character */
5629                length += 2;
5630            }
5631        }
5632    }
5633    return length;
5634}
5635
5636static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5637
5638PyObject *
5639PyUnicode_DecodeUnicodeEscape(const char *s,
5640                              Py_ssize_t size,
5641                              const char *errors)
5642{
5643    const char *starts = s;
5644    Py_ssize_t startinpos;
5645    Py_ssize_t endinpos;
5646    int j;
5647    PyObject *v;
5648    const char *end;
5649    char* message;
5650    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5651    PyObject *errorHandler = NULL;
5652    PyObject *exc = NULL;
5653    Py_ssize_t len;
5654    Py_ssize_t i;
5655
5656    len = length_of_escaped_ascii_string(s, size);
5657
5658    /* After length_of_escaped_ascii_string() there are two alternatives,
5659       either the string is pure ASCII with named escapes like \n, etc.
5660       and we determined it's exact size (common case)
5661       or it contains \x, \u, ... escape sequences.  then we create a
5662       legacy wchar string and resize it at the end of this function. */
5663    if (len >= 0) {
5664        v = PyUnicode_New(len, 127);
5665        if (!v)
5666            goto onError;
5667        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5668    }
5669    else {
5670        /* Escaped strings will always be longer than the resulting
5671           Unicode string, so we start with size here and then reduce the
5672           length after conversion to the true value.
5673           (but if the error callback returns a long replacement string
5674           we'll have to allocate more space) */
5675        v = PyUnicode_New(size, 127);
5676        if (!v)
5677            goto onError;
5678        len = size;
5679    }
5680
5681    if (size == 0)
5682        return v;
5683    i = 0;
5684    end = s + size;
5685
5686    while (s < end) {
5687        unsigned char c;
5688        Py_UCS4 x;
5689        int digits;
5690
5691        /* The only case in which i == ascii_length is a backslash
5692           followed by a newline. */
5693        assert(i <= len);
5694
5695        /* Non-escape characters are interpreted as Unicode ordinals */
5696        if (*s != '\\') {
5697            if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5698                goto onError;
5699            continue;
5700        }
5701
5702        startinpos = s-starts;
5703        /* \ - Escapes */
5704        s++;
5705        c = *s++;
5706        if (s > end)
5707            c = '\0'; /* Invalid after \ */
5708
5709        /* The only case in which i == ascii_length is a backslash
5710           followed by a newline. */
5711        assert(i < len || (i == len && c == '\n'));
5712
5713        switch (c) {
5714
5715            /* \x escapes */
5716#define WRITECHAR(ch)                                   \
5717            do {                                        \
5718                if (unicode_putchar(&v, &i, ch) < 0)    \
5719                    goto onError;                       \
5720            }while(0)
5721
5722        case '\n': break;
5723        case '\\': WRITECHAR('\\'); break;
5724        case '\'': WRITECHAR('\''); break;
5725        case '\"': WRITECHAR('\"'); break;
5726        case 'b': WRITECHAR('\b'); break;
5727        /* FF */
5728        case 'f': WRITECHAR('\014'); break;
5729        case 't': WRITECHAR('\t'); break;
5730        case 'n': WRITECHAR('\n'); break;
5731        case 'r': WRITECHAR('\r'); break;
5732        /* VT */
5733        case 'v': WRITECHAR('\013'); break;
5734        /* BEL, not classic C */
5735        case 'a': WRITECHAR('\007'); break;
5736
5737            /* \OOO (octal) escapes */
5738        case '0': case '1': case '2': case '3':
5739        case '4': case '5': case '6': case '7':
5740            x = s[-1] - '0';
5741            if (s < end && '0' <= *s && *s <= '7') {
5742                x = (x<<3) + *s++ - '0';
5743                if (s < end && '0' <= *s && *s <= '7')
5744                    x = (x<<3) + *s++ - '0';
5745            }
5746            WRITECHAR(x);
5747            break;
5748
5749            /* hex escapes */
5750            /* \xXX */
5751        case 'x':
5752            digits = 2;
5753            message = "truncated \\xXX escape";
5754            goto hexescape;
5755
5756            /* \uXXXX */
5757        case 'u':
5758            digits = 4;
5759            message = "truncated \\uXXXX escape";
5760            goto hexescape;
5761
5762            /* \UXXXXXXXX */
5763        case 'U':
5764            digits = 8;
5765            message = "truncated \\UXXXXXXXX escape";
5766        hexescape:
5767            chr = 0;
5768            if (s+digits>end) {
5769                endinpos = size;
5770                if (unicode_decode_call_errorhandler(
5771                        errors, &errorHandler,
5772                        "unicodeescape", "end of string in escape sequence",
5773                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5774                        &v, &i))
5775                    goto onError;
5776                goto nextByte;
5777            }
5778            for (j = 0; j < digits; ++j) {
5779                c = (unsigned char) s[j];
5780                if (!Py_ISXDIGIT(c)) {
5781                    endinpos = (s+j+1)-starts;
5782                    if (unicode_decode_call_errorhandler(
5783                            errors, &errorHandler,
5784                            "unicodeescape", message,
5785                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5786                            &v, &i))
5787                        goto onError;
5788                    len = PyUnicode_GET_LENGTH(v);
5789                    goto nextByte;
5790                }
5791                chr = (chr<<4) & ~0xF;
5792                if (c >= '0' && c <= '9')
5793                    chr += c - '0';
5794                else if (c >= 'a' && c <= 'f')
5795                    chr += 10 + c - 'a';
5796                else
5797                    chr += 10 + c - 'A';
5798            }
5799            s += j;
5800            if (chr == 0xffffffff && PyErr_Occurred())
5801                /* _decoding_error will have already written into the
5802                   target buffer. */
5803                break;
5804        store:
5805            /* when we get here, chr is a 32-bit unicode character */
5806            if (chr <= MAX_UNICODE) {
5807                WRITECHAR(chr);
5808            } else {
5809                endinpos = s-starts;
5810                if (unicode_decode_call_errorhandler(
5811                        errors, &errorHandler,
5812                        "unicodeescape", "illegal Unicode character",
5813                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5814                        &v, &i))
5815                    goto onError;
5816            }
5817            break;
5818
5819            /* \N{name} */
5820        case 'N':
5821            message = "malformed \\N character escape";
5822            if (ucnhash_CAPI == NULL) {
5823                /* load the unicode data module */
5824                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5825                                                PyUnicodeData_CAPSULE_NAME, 1);
5826                if (ucnhash_CAPI == NULL)
5827                    goto ucnhashError;
5828            }
5829            if (*s == '{') {
5830                const char *start = s+1;
5831                /* look for the closing brace */
5832                while (*s != '}' && s < end)
5833                    s++;
5834                if (s > start && s < end && *s == '}') {
5835                    /* found a name.  look it up in the unicode database */
5836                    message = "unknown Unicode character name";
5837                    s++;
5838                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5839                                              &chr, 0))
5840                        goto store;
5841                }
5842            }
5843            endinpos = s-starts;
5844            if (unicode_decode_call_errorhandler(
5845                    errors, &errorHandler,
5846                    "unicodeescape", message,
5847                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5848                    &v, &i))
5849                goto onError;
5850            break;
5851
5852        default:
5853            if (s > end) {
5854                message = "\\ at end of string";
5855                s--;
5856                endinpos = s-starts;
5857                if (unicode_decode_call_errorhandler(
5858                        errors, &errorHandler,
5859                        "unicodeescape", message,
5860                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5861                        &v, &i))
5862                    goto onError;
5863            }
5864            else {
5865                WRITECHAR('\\');
5866                WRITECHAR(s[-1]);
5867            }
5868            break;
5869        }
5870      nextByte:
5871        ;
5872    }
5873#undef WRITECHAR
5874
5875    if (unicode_resize(&v, i) < 0)
5876        goto onError;
5877    Py_XDECREF(errorHandler);
5878    Py_XDECREF(exc);
5879    return unicode_result(v);
5880
5881  ucnhashError:
5882    PyErr_SetString(
5883        PyExc_UnicodeError,
5884        "\\N escapes not supported (can't load unicodedata module)"
5885        );
5886    Py_XDECREF(v);
5887    Py_XDECREF(errorHandler);
5888    Py_XDECREF(exc);
5889    return NULL;
5890
5891  onError:
5892    Py_XDECREF(v);
5893    Py_XDECREF(errorHandler);
5894    Py_XDECREF(exc);
5895    return NULL;
5896}
5897
5898/* Return a Unicode-Escape string version of the Unicode object.
5899
5900   If quotes is true, the string is enclosed in u"" or u'' quotes as
5901   appropriate.
5902
5903*/
5904
5905PyObject *
5906PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5907{
5908    Py_ssize_t i, len;
5909    PyObject *repr;
5910    char *p;
5911    int kind;
5912    void *data;
5913    Py_ssize_t expandsize = 0;
5914
5915    /* Initial allocation is based on the longest-possible unichr
5916       escape.
5917
5918       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5919       unichr, so in this case it's the longest unichr escape. In
5920       narrow (UTF-16) builds this is five chars per source unichr
5921       since there are two unichrs in the surrogate pair, so in narrow
5922       (UTF-16) builds it's not the longest unichr escape.
5923
5924       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5925       so in the narrow (UTF-16) build case it's the longest unichr
5926       escape.
5927    */
5928
5929    if (!PyUnicode_Check(unicode)) {
5930        PyErr_BadArgument();
5931        return NULL;
5932    }
5933    if (PyUnicode_READY(unicode) == -1)
5934        return NULL;
5935    len = PyUnicode_GET_LENGTH(unicode);
5936    kind = PyUnicode_KIND(unicode);
5937    data = PyUnicode_DATA(unicode);
5938    switch (kind) {
5939    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5940    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5941    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5942    }
5943
5944    if (len == 0)
5945        return PyBytes_FromStringAndSize(NULL, 0);
5946
5947    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5948        return PyErr_NoMemory();
5949
5950    repr = PyBytes_FromStringAndSize(NULL,
5951                                     2
5952                                     + expandsize*len
5953                                     + 1);
5954    if (repr == NULL)
5955        return NULL;
5956
5957    p = PyBytes_AS_STRING(repr);
5958
5959    for (i = 0; i < len; i++) {
5960        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5961
5962        /* Escape backslashes */
5963        if (ch == '\\') {
5964            *p++ = '\\';
5965            *p++ = (char) ch;
5966            continue;
5967        }
5968
5969        /* Map 21-bit characters to '\U00xxxxxx' */
5970        else if (ch >= 0x10000) {
5971            assert(ch <= MAX_UNICODE);
5972            *p++ = '\\';
5973            *p++ = 'U';
5974            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5975            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5976            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5977            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5978            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5979            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5980            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5981            *p++ = Py_hexdigits[ch & 0x0000000F];
5982            continue;
5983        }
5984
5985        /* Map 16-bit characters to '\uxxxx' */
5986        if (ch >= 256) {
5987            *p++ = '\\';
5988            *p++ = 'u';
5989            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5990            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5991            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5992            *p++ = Py_hexdigits[ch & 0x000F];
5993        }
5994
5995        /* Map special whitespace to '\t', \n', '\r' */
5996        else if (ch == '\t') {
5997            *p++ = '\\';
5998            *p++ = 't';
5999        }
6000        else if (ch == '\n') {
6001            *p++ = '\\';
6002            *p++ = 'n';
6003        }
6004        else if (ch == '\r') {
6005            *p++ = '\\';
6006            *p++ = 'r';
6007        }
6008
6009        /* Map non-printable US ASCII to '\xhh' */
6010        else if (ch < ' ' || ch >= 0x7F) {
6011            *p++ = '\\';
6012            *p++ = 'x';
6013            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6014            *p++ = Py_hexdigits[ch & 0x000F];
6015        }
6016
6017        /* Copy everything else as-is */
6018        else
6019            *p++ = (char) ch;
6020    }
6021
6022    assert(p - PyBytes_AS_STRING(repr) > 0);
6023    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6024        return NULL;
6025    return repr;
6026}
6027
6028PyObject *
6029PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6030                              Py_ssize_t size)
6031{
6032    PyObject *result;
6033    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6034    if (tmp == NULL)
6035        return NULL;
6036    result = PyUnicode_AsUnicodeEscapeString(tmp);
6037    Py_DECREF(tmp);
6038    return result;
6039}
6040
6041/* --- Raw Unicode Escape Codec ------------------------------------------- */
6042
6043PyObject *
6044PyUnicode_DecodeRawUnicodeEscape(const char *s,
6045                                 Py_ssize_t size,
6046                                 const char *errors)
6047{
6048    const char *starts = s;
6049    Py_ssize_t startinpos;
6050    Py_ssize_t endinpos;
6051    Py_ssize_t outpos;
6052    PyObject *v;
6053    const char *end;
6054    const char *bs;
6055    PyObject *errorHandler = NULL;
6056    PyObject *exc = NULL;
6057
6058    /* Escaped strings will always be longer than the resulting
6059       Unicode string, so we start with size here and then reduce the
6060       length after conversion to the true value. (But decoding error
6061       handler might have to resize the string) */
6062    v = PyUnicode_New(size, 127);
6063    if (v == NULL)
6064        goto onError;
6065    if (size == 0)
6066        return v;
6067    outpos = 0;
6068    end = s + size;
6069    while (s < end) {
6070        unsigned char c;
6071        Py_UCS4 x;
6072        int i;
6073        int count;
6074
6075        /* Non-escape characters are interpreted as Unicode ordinals */
6076        if (*s != '\\') {
6077            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6078                goto onError;
6079            continue;
6080        }
6081        startinpos = s-starts;
6082
6083        /* \u-escapes are only interpreted iff the number of leading
6084           backslashes if odd */
6085        bs = s;
6086        for (;s < end;) {
6087            if (*s != '\\')
6088                break;
6089            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6090                goto onError;
6091        }
6092        if (((s - bs) & 1) == 0 ||
6093            s >= end ||
6094            (*s != 'u' && *s != 'U')) {
6095            continue;
6096        }
6097        outpos--;
6098        count = *s=='u' ? 4 : 8;
6099        s++;
6100
6101        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6102        for (x = 0, i = 0; i < count; ++i, ++s) {
6103            c = (unsigned char)*s;
6104            if (!Py_ISXDIGIT(c)) {
6105                endinpos = s-starts;
6106                if (unicode_decode_call_errorhandler(
6107                        errors, &errorHandler,
6108                        "rawunicodeescape", "truncated \\uXXXX",
6109                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6110                        &v, &outpos))
6111                    goto onError;
6112                goto nextByte;
6113            }
6114            x = (x<<4) & ~0xF;
6115            if (c >= '0' && c <= '9')
6116                x += c - '0';
6117            else if (c >= 'a' && c <= 'f')
6118                x += 10 + c - 'a';
6119            else
6120                x += 10 + c - 'A';
6121        }
6122        if (x <= MAX_UNICODE) {
6123            if (unicode_putchar(&v, &outpos, x) < 0)
6124                goto onError;
6125        } else {
6126            endinpos = s-starts;
6127            if (unicode_decode_call_errorhandler(
6128                    errors, &errorHandler,
6129                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6130                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6131                    &v, &outpos))
6132                goto onError;
6133        }
6134      nextByte:
6135        ;
6136    }
6137    if (unicode_resize(&v, outpos) < 0)
6138        goto onError;
6139    Py_XDECREF(errorHandler);
6140    Py_XDECREF(exc);
6141    return unicode_result(v);
6142
6143  onError:
6144    Py_XDECREF(v);
6145    Py_XDECREF(errorHandler);
6146    Py_XDECREF(exc);
6147    return NULL;
6148}
6149
6150
6151PyObject *
6152PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6153{
6154    PyObject *repr;
6155    char *p;
6156    char *q;
6157    Py_ssize_t expandsize, pos;
6158    int kind;
6159    void *data;
6160    Py_ssize_t len;
6161
6162    if (!PyUnicode_Check(unicode)) {
6163        PyErr_BadArgument();
6164        return NULL;
6165    }
6166    if (PyUnicode_READY(unicode) == -1)
6167        return NULL;
6168    kind = PyUnicode_KIND(unicode);
6169    data = PyUnicode_DATA(unicode);
6170    len = PyUnicode_GET_LENGTH(unicode);
6171    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6172       bytes, and 1 byte characters 4. */
6173    expandsize = kind * 2 + 2;
6174
6175    if (len > PY_SSIZE_T_MAX / expandsize)
6176        return PyErr_NoMemory();
6177
6178    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6179    if (repr == NULL)
6180        return NULL;
6181    if (len == 0)
6182        return repr;
6183
6184    p = q = PyBytes_AS_STRING(repr);
6185    for (pos = 0; pos < len; pos++) {
6186        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6187        /* Map 32-bit characters to '\Uxxxxxxxx' */
6188        if (ch >= 0x10000) {
6189            assert(ch <= MAX_UNICODE);
6190            *p++ = '\\';
6191            *p++ = 'U';
6192            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6193            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6194            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6195            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6196            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6197            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6198            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6199            *p++ = Py_hexdigits[ch & 15];
6200        }
6201        /* Map 16-bit characters to '\uxxxx' */
6202        else if (ch >= 256) {
6203            *p++ = '\\';
6204            *p++ = 'u';
6205            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6206            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6207            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6208            *p++ = Py_hexdigits[ch & 15];
6209        }
6210        /* Copy everything else as-is */
6211        else
6212            *p++ = (char) ch;
6213    }
6214
6215    assert(p > q);
6216    if (_PyBytes_Resize(&repr, p - q) < 0)
6217        return NULL;
6218    return repr;
6219}
6220
6221PyObject *
6222PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6223                                 Py_ssize_t size)
6224{
6225    PyObject *result;
6226    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6227    if (tmp == NULL)
6228        return NULL;
6229    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6230    Py_DECREF(tmp);
6231    return result;
6232}
6233
6234/* --- Unicode Internal Codec ------------------------------------------- */
6235
6236PyObject *
6237_PyUnicode_DecodeUnicodeInternal(const char *s,
6238                                 Py_ssize_t size,
6239                                 const char *errors)
6240{
6241    const char *starts = s;
6242    Py_ssize_t startinpos;
6243    Py_ssize_t endinpos;
6244    Py_ssize_t outpos;
6245    PyObject *v;
6246    const char *end;
6247    const char *reason;
6248    PyObject *errorHandler = NULL;
6249    PyObject *exc = NULL;
6250
6251    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6252                     "unicode_internal codec has been deprecated",
6253                     1))
6254        return NULL;
6255
6256    /* XXX overflow detection missing */
6257    v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
6258    if (v == NULL)
6259        goto onError;
6260    if (PyUnicode_GET_LENGTH(v) == 0)
6261        return v;
6262    outpos = 0;
6263    end = s + size;
6264
6265    while (s < end) {
6266        Py_UNICODE uch;
6267        Py_UCS4 ch;
6268        /* We copy the raw representation one byte at a time because the
6269           pointer may be unaligned (see test_codeccallbacks). */
6270        ((char *) &uch)[0] = s[0];
6271        ((char *) &uch)[1] = s[1];
6272#ifdef Py_UNICODE_WIDE
6273        ((char *) &uch)[2] = s[2];
6274        ((char *) &uch)[3] = s[3];
6275#endif
6276        ch = uch;
6277
6278        /* We have to sanity check the raw data, otherwise doom looms for
6279           some malformed UCS-4 data. */
6280        if (
6281#ifdef Py_UNICODE_WIDE
6282            ch > 0x10ffff ||
6283#endif
6284            end-s < Py_UNICODE_SIZE
6285            )
6286        {
6287            startinpos = s - starts;
6288            if (end-s < Py_UNICODE_SIZE) {
6289                endinpos = end-starts;
6290                reason = "truncated input";
6291            }
6292            else {
6293                endinpos = s - starts + Py_UNICODE_SIZE;
6294                reason = "illegal code point (> 0x10FFFF)";
6295            }
6296            if (unicode_decode_call_errorhandler(
6297                    errors, &errorHandler,
6298                    "unicode_internal", reason,
6299                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6300                    &v, &outpos))
6301                goto onError;
6302            continue;
6303        }
6304
6305        s += Py_UNICODE_SIZE;
6306#ifndef Py_UNICODE_WIDE
6307        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
6308        {
6309            Py_UNICODE uch2;
6310            ((char *) &uch2)[0] = s[0];
6311            ((char *) &uch2)[1] = s[1];
6312            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6313            {
6314                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6315                s += Py_UNICODE_SIZE;
6316            }
6317        }
6318#endif
6319
6320        if (unicode_putchar(&v, &outpos, ch) < 0)
6321            goto onError;
6322    }
6323
6324    if (unicode_resize(&v, outpos) < 0)
6325        goto onError;
6326    Py_XDECREF(errorHandler);
6327    Py_XDECREF(exc);
6328    return unicode_result(v);
6329
6330  onError:
6331    Py_XDECREF(v);
6332    Py_XDECREF(errorHandler);
6333    Py_XDECREF(exc);
6334    return NULL;
6335}
6336
6337/* --- Latin-1 Codec ------------------------------------------------------ */
6338
6339PyObject *
6340PyUnicode_DecodeLatin1(const char *s,
6341                       Py_ssize_t size,
6342                       const char *errors)
6343{
6344    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6345    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6346}
6347
6348/* create or adjust a UnicodeEncodeError */
6349static void
6350make_encode_exception(PyObject **exceptionObject,
6351                      const char *encoding,
6352                      PyObject *unicode,
6353                      Py_ssize_t startpos, Py_ssize_t endpos,
6354                      const char *reason)
6355{
6356    if (*exceptionObject == NULL) {
6357        *exceptionObject = PyObject_CallFunction(
6358            PyExc_UnicodeEncodeError, "sOnns",
6359            encoding, unicode, startpos, endpos, reason);
6360    }
6361    else {
6362        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6363            goto onError;
6364        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6365            goto onError;
6366        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6367            goto onError;
6368        return;
6369      onError:
6370        Py_DECREF(*exceptionObject);
6371        *exceptionObject = NULL;
6372    }
6373}
6374
6375/* raises a UnicodeEncodeError */
6376static void
6377raise_encode_exception(PyObject **exceptionObject,
6378                       const char *encoding,
6379                       PyObject *unicode,
6380                       Py_ssize_t startpos, Py_ssize_t endpos,
6381                       const char *reason)
6382{
6383    make_encode_exception(exceptionObject,
6384                          encoding, unicode, startpos, endpos, reason);
6385    if (*exceptionObject != NULL)
6386        PyCodec_StrictErrors(*exceptionObject);
6387}
6388
6389/* error handling callback helper:
6390   build arguments, call the callback and check the arguments,
6391   put the result into newpos and return the replacement string, which
6392   has to be freed by the caller */
6393static PyObject *
6394unicode_encode_call_errorhandler(const char *errors,
6395                                 PyObject **errorHandler,
6396                                 const char *encoding, const char *reason,
6397                                 PyObject *unicode, PyObject **exceptionObject,
6398                                 Py_ssize_t startpos, Py_ssize_t endpos,
6399                                 Py_ssize_t *newpos)
6400{
6401    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6402    Py_ssize_t len;
6403    PyObject *restuple;
6404    PyObject *resunicode;
6405
6406    if (*errorHandler == NULL) {
6407        *errorHandler = PyCodec_LookupError(errors);
6408        if (*errorHandler == NULL)
6409            return NULL;
6410    }
6411
6412    if (PyUnicode_READY(unicode) == -1)
6413        return NULL;
6414    len = PyUnicode_GET_LENGTH(unicode);
6415
6416    make_encode_exception(exceptionObject,
6417                          encoding, unicode, startpos, endpos, reason);
6418    if (*exceptionObject == NULL)
6419        return NULL;
6420
6421    restuple = PyObject_CallFunctionObjArgs(
6422        *errorHandler, *exceptionObject, NULL);
6423    if (restuple == NULL)
6424        return NULL;
6425    if (!PyTuple_Check(restuple)) {
6426        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6427        Py_DECREF(restuple);
6428        return NULL;
6429    }
6430    if (!PyArg_ParseTuple(restuple, argparse,
6431                          &resunicode, newpos)) {
6432        Py_DECREF(restuple);
6433        return NULL;
6434    }
6435    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6436        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6437        Py_DECREF(restuple);
6438        return NULL;
6439    }
6440    if (*newpos<0)
6441        *newpos = len + *newpos;
6442    if (*newpos<0 || *newpos>len) {
6443        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6444        Py_DECREF(restuple);
6445        return NULL;
6446    }
6447    Py_INCREF(resunicode);
6448    Py_DECREF(restuple);
6449    return resunicode;
6450}
6451
6452static PyObject *
6453unicode_encode_ucs1(PyObject *unicode,
6454                    const char *errors,
6455                    unsigned int limit)
6456{
6457    /* input state */
6458    Py_ssize_t pos=0, size;
6459    int kind;
6460    void *data;
6461    /* output object */
6462    PyObject *res;
6463    /* pointer into the output */
6464    char *str;
6465    /* current output position */
6466    Py_ssize_t ressize;
6467    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6468    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6469    PyObject *errorHandler = NULL;
6470    PyObject *exc = NULL;
6471    /* the following variable is used for caching string comparisons
6472     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6473    int known_errorHandler = -1;
6474
6475    if (PyUnicode_READY(unicode) == -1)
6476        return NULL;
6477    size = PyUnicode_GET_LENGTH(unicode);
6478    kind = PyUnicode_KIND(unicode);
6479    data = PyUnicode_DATA(unicode);
6480    /* allocate enough for a simple encoding without
6481       replacements, if we need more, we'll resize */
6482    if (size == 0)
6483        return PyBytes_FromStringAndSize(NULL, 0);
6484    res = PyBytes_FromStringAndSize(NULL, size);
6485    if (res == NULL)
6486        return NULL;
6487    str = PyBytes_AS_STRING(res);
6488    ressize = size;
6489
6490    while (pos < size) {
6491        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6492
6493        /* can we encode this? */
6494        if (c<limit) {
6495            /* no overflow check, because we know that the space is enough */
6496            *str++ = (char)c;
6497            ++pos;
6498        }
6499        else {
6500            Py_ssize_t requiredsize;
6501            PyObject *repunicode;
6502            Py_ssize_t repsize, newpos, respos, i;
6503            /* startpos for collecting unencodable chars */
6504            Py_ssize_t collstart = pos;
6505            Py_ssize_t collend = pos;
6506            /* find all unecodable characters */
6507            while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6508                ++collend;
6509            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6510            if (known_errorHandler==-1) {
6511                if ((errors==NULL) || (!strcmp(errors, "strict")))
6512                    known_errorHandler = 1;
6513                else if (!strcmp(errors, "replace"))
6514                    known_errorHandler = 2;
6515                else if (!strcmp(errors, "ignore"))
6516                    known_errorHandler = 3;
6517                else if (!strcmp(errors, "xmlcharrefreplace"))
6518                    known_errorHandler = 4;
6519                else
6520                    known_errorHandler = 0;
6521            }
6522            switch (known_errorHandler) {
6523            case 1: /* strict */
6524                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6525                goto onError;
6526            case 2: /* replace */
6527                while (collstart++<collend)
6528                    *str++ = '?'; /* fall through */
6529            case 3: /* ignore */
6530                pos = collend;
6531                break;
6532            case 4: /* xmlcharrefreplace */
6533                respos = str - PyBytes_AS_STRING(res);
6534                /* determine replacement size */
6535                for (i = collstart, repsize = 0; i < collend; ++i) {
6536                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6537                    if (ch < 10)
6538                        repsize += 2+1+1;
6539                    else if (ch < 100)
6540                        repsize += 2+2+1;
6541                    else if (ch < 1000)
6542                        repsize += 2+3+1;
6543                    else if (ch < 10000)
6544                        repsize += 2+4+1;
6545                    else if (ch < 100000)
6546                        repsize += 2+5+1;
6547                    else if (ch < 1000000)
6548                        repsize += 2+6+1;
6549                    else {
6550                        assert(ch <= MAX_UNICODE);
6551                        repsize += 2+7+1;
6552                    }
6553                }
6554                requiredsize = respos+repsize+(size-collend);
6555                if (requiredsize > ressize) {
6556                    if (requiredsize<2*ressize)
6557                        requiredsize = 2*ressize;
6558                    if (_PyBytes_Resize(&res, requiredsize))
6559                        goto onError;
6560                    str = PyBytes_AS_STRING(res) + respos;
6561                    ressize = requiredsize;
6562                }
6563                /* generate replacement */
6564                for (i = collstart; i < collend; ++i) {
6565                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6566                }
6567                pos = collend;
6568                break;
6569            default:
6570                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6571                                                              encoding, reason, unicode, &exc,
6572                                                              collstart, collend, &newpos);
6573                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6574                                           PyUnicode_READY(repunicode) == -1))
6575                    goto onError;
6576                if (PyBytes_Check(repunicode)) {
6577                    /* Directly copy bytes result to output. */
6578                    repsize = PyBytes_Size(repunicode);
6579                    if (repsize > 1) {
6580                        /* Make room for all additional bytes. */
6581                        respos = str - PyBytes_AS_STRING(res);
6582                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6583                            Py_DECREF(repunicode);
6584                            goto onError;
6585                        }
6586                        str = PyBytes_AS_STRING(res) + respos;
6587                        ressize += repsize-1;
6588                    }
6589                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6590                    str += repsize;
6591                    pos = newpos;
6592                    Py_DECREF(repunicode);
6593                    break;
6594                }
6595                /* need more space? (at least enough for what we
6596                   have+the replacement+the rest of the string, so
6597                   we won't have to check space for encodable characters) */
6598                respos = str - PyBytes_AS_STRING(res);
6599                repsize = PyUnicode_GET_LENGTH(repunicode);
6600                requiredsize = respos+repsize+(size-collend);
6601                if (requiredsize > ressize) {
6602                    if (requiredsize<2*ressize)
6603                        requiredsize = 2*ressize;
6604                    if (_PyBytes_Resize(&res, requiredsize)) {
6605                        Py_DECREF(repunicode);
6606                        goto onError;
6607                    }
6608                    str = PyBytes_AS_STRING(res) + respos;
6609                    ressize = requiredsize;
6610                }
6611                /* check if there is anything unencodable in the replacement
6612                   and copy it to the output */
6613                for (i = 0; repsize-->0; ++i, ++str) {
6614                    c = PyUnicode_READ_CHAR(repunicode, i);
6615                    if (c >= limit) {
6616                        raise_encode_exception(&exc, encoding, unicode,
6617                                               pos, pos+1, reason);
6618                        Py_DECREF(repunicode);
6619                        goto onError;
6620                    }
6621                    *str = (char)c;
6622                }
6623                pos = newpos;
6624                Py_DECREF(repunicode);
6625            }
6626        }
6627    }
6628    /* Resize if we allocated to much */
6629    size = str - PyBytes_AS_STRING(res);
6630    if (size < ressize) { /* If this falls res will be NULL */
6631        assert(size >= 0);
6632        if (_PyBytes_Resize(&res, size) < 0)
6633            goto onError;
6634    }
6635
6636    Py_XDECREF(errorHandler);
6637    Py_XDECREF(exc);
6638    return res;
6639
6640  onError:
6641    Py_XDECREF(res);
6642    Py_XDECREF(errorHandler);
6643    Py_XDECREF(exc);
6644    return NULL;
6645}
6646
6647/* Deprecated */
6648PyObject *
6649PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6650                       Py_ssize_t size,
6651                       const char *errors)
6652{
6653    PyObject *result;
6654    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6655    if (unicode == NULL)
6656        return NULL;
6657    result = unicode_encode_ucs1(unicode, errors, 256);
6658    Py_DECREF(unicode);
6659    return result;
6660}
6661
6662PyObject *
6663_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6664{
6665    if (!PyUnicode_Check(unicode)) {
6666        PyErr_BadArgument();
6667        return NULL;
6668    }
6669    if (PyUnicode_READY(unicode) == -1)
6670        return NULL;
6671    /* Fast path: if it is a one-byte string, construct
6672       bytes object directly. */
6673    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6674        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6675                                         PyUnicode_GET_LENGTH(unicode));
6676    /* Non-Latin-1 characters present. Defer to above function to
6677       raise the exception. */
6678    return unicode_encode_ucs1(unicode, errors, 256);
6679}
6680
6681PyObject*
6682PyUnicode_AsLatin1String(PyObject *unicode)
6683{
6684    return _PyUnicode_AsLatin1String(unicode, NULL);
6685}
6686
6687/* --- 7-bit ASCII Codec -------------------------------------------------- */
6688
6689PyObject *
6690PyUnicode_DecodeASCII(const char *s,
6691                      Py_ssize_t size,
6692                      const char *errors)
6693{
6694    const char *starts = s;
6695    PyObject *unicode;
6696    int kind;
6697    void *data;
6698    Py_ssize_t startinpos;
6699    Py_ssize_t endinpos;
6700    Py_ssize_t outpos;
6701    const char *e;
6702    PyObject *errorHandler = NULL;
6703    PyObject *exc = NULL;
6704
6705    if (size == 0) {
6706        Py_INCREF(unicode_empty);
6707        return unicode_empty;
6708    }
6709
6710    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6711    if (size == 1 && (unsigned char)s[0] < 128)
6712        return get_latin1_char((unsigned char)s[0]);
6713
6714    unicode = PyUnicode_New(size, 127);
6715    if (unicode == NULL)
6716        goto onError;
6717
6718    e = s + size;
6719    data = PyUnicode_1BYTE_DATA(unicode);
6720    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6721    if (outpos == size)
6722        return unicode;
6723
6724    s += outpos;
6725    kind = PyUnicode_1BYTE_KIND;
6726    while (s < e) {
6727        register unsigned char c = (unsigned char)*s;
6728        if (c < 128) {
6729            PyUnicode_WRITE(kind, data, outpos++, c);
6730            ++s;
6731        }
6732        else {
6733            startinpos = s-starts;
6734            endinpos = startinpos + 1;
6735            if (unicode_decode_call_errorhandler(
6736                    errors, &errorHandler,
6737                    "ascii", "ordinal not in range(128)",
6738                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6739                    &unicode, &outpos))
6740                goto onError;
6741            kind = PyUnicode_KIND(unicode);
6742            data = PyUnicode_DATA(unicode);
6743        }
6744    }
6745    if (unicode_resize(&unicode, outpos) < 0)
6746        goto onError;
6747    Py_XDECREF(errorHandler);
6748    Py_XDECREF(exc);
6749    assert(_PyUnicode_CheckConsistency(unicode, 1));
6750    return unicode;
6751
6752  onError:
6753    Py_XDECREF(unicode);
6754    Py_XDECREF(errorHandler);
6755    Py_XDECREF(exc);
6756    return NULL;
6757}
6758
6759/* Deprecated */
6760PyObject *
6761PyUnicode_EncodeASCII(const Py_UNICODE *p,
6762                      Py_ssize_t size,
6763                      const char *errors)
6764{
6765    PyObject *result;
6766    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6767    if (unicode == NULL)
6768        return NULL;
6769    result = unicode_encode_ucs1(unicode, errors, 128);
6770    Py_DECREF(unicode);
6771    return result;
6772}
6773
6774PyObject *
6775_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6776{
6777    if (!PyUnicode_Check(unicode)) {
6778        PyErr_BadArgument();
6779        return NULL;
6780    }
6781    if (PyUnicode_READY(unicode) == -1)
6782        return NULL;
6783    /* Fast path: if it is an ASCII-only string, construct bytes object
6784       directly. Else defer to above function to raise the exception. */
6785    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6786        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6787                                         PyUnicode_GET_LENGTH(unicode));
6788    return unicode_encode_ucs1(unicode, errors, 128);
6789}
6790
6791PyObject *
6792PyUnicode_AsASCIIString(PyObject *unicode)
6793{
6794    return _PyUnicode_AsASCIIString(unicode, NULL);
6795}
6796
6797#ifdef HAVE_MBCS
6798
6799/* --- MBCS codecs for Windows -------------------------------------------- */
6800
6801#if SIZEOF_INT < SIZEOF_SIZE_T
6802#define NEED_RETRY
6803#endif
6804
6805#ifndef WC_ERR_INVALID_CHARS
6806#  define WC_ERR_INVALID_CHARS 0x0080
6807#endif
6808
6809static char*
6810code_page_name(UINT code_page, PyObject **obj)
6811{
6812    *obj = NULL;
6813    if (code_page == CP_ACP)
6814        return "mbcs";
6815    if (code_page == CP_UTF7)
6816        return "CP_UTF7";
6817    if (code_page == CP_UTF8)
6818        return "CP_UTF8";
6819
6820    *obj = PyBytes_FromFormat("cp%u", code_page);
6821    if (*obj == NULL)
6822        return NULL;
6823    return PyBytes_AS_STRING(*obj);
6824}
6825
6826static int
6827is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6828{
6829    const char *curr = s + offset;
6830    const char *prev;
6831
6832    if (!IsDBCSLeadByteEx(code_page, *curr))
6833        return 0;
6834
6835    prev = CharPrevExA(code_page, s, curr, 0);
6836    if (prev == curr)
6837        return 1;
6838    /* FIXME: This code is limited to "true" double-byte encodings,
6839       as it assumes an incomplete character consists of a single
6840       byte. */
6841    if (curr - prev == 2)
6842        return 1;
6843    if (!IsDBCSLeadByteEx(code_page, *prev))
6844        return 1;
6845    return 0;
6846}
6847
6848static DWORD
6849decode_code_page_flags(UINT code_page)
6850{
6851    if (code_page == CP_UTF7) {
6852        /* The CP_UTF7 decoder only supports flags=0 */
6853        return 0;
6854    }
6855    else
6856        return MB_ERR_INVALID_CHARS;
6857}
6858
6859/*
6860 * Decode a byte string from a Windows code page into unicode object in strict
6861 * mode.
6862 *
6863 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6864 * WindowsError and returns -1 on other error.
6865 */
6866static int
6867decode_code_page_strict(UINT code_page,
6868                        PyObject **v,
6869                        const char *in,
6870                        int insize)
6871{
6872    const DWORD flags = decode_code_page_flags(code_page);
6873    wchar_t *out;
6874    DWORD outsize;
6875
6876    /* First get the size of the result */
6877    assert(insize > 0);
6878    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6879    if (outsize <= 0)
6880        goto error;
6881
6882    if (*v == NULL) {
6883        /* Create unicode object */
6884        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6885        *v = (PyObject*)_PyUnicode_New(outsize);
6886        if (*v == NULL)
6887            return -1;
6888        out = PyUnicode_AS_UNICODE(*v);
6889    }
6890    else {
6891        /* Extend unicode object */
6892        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6893        if (unicode_resize(v, n + outsize) < 0)
6894            return -1;
6895        out = PyUnicode_AS_UNICODE(*v) + n;
6896    }
6897
6898    /* Do the conversion */
6899    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6900    if (outsize <= 0)
6901        goto error;
6902    return insize;
6903
6904error:
6905    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6906        return -2;
6907    PyErr_SetFromWindowsErr(0);
6908    return -1;
6909}
6910
6911/*
6912 * Decode a byte string from a code page into unicode object with an error
6913 * handler.
6914 *
6915 * Returns consumed size if succeed, or raise a WindowsError or
6916 * UnicodeDecodeError exception and returns -1 on error.
6917 */
6918static int
6919decode_code_page_errors(UINT code_page,
6920                        PyObject **v,
6921                        const char *in, const int size,
6922                        const char *errors)
6923{
6924    const char *startin = in;
6925    const char *endin = in + size;
6926    const DWORD flags = decode_code_page_flags(code_page);
6927    /* Ideally, we should get reason from FormatMessage. This is the Windows
6928       2000 English version of the message. */
6929    const char *reason = "No mapping for the Unicode character exists "
6930                         "in the target code page.";
6931    /* each step cannot decode more than 1 character, but a character can be
6932       represented as a surrogate pair */
6933    wchar_t buffer[2], *startout, *out;
6934    int insize, outsize;
6935    PyObject *errorHandler = NULL;
6936    PyObject *exc = NULL;
6937    PyObject *encoding_obj = NULL;
6938    char *encoding;
6939    DWORD err;
6940    int ret = -1;
6941
6942    assert(size > 0);
6943
6944    encoding = code_page_name(code_page, &encoding_obj);
6945    if (encoding == NULL)
6946        return -1;
6947
6948    if (errors == NULL || strcmp(errors, "strict") == 0) {
6949        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6950           UnicodeDecodeError. */
6951        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6952        if (exc != NULL) {
6953            PyCodec_StrictErrors(exc);
6954            Py_CLEAR(exc);
6955        }
6956        goto error;
6957    }
6958
6959    if (*v == NULL) {
6960        /* Create unicode object */
6961        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6962            PyErr_NoMemory();
6963            goto error;
6964        }
6965        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6966        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6967        if (*v == NULL)
6968            goto error;
6969        startout = PyUnicode_AS_UNICODE(*v);
6970    }
6971    else {
6972        /* Extend unicode object */
6973        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6974        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6975            PyErr_NoMemory();
6976            goto error;
6977        }
6978        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6979            goto error;
6980        startout = PyUnicode_AS_UNICODE(*v) + n;
6981    }
6982
6983    /* Decode the byte string character per character */
6984    out = startout;
6985    while (in < endin)
6986    {
6987        /* Decode a character */
6988        insize = 1;
6989        do
6990        {
6991            outsize = MultiByteToWideChar(code_page, flags,
6992                                          in, insize,
6993                                          buffer, Py_ARRAY_LENGTH(buffer));
6994            if (outsize > 0)
6995                break;
6996            err = GetLastError();
6997            if (err != ERROR_NO_UNICODE_TRANSLATION
6998                && err != ERROR_INSUFFICIENT_BUFFER)
6999            {
7000                PyErr_SetFromWindowsErr(0);
7001                goto error;
7002            }
7003            insize++;
7004        }
7005        /* 4=maximum length of a UTF-8 sequence */
7006        while (insize <= 4 && (in + insize) <= endin);
7007
7008        if (outsize <= 0) {
7009            Py_ssize_t startinpos, endinpos, outpos;
7010
7011            startinpos = in - startin;
7012            endinpos = startinpos + 1;
7013            outpos = out - PyUnicode_AS_UNICODE(*v);
7014            if (unicode_decode_call_errorhandler(
7015                    errors, &errorHandler,
7016                    encoding, reason,
7017                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7018                    v, &outpos))
7019            {
7020                goto error;
7021            }
7022            out = PyUnicode_AS_UNICODE(*v) + outpos;
7023        }
7024        else {
7025            in += insize;
7026            memcpy(out, buffer, outsize * sizeof(wchar_t));
7027            out += outsize;
7028        }
7029    }
7030
7031    /* write a NUL character at the end */
7032    *out = 0;
7033
7034    /* Extend unicode object */
7035    outsize = out - startout;
7036    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7037    if (unicode_resize(v, outsize) < 0)
7038        goto error;
7039    ret = size;
7040
7041error:
7042    Py_XDECREF(encoding_obj);
7043    Py_XDECREF(errorHandler);
7044    Py_XDECREF(exc);
7045    return ret;
7046}
7047
7048static PyObject *
7049decode_code_page_stateful(int code_page,
7050                          const char *s, Py_ssize_t size,
7051                          const char *errors, Py_ssize_t *consumed)
7052{
7053    PyObject *v = NULL;
7054    int chunk_size, final, converted, done;
7055
7056    if (code_page < 0) {
7057        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7058        return NULL;
7059    }
7060
7061    if (consumed)
7062        *consumed = 0;
7063
7064    do
7065    {
7066#ifdef NEED_RETRY
7067        if (size > INT_MAX) {
7068            chunk_size = INT_MAX;
7069            final = 0;
7070            done = 0;
7071        }
7072        else
7073#endif
7074        {
7075            chunk_size = (int)size;
7076            final = (consumed == NULL);
7077            done = 1;
7078        }
7079
7080        /* Skip trailing lead-byte unless 'final' is set */
7081        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7082            --chunk_size;
7083
7084        if (chunk_size == 0 && done) {
7085            if (v != NULL)
7086                break;
7087            Py_INCREF(unicode_empty);
7088            return unicode_empty;
7089        }
7090
7091
7092        converted = decode_code_page_strict(code_page, &v,
7093                                            s, chunk_size);
7094        if (converted == -2)
7095            converted = decode_code_page_errors(code_page, &v,
7096                                                s, chunk_size,
7097                                                errors);
7098        assert(converted != 0);
7099
7100        if (converted < 0) {
7101            Py_XDECREF(v);
7102            return NULL;
7103        }
7104
7105        if (consumed)
7106            *consumed += converted;
7107
7108        s += converted;
7109        size -= converted;
7110    } while (!done);
7111
7112    return unicode_result(v);
7113}
7114
7115PyObject *
7116PyUnicode_DecodeCodePageStateful(int code_page,
7117                                 const char *s,
7118                                 Py_ssize_t size,
7119                                 const char *errors,
7120                                 Py_ssize_t *consumed)
7121{
7122    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7123}
7124
7125PyObject *
7126PyUnicode_DecodeMBCSStateful(const char *s,
7127                             Py_ssize_t size,
7128                             const char *errors,
7129                             Py_ssize_t *consumed)
7130{
7131    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7132}
7133
7134PyObject *
7135PyUnicode_DecodeMBCS(const char *s,
7136                     Py_ssize_t size,
7137                     const char *errors)
7138{
7139    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7140}
7141
7142static DWORD
7143encode_code_page_flags(UINT code_page, const char *errors)
7144{
7145    if (code_page == CP_UTF8) {
7146        if (winver.dwMajorVersion >= 6)
7147            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7148               and later */
7149            return WC_ERR_INVALID_CHARS;
7150        else
7151            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7152            return 0;
7153    }
7154    else if (code_page == CP_UTF7) {
7155        /* CP_UTF7 only supports flags=0 */
7156        return 0;
7157    }
7158    else {
7159        if (errors != NULL && strcmp(errors, "replace") == 0)
7160            return 0;
7161        else
7162            return WC_NO_BEST_FIT_CHARS;
7163    }
7164}
7165
7166/*
7167 * Encode a Unicode string to a Windows code page into a byte string in strict
7168 * mode.
7169 *
7170 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7171 * a WindowsError and returns -1 on other error.
7172 */
7173static int
7174encode_code_page_strict(UINT code_page, PyObject **outbytes,
7175                        PyObject *unicode, Py_ssize_t offset, int len,
7176                        const char* errors)
7177{
7178    BOOL usedDefaultChar = FALSE;
7179    BOOL *pusedDefaultChar = &usedDefaultChar;
7180    int outsize;
7181    PyObject *exc = NULL;
7182    wchar_t *p;
7183    Py_ssize_t size;
7184    const DWORD flags = encode_code_page_flags(code_page, NULL);
7185    char *out;
7186    /* Create a substring so that we can get the UTF-16 representation
7187       of just the slice under consideration. */
7188    PyObject *substring;
7189
7190    assert(len > 0);
7191
7192    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7193        pusedDefaultChar = &usedDefaultChar;
7194    else
7195        pusedDefaultChar = NULL;
7196
7197    substring = PyUnicode_Substring(unicode, offset, offset+len);
7198    if (substring == NULL)
7199        return -1;
7200    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7201    if (p == NULL) {
7202        Py_DECREF(substring);
7203        return -1;
7204    }
7205
7206    /* First get the size of the result */
7207    outsize = WideCharToMultiByte(code_page, flags,
7208                                  p, size,
7209                                  NULL, 0,
7210                                  NULL, pusedDefaultChar);
7211    if (outsize <= 0)
7212        goto error;
7213    /* If we used a default char, then we failed! */
7214    if (pusedDefaultChar && *pusedDefaultChar) {
7215        Py_DECREF(substring);
7216        return -2;
7217    }
7218
7219    if (*outbytes == NULL) {
7220        /* Create string object */
7221        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7222        if (*outbytes == NULL) {
7223            Py_DECREF(substring);
7224            return -1;
7225        }
7226        out = PyBytes_AS_STRING(*outbytes);
7227    }
7228    else {
7229        /* Extend string object */
7230        const Py_ssize_t n = PyBytes_Size(*outbytes);
7231        if (outsize > PY_SSIZE_T_MAX - n) {
7232            PyErr_NoMemory();
7233            Py_DECREF(substring);
7234            return -1;
7235        }
7236        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7237            Py_DECREF(substring);
7238            return -1;
7239        }
7240        out = PyBytes_AS_STRING(*outbytes) + n;
7241    }
7242
7243    /* Do the conversion */
7244    outsize = WideCharToMultiByte(code_page, flags,
7245                                  p, size,
7246                                  out, outsize,
7247                                  NULL, pusedDefaultChar);
7248    Py_CLEAR(substring);
7249    if (outsize <= 0)
7250        goto error;
7251    if (pusedDefaultChar && *pusedDefaultChar)
7252        return -2;
7253    return 0;
7254
7255error:
7256    Py_XDECREF(substring);
7257    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7258        return -2;
7259    PyErr_SetFromWindowsErr(0);
7260    return -1;
7261}
7262
7263/*
7264 * Encode a Unicode string to a Windows code page into a byte string using a
7265 * error handler.
7266 *
7267 * Returns consumed characters if succeed, or raise a WindowsError and returns
7268 * -1 on other error.
7269 */
7270static int
7271encode_code_page_errors(UINT code_page, PyObject **outbytes,
7272                        PyObject *unicode, Py_ssize_t unicode_offset,
7273                        Py_ssize_t insize, const char* errors)
7274{
7275    const DWORD flags = encode_code_page_flags(code_page, errors);
7276    Py_ssize_t pos = unicode_offset;
7277    Py_ssize_t endin = unicode_offset + insize;
7278    /* Ideally, we should get reason from FormatMessage. This is the Windows
7279       2000 English version of the message. */
7280    const char *reason = "invalid character";
7281    /* 4=maximum length of a UTF-8 sequence */
7282    char buffer[4];
7283    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7284    Py_ssize_t outsize;
7285    char *out;
7286    PyObject *errorHandler = NULL;
7287    PyObject *exc = NULL;
7288    PyObject *encoding_obj = NULL;
7289    char *encoding;
7290    Py_ssize_t newpos, newoutsize;
7291    PyObject *rep;
7292    int ret = -1;
7293
7294    assert(insize > 0);
7295
7296    encoding = code_page_name(code_page, &encoding_obj);
7297    if (encoding == NULL)
7298        return -1;
7299
7300    if (errors == NULL || strcmp(errors, "strict") == 0) {
7301        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7302           then we raise a UnicodeEncodeError. */
7303        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7304        if (exc != NULL) {
7305            PyCodec_StrictErrors(exc);
7306            Py_DECREF(exc);
7307        }
7308        Py_XDECREF(encoding_obj);
7309        return -1;
7310    }
7311
7312    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7313        pusedDefaultChar = &usedDefaultChar;
7314    else
7315        pusedDefaultChar = NULL;
7316
7317    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7318        PyErr_NoMemory();
7319        goto error;
7320    }
7321    outsize = insize * Py_ARRAY_LENGTH(buffer);
7322
7323    if (*outbytes == NULL) {
7324        /* Create string object */
7325        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7326        if (*outbytes == NULL)
7327            goto error;
7328        out = PyBytes_AS_STRING(*outbytes);
7329    }
7330    else {
7331        /* Extend string object */
7332        Py_ssize_t n = PyBytes_Size(*outbytes);
7333        if (n > PY_SSIZE_T_MAX - outsize) {
7334            PyErr_NoMemory();
7335            goto error;
7336        }
7337        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7338            goto error;
7339        out = PyBytes_AS_STRING(*outbytes) + n;
7340    }
7341
7342    /* Encode the string character per character */
7343    while (pos < endin)
7344    {
7345        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7346        wchar_t chars[2];
7347        int charsize;
7348        if (ch < 0x10000) {
7349            chars[0] = (wchar_t)ch;
7350            charsize = 1;
7351        }
7352        else {
7353            ch -= 0x10000;
7354            chars[0] = 0xd800 + (ch >> 10);
7355            chars[1] = 0xdc00 + (ch & 0x3ff);
7356            charsize = 2;
7357        }
7358
7359        outsize = WideCharToMultiByte(code_page, flags,
7360                                      chars, charsize,
7361                                      buffer, Py_ARRAY_LENGTH(buffer),
7362                                      NULL, pusedDefaultChar);
7363        if (outsize > 0) {
7364            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7365            {
7366                pos++;
7367                memcpy(out, buffer, outsize);
7368                out += outsize;
7369                continue;
7370            }
7371        }
7372        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7373            PyErr_SetFromWindowsErr(0);
7374            goto error;
7375        }
7376
7377        rep = unicode_encode_call_errorhandler(
7378                  errors, &errorHandler, encoding, reason,
7379                  unicode, &exc,
7380                  pos, pos + 1, &newpos);
7381        if (rep == NULL)
7382            goto error;
7383        pos = newpos;
7384
7385        if (PyBytes_Check(rep)) {
7386            outsize = PyBytes_GET_SIZE(rep);
7387            if (outsize != 1) {
7388                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7389                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7390                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7391                    Py_DECREF(rep);
7392                    goto error;
7393                }
7394                out = PyBytes_AS_STRING(*outbytes) + offset;
7395            }
7396            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7397            out += outsize;
7398        }
7399        else {
7400            Py_ssize_t i;
7401            enum PyUnicode_Kind kind;
7402            void *data;
7403
7404            if (PyUnicode_READY(rep) == -1) {
7405                Py_DECREF(rep);
7406                goto error;
7407            }
7408
7409            outsize = PyUnicode_GET_LENGTH(rep);
7410            if (outsize != 1) {
7411                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7412                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7413                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7414                    Py_DECREF(rep);
7415                    goto error;
7416                }
7417                out = PyBytes_AS_STRING(*outbytes) + offset;
7418            }
7419            kind = PyUnicode_KIND(rep);
7420            data = PyUnicode_DATA(rep);
7421            for (i=0; i < outsize; i++) {
7422                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7423                if (ch > 127) {
7424                    raise_encode_exception(&exc,
7425                        encoding, unicode,
7426                        pos, pos + 1,
7427                        "unable to encode error handler result to ASCII");
7428                    Py_DECREF(rep);
7429                    goto error;
7430                }
7431                *out = (unsigned char)ch;
7432                out++;
7433            }
7434        }
7435        Py_DECREF(rep);
7436    }
7437    /* write a NUL byte */
7438    *out = 0;
7439    outsize = out - PyBytes_AS_STRING(*outbytes);
7440    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7441    if (_PyBytes_Resize(outbytes, outsize) < 0)
7442        goto error;
7443    ret = 0;
7444
7445error:
7446    Py_XDECREF(encoding_obj);
7447    Py_XDECREF(errorHandler);
7448    Py_XDECREF(exc);
7449    return ret;
7450}
7451
7452static PyObject *
7453encode_code_page(int code_page,
7454                 PyObject *unicode,
7455                 const char *errors)
7456{
7457    Py_ssize_t len;
7458    PyObject *outbytes = NULL;
7459    Py_ssize_t offset;
7460    int chunk_len, ret, done;
7461
7462    if (PyUnicode_READY(unicode) == -1)
7463        return NULL;
7464    len = PyUnicode_GET_LENGTH(unicode);
7465
7466    if (code_page < 0) {
7467        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7468        return NULL;
7469    }
7470
7471    if (len == 0)
7472        return PyBytes_FromStringAndSize(NULL, 0);
7473
7474    offset = 0;
7475    do
7476    {
7477#ifdef NEED_RETRY
7478        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7479           chunks. */
7480        if (len > INT_MAX/2) {
7481            chunk_len = INT_MAX/2;
7482            done = 0;
7483        }
7484        else
7485#endif
7486        {
7487            chunk_len = (int)len;
7488            done = 1;
7489        }
7490
7491        ret = encode_code_page_strict(code_page, &outbytes,
7492                                      unicode, offset, chunk_len,
7493                                      errors);
7494        if (ret == -2)
7495            ret = encode_code_page_errors(code_page, &outbytes,
7496                                          unicode, offset,
7497                                          chunk_len, errors);
7498        if (ret < 0) {
7499            Py_XDECREF(outbytes);
7500            return NULL;
7501        }
7502
7503        offset += chunk_len;
7504        len -= chunk_len;
7505    } while (!done);
7506
7507    return outbytes;
7508}
7509
7510PyObject *
7511PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7512                     Py_ssize_t size,
7513                     const char *errors)
7514{
7515    PyObject *unicode, *res;
7516    unicode = PyUnicode_FromUnicode(p, size);
7517    if (unicode == NULL)
7518        return NULL;
7519    res = encode_code_page(CP_ACP, unicode, errors);
7520    Py_DECREF(unicode);
7521    return res;
7522}
7523
7524PyObject *
7525PyUnicode_EncodeCodePage(int code_page,
7526                         PyObject *unicode,
7527                         const char *errors)
7528{
7529    return encode_code_page(code_page, unicode, errors);
7530}
7531
7532PyObject *
7533PyUnicode_AsMBCSString(PyObject *unicode)
7534{
7535    if (!PyUnicode_Check(unicode)) {
7536        PyErr_BadArgument();
7537        return NULL;
7538    }
7539    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7540}
7541
7542#undef NEED_RETRY
7543
7544#endif /* HAVE_MBCS */
7545
7546/* --- Character Mapping Codec -------------------------------------------- */
7547
7548PyObject *
7549PyUnicode_DecodeCharmap(const char *s,
7550                        Py_ssize_t size,
7551                        PyObject *mapping,
7552                        const char *errors)
7553{
7554    const char *starts = s;
7555    Py_ssize_t startinpos;
7556    Py_ssize_t endinpos;
7557    Py_ssize_t outpos;
7558    const char *e;
7559    PyObject *v;
7560    Py_ssize_t extrachars = 0;
7561    PyObject *errorHandler = NULL;
7562    PyObject *exc = NULL;
7563
7564    /* Default to Latin-1 */
7565    if (mapping == NULL)
7566        return PyUnicode_DecodeLatin1(s, size, errors);
7567
7568    v = PyUnicode_New(size, 127);
7569    if (v == NULL)
7570        goto onError;
7571    if (size == 0)
7572        return v;
7573    outpos = 0;
7574    e = s + size;
7575    if (PyUnicode_CheckExact(mapping)) {
7576        Py_ssize_t maplen;
7577        enum PyUnicode_Kind kind;
7578        void *data;
7579        Py_UCS4 x;
7580
7581        if (PyUnicode_READY(mapping) == -1)
7582            return NULL;
7583
7584        maplen = PyUnicode_GET_LENGTH(mapping);
7585        data = PyUnicode_DATA(mapping);
7586        kind = PyUnicode_KIND(mapping);
7587        while (s < e) {
7588            unsigned char ch = *s;
7589
7590            if (ch < maplen)
7591                x = PyUnicode_READ(kind, data, ch);
7592            else
7593                x = 0xfffe; /* invalid value */
7594
7595            if (x == 0xfffe)
7596            {
7597                /* undefined mapping */
7598                startinpos = s-starts;
7599                endinpos = startinpos+1;
7600                if (unicode_decode_call_errorhandler(
7601                        errors, &errorHandler,
7602                        "charmap", "character maps to <undefined>",
7603                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7604                        &v, &outpos)) {
7605                    goto onError;
7606                }
7607                continue;
7608            }
7609
7610            if (unicode_putchar(&v, &outpos, x) < 0)
7611                goto onError;
7612            ++s;
7613        }
7614    }
7615    else {
7616        while (s < e) {
7617            unsigned char ch = *s;
7618            PyObject *w, *x;
7619
7620            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7621            w = PyLong_FromLong((long)ch);
7622            if (w == NULL)
7623                goto onError;
7624            x = PyObject_GetItem(mapping, w);
7625            Py_DECREF(w);
7626            if (x == NULL) {
7627                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7628                    /* No mapping found means: mapping is undefined. */
7629                    PyErr_Clear();
7630                    x = Py_None;
7631                    Py_INCREF(x);
7632                } else
7633                    goto onError;
7634            }
7635
7636            /* Apply mapping */
7637            if (PyLong_Check(x)) {
7638                long value = PyLong_AS_LONG(x);
7639                if (value < 0 || value > 65535) {
7640                    PyErr_SetString(PyExc_TypeError,
7641                                    "character mapping must be in range(65536)");
7642                    Py_DECREF(x);
7643                    goto onError;
7644                }
7645                if (unicode_putchar(&v, &outpos, value) < 0)
7646                    goto onError;
7647            }
7648            else if (x == Py_None) {
7649                /* undefined mapping */
7650                startinpos = s-starts;
7651                endinpos = startinpos+1;
7652                if (unicode_decode_call_errorhandler(
7653                        errors, &errorHandler,
7654                        "charmap", "character maps to <undefined>",
7655                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7656                        &v, &outpos)) {
7657                    Py_DECREF(x);
7658                    goto onError;
7659                }
7660                Py_DECREF(x);
7661                continue;
7662            }
7663            else if (PyUnicode_Check(x)) {
7664                Py_ssize_t targetsize;
7665
7666                if (PyUnicode_READY(x) == -1)
7667                    goto onError;
7668                targetsize = PyUnicode_GET_LENGTH(x);
7669
7670                if (targetsize == 1) {
7671                    /* 1-1 mapping */
7672                    if (unicode_putchar(&v, &outpos,
7673                                        PyUnicode_READ_CHAR(x, 0)) < 0)
7674                        goto onError;
7675                }
7676                else if (targetsize > 1) {
7677                    /* 1-n mapping */
7678                    if (targetsize > extrachars) {
7679                        /* resize first */
7680                        Py_ssize_t needed = (targetsize - extrachars) + \
7681                            (targetsize << 2);
7682                        extrachars += needed;
7683                        /* XXX overflow detection missing */
7684                        if (unicode_resize(&v,
7685                                           PyUnicode_GET_LENGTH(v) + needed) < 0)
7686                        {
7687                            Py_DECREF(x);
7688                            goto onError;
7689                        }
7690                    }
7691                    if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7692                        goto onError;
7693                    PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7694                    outpos += targetsize;
7695                    extrachars -= targetsize;
7696                }
7697                /* 1-0 mapping: skip the character */
7698            }
7699            else {
7700                /* wrong return value */
7701                PyErr_SetString(PyExc_TypeError,
7702                                "character mapping must return integer, None or str");
7703                Py_DECREF(x);
7704                goto onError;
7705            }
7706            Py_DECREF(x);
7707            ++s;
7708        }
7709    }
7710    if (unicode_resize(&v, outpos) < 0)
7711        goto onError;
7712    Py_XDECREF(errorHandler);
7713    Py_XDECREF(exc);
7714    return unicode_result(v);
7715
7716  onError:
7717    Py_XDECREF(errorHandler);
7718    Py_XDECREF(exc);
7719    Py_XDECREF(v);
7720    return NULL;
7721}
7722
7723/* Charmap encoding: the lookup table */
7724
7725struct encoding_map {
7726    PyObject_HEAD
7727    unsigned char level1[32];
7728    int count2, count3;
7729    unsigned char level23[1];
7730};
7731
7732static PyObject*
7733encoding_map_size(PyObject *obj, PyObject* args)
7734{
7735    struct encoding_map *map = (struct encoding_map*)obj;
7736    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7737                           128*map->count3);
7738}
7739
7740static PyMethodDef encoding_map_methods[] = {
7741    {"size", encoding_map_size, METH_NOARGS,
7742     PyDoc_STR("Return the size (in bytes) of this object") },
7743    { 0 }
7744};
7745
7746static void
7747encoding_map_dealloc(PyObject* o)
7748{
7749    PyObject_FREE(o);
7750}
7751
7752static PyTypeObject EncodingMapType = {
7753    PyVarObject_HEAD_INIT(NULL, 0)
7754    "EncodingMap",          /*tp_name*/
7755    sizeof(struct encoding_map),   /*tp_basicsize*/
7756    0,                      /*tp_itemsize*/
7757    /* methods */
7758    encoding_map_dealloc,   /*tp_dealloc*/
7759    0,                      /*tp_print*/
7760    0,                      /*tp_getattr*/
7761    0,                      /*tp_setattr*/
7762    0,                      /*tp_reserved*/
7763    0,                      /*tp_repr*/
7764    0,                      /*tp_as_number*/
7765    0,                      /*tp_as_sequence*/
7766    0,                      /*tp_as_mapping*/
7767    0,                      /*tp_hash*/
7768    0,                      /*tp_call*/
7769    0,                      /*tp_str*/
7770    0,                      /*tp_getattro*/
7771    0,                      /*tp_setattro*/
7772    0,                      /*tp_as_buffer*/
7773    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7774    0,                      /*tp_doc*/
7775    0,                      /*tp_traverse*/
7776    0,                      /*tp_clear*/
7777    0,                      /*tp_richcompare*/
7778    0,                      /*tp_weaklistoffset*/
7779    0,                      /*tp_iter*/
7780    0,                      /*tp_iternext*/
7781    encoding_map_methods,   /*tp_methods*/
7782    0,                      /*tp_members*/
7783    0,                      /*tp_getset*/
7784    0,                      /*tp_base*/
7785    0,                      /*tp_dict*/
7786    0,                      /*tp_descr_get*/
7787    0,                      /*tp_descr_set*/
7788    0,                      /*tp_dictoffset*/
7789    0,                      /*tp_init*/
7790    0,                      /*tp_alloc*/
7791    0,                      /*tp_new*/
7792    0,                      /*tp_free*/
7793    0,                      /*tp_is_gc*/
7794};
7795
7796PyObject*
7797PyUnicode_BuildEncodingMap(PyObject* string)
7798{
7799    PyObject *result;
7800    struct encoding_map *mresult;
7801    int i;
7802    int need_dict = 0;
7803    unsigned char level1[32];
7804    unsigned char level2[512];
7805    unsigned char *mlevel1, *mlevel2, *mlevel3;
7806    int count2 = 0, count3 = 0;
7807    int kind;
7808    void *data;
7809    Py_UCS4 ch;
7810
7811    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7812        PyErr_BadArgument();
7813        return NULL;
7814    }
7815    kind = PyUnicode_KIND(string);
7816    data = PyUnicode_DATA(string);
7817    memset(level1, 0xFF, sizeof level1);
7818    memset(level2, 0xFF, sizeof level2);
7819
7820    /* If there isn't a one-to-one mapping of NULL to \0,
7821       or if there are non-BMP characters, we need to use
7822       a mapping dictionary. */
7823    if (PyUnicode_READ(kind, data, 0) != 0)
7824        need_dict = 1;
7825    for (i = 1; i < 256; i++) {
7826        int l1, l2;
7827        ch = PyUnicode_READ(kind, data, i);
7828        if (ch == 0 || ch > 0xFFFF) {
7829            need_dict = 1;
7830            break;
7831        }
7832        if (ch == 0xFFFE)
7833            /* unmapped character */
7834            continue;
7835        l1 = ch >> 11;
7836        l2 = ch >> 7;
7837        if (level1[l1] == 0xFF)
7838            level1[l1] = count2++;
7839        if (level2[l2] == 0xFF)
7840            level2[l2] = count3++;
7841    }
7842
7843    if (count2 >= 0xFF || count3 >= 0xFF)
7844        need_dict = 1;
7845
7846    if (need_dict) {
7847        PyObject *result = PyDict_New();
7848        PyObject *key, *value;
7849        if (!result)
7850            return NULL;
7851        for (i = 0; i < 256; i++) {
7852            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7853            value = PyLong_FromLong(i);
7854            if (!key || !value)
7855                goto failed1;
7856            if (PyDict_SetItem(result, key, value) == -1)
7857                goto failed1;
7858            Py_DECREF(key);
7859            Py_DECREF(value);
7860        }
7861        return result;
7862      failed1:
7863        Py_XDECREF(key);
7864        Py_XDECREF(value);
7865        Py_DECREF(result);
7866        return NULL;
7867    }
7868
7869    /* Create a three-level trie */
7870    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7871                             16*count2 + 128*count3 - 1);
7872    if (!result)
7873        return PyErr_NoMemory();
7874    PyObject_Init(result, &EncodingMapType);
7875    mresult = (struct encoding_map*)result;
7876    mresult->count2 = count2;
7877    mresult->count3 = count3;
7878    mlevel1 = mresult->level1;
7879    mlevel2 = mresult->level23;
7880    mlevel3 = mresult->level23 + 16*count2;
7881    memcpy(mlevel1, level1, 32);
7882    memset(mlevel2, 0xFF, 16*count2);
7883    memset(mlevel3, 0, 128*count3);
7884    count3 = 0;
7885    for (i = 1; i < 256; i++) {
7886        int o1, o2, o3, i2, i3;
7887        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7888            /* unmapped character */
7889            continue;
7890        o1 = PyUnicode_READ(kind, data, i)>>11;
7891        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7892        i2 = 16*mlevel1[o1] + o2;
7893        if (mlevel2[i2] == 0xFF)
7894            mlevel2[i2] = count3++;
7895        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7896        i3 = 128*mlevel2[i2] + o3;
7897        mlevel3[i3] = i;
7898    }
7899    return result;
7900}
7901
7902static int
7903encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7904{
7905    struct encoding_map *map = (struct encoding_map*)mapping;
7906    int l1 = c>>11;
7907    int l2 = (c>>7) & 0xF;
7908    int l3 = c & 0x7F;
7909    int i;
7910
7911    if (c > 0xFFFF)
7912        return -1;
7913    if (c == 0)
7914        return 0;
7915    /* level 1*/
7916    i = map->level1[l1];
7917    if (i == 0xFF) {
7918        return -1;
7919    }
7920    /* level 2*/
7921    i = map->level23[16*i+l2];
7922    if (i == 0xFF) {
7923        return -1;
7924    }
7925    /* level 3 */
7926    i = map->level23[16*map->count2 + 128*i + l3];
7927    if (i == 0) {
7928        return -1;
7929    }
7930    return i;
7931}
7932
7933/* Lookup the character ch in the mapping. If the character
7934   can't be found, Py_None is returned (or NULL, if another
7935   error occurred). */
7936static PyObject *
7937charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7938{
7939    PyObject *w = PyLong_FromLong((long)c);
7940    PyObject *x;
7941
7942    if (w == NULL)
7943        return NULL;
7944    x = PyObject_GetItem(mapping, w);
7945    Py_DECREF(w);
7946    if (x == NULL) {
7947        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7948            /* No mapping found means: mapping is undefined. */
7949            PyErr_Clear();
7950            x = Py_None;
7951            Py_INCREF(x);
7952            return x;
7953        } else
7954            return NULL;
7955    }
7956    else if (x == Py_None)
7957        return x;
7958    else if (PyLong_Check(x)) {
7959        long value = PyLong_AS_LONG(x);
7960        if (value < 0 || value > 255) {
7961            PyErr_SetString(PyExc_TypeError,
7962                            "character mapping must be in range(256)");
7963            Py_DECREF(x);
7964            return NULL;
7965        }
7966        return x;
7967    }
7968    else if (PyBytes_Check(x))
7969        return x;
7970    else {
7971        /* wrong return value */
7972        PyErr_Format(PyExc_TypeError,
7973                     "character mapping must return integer, bytes or None, not %.400s",
7974                     x->ob_type->tp_name);
7975        Py_DECREF(x);
7976        return NULL;
7977    }
7978}
7979
7980static int
7981charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7982{
7983    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7984    /* exponentially overallocate to minimize reallocations */
7985    if (requiredsize < 2*outsize)
7986        requiredsize = 2*outsize;
7987    if (_PyBytes_Resize(outobj, requiredsize))
7988        return -1;
7989    return 0;
7990}
7991
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
7995/* lookup the character, put the result in the output string and adjust
7996   various state variables. Resize the output bytes object if not enough
7997   space is available. Return a new reference to the object that
7998   was put in the output buffer, or Py_None, if the mapping was undefined
7999   (in which case no character was written) or NULL, if a
8000   reallocation error occurred. The caller must decref the result */
8001static charmapencode_result
8002charmapencode_output(Py_UCS4 c, PyObject *mapping,
8003                     PyObject **outobj, Py_ssize_t *outpos)
8004{
8005    PyObject *rep;
8006    char *outstart;
8007    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8008
8009    if (Py_TYPE(mapping) == &EncodingMapType) {
8010        int res = encoding_map_lookup(c, mapping);
8011        Py_ssize_t requiredsize = *outpos+1;
8012        if (res == -1)
8013            return enc_FAILED;
8014        if (outsize<requiredsize)
8015            if (charmapencode_resize(outobj, outpos, requiredsize))
8016                return enc_EXCEPTION;
8017        outstart = PyBytes_AS_STRING(*outobj);
8018        outstart[(*outpos)++] = (char)res;
8019        return enc_SUCCESS;
8020    }
8021
8022    rep = charmapencode_lookup(c, mapping);
8023    if (rep==NULL)
8024        return enc_EXCEPTION;
8025    else if (rep==Py_None) {
8026        Py_DECREF(rep);
8027        return enc_FAILED;
8028    } else {
8029        if (PyLong_Check(rep)) {
8030            Py_ssize_t requiredsize = *outpos+1;
8031            if (outsize<requiredsize)
8032                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8033                    Py_DECREF(rep);
8034                    return enc_EXCEPTION;
8035                }
8036            outstart = PyBytes_AS_STRING(*outobj);
8037            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8038        }
8039        else {
8040            const char *repchars = PyBytes_AS_STRING(rep);
8041            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8042            Py_ssize_t requiredsize = *outpos+repsize;
8043            if (outsize<requiredsize)
8044                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8045                    Py_DECREF(rep);
8046                    return enc_EXCEPTION;
8047                }
8048            outstart = PyBytes_AS_STRING(*outobj);
8049            memcpy(outstart + *outpos, repchars, repsize);
8050            *outpos += repsize;
8051        }
8052    }
8053    Py_DECREF(rep);
8054    return enc_SUCCESS;
8055}
8056
8057/* handle an error in PyUnicode_EncodeCharmap
8058   Return 0 on success, -1 on error */
8059static int
8060charmap_encoding_error(
8061    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8062    PyObject **exceptionObject,
8063    int *known_errorHandler, PyObject **errorHandler, const char *errors,
8064    PyObject **res, Py_ssize_t *respos)
8065{
8066    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8067    Py_ssize_t size, repsize;
8068    Py_ssize_t newpos;
8069    enum PyUnicode_Kind kind;
8070    void *data;
8071    Py_ssize_t index;
8072    /* startpos for collecting unencodable chars */
8073    Py_ssize_t collstartpos = *inpos;
8074    Py_ssize_t collendpos = *inpos+1;
8075    Py_ssize_t collpos;
8076    char *encoding = "charmap";
8077    char *reason = "character maps to <undefined>";
8078    charmapencode_result x;
8079    Py_UCS4 ch;
8080    int val;
8081
8082    if (PyUnicode_READY(unicode) == -1)
8083        return -1;
8084    size = PyUnicode_GET_LENGTH(unicode);
8085    /* find all unencodable characters */
8086    while (collendpos < size) {
8087        PyObject *rep;
8088        if (Py_TYPE(mapping) == &EncodingMapType) {
8089            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8090            val = encoding_map_lookup(ch, mapping);
8091            if (val != -1)
8092                break;
8093            ++collendpos;
8094            continue;
8095        }
8096
8097        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8098        rep = charmapencode_lookup(ch, mapping);
8099        if (rep==NULL)
8100            return -1;
8101        else if (rep!=Py_None) {
8102            Py_DECREF(rep);
8103            break;
8104        }
8105        Py_DECREF(rep);
8106        ++collendpos;
8107    }
8108    /* cache callback name lookup
8109     * (if not done yet, i.e. it's the first error) */
8110    if (*known_errorHandler==-1) {
8111        if ((errors==NULL) || (!strcmp(errors, "strict")))
8112            *known_errorHandler = 1;
8113        else if (!strcmp(errors, "replace"))
8114            *known_errorHandler = 2;
8115        else if (!strcmp(errors, "ignore"))
8116            *known_errorHandler = 3;
8117        else if (!strcmp(errors, "xmlcharrefreplace"))
8118            *known_errorHandler = 4;
8119        else
8120            *known_errorHandler = 0;
8121    }
8122    switch (*known_errorHandler) {
8123    case 1: /* strict */
8124        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8125        return -1;
8126    case 2: /* replace */
8127        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8128            x = charmapencode_output('?', mapping, res, respos);
8129            if (x==enc_EXCEPTION) {
8130                return -1;
8131            }
8132            else if (x==enc_FAILED) {
8133                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8134                return -1;
8135            }
8136        }
8137        /* fall through */
8138    case 3: /* ignore */
8139        *inpos = collendpos;
8140        break;
8141    case 4: /* xmlcharrefreplace */
8142        /* generate replacement (temporarily (mis)uses p) */
8143        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8144            char buffer[2+29+1+1];
8145            char *cp;
8146            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8147            for (cp = buffer; *cp; ++cp) {
8148                x = charmapencode_output(*cp, mapping, res, respos);
8149                if (x==enc_EXCEPTION)
8150                    return -1;
8151                else if (x==enc_FAILED) {
8152                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8153                    return -1;
8154                }
8155            }
8156        }
8157        *inpos = collendpos;
8158        break;
8159    default:
8160        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
8161                                                      encoding, reason, unicode, exceptionObject,
8162                                                      collstartpos, collendpos, &newpos);
8163        if (repunicode == NULL)
8164            return -1;
8165        if (PyBytes_Check(repunicode)) {
8166            /* Directly copy bytes result to output. */
8167            Py_ssize_t outsize = PyBytes_Size(*res);
8168            Py_ssize_t requiredsize;
8169            repsize = PyBytes_Size(repunicode);
8170            requiredsize = *respos + repsize;
8171            if (requiredsize > outsize)
8172                /* Make room for all additional bytes. */
8173                if (charmapencode_resize(res, respos, requiredsize)) {
8174                    Py_DECREF(repunicode);
8175                    return -1;
8176                }
8177            memcpy(PyBytes_AsString(*res) + *respos,
8178                   PyBytes_AsString(repunicode),  repsize);
8179            *respos += repsize;
8180            *inpos = newpos;
8181            Py_DECREF(repunicode);
8182            break;
8183        }
8184        /* generate replacement  */
8185        if (PyUnicode_READY(repunicode) == -1) {
8186            Py_DECREF(repunicode);
8187            return -1;
8188        }
8189        repsize = PyUnicode_GET_LENGTH(repunicode);
8190        data = PyUnicode_DATA(repunicode);
8191        kind = PyUnicode_KIND(repunicode);
8192        for (index = 0; index < repsize; index++) {
8193            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8194            x = charmapencode_output(repch, mapping, res, respos);
8195            if (x==enc_EXCEPTION) {
8196                Py_DECREF(repunicode);
8197                return -1;
8198            }
8199            else if (x==enc_FAILED) {
8200                Py_DECREF(repunicode);
8201                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8202                return -1;
8203            }
8204        }
8205        *inpos = newpos;
8206        Py_DECREF(repunicode);
8207    }
8208    return 0;
8209}
8210
8211PyObject *
8212_PyUnicode_EncodeCharmap(PyObject *unicode,
8213                         PyObject *mapping,
8214                         const char *errors)
8215{
8216    /* output object */
8217    PyObject *res = NULL;
8218    /* current input position */
8219    Py_ssize_t inpos = 0;
8220    Py_ssize_t size;
8221    /* current output position */
8222    Py_ssize_t respos = 0;
8223    PyObject *errorHandler = NULL;
8224    PyObject *exc = NULL;
8225    /* the following variable is used for caching string comparisons
8226     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8227     * 3=ignore, 4=xmlcharrefreplace */
8228    int known_errorHandler = -1;
8229
8230    if (PyUnicode_READY(unicode) == -1)
8231        return NULL;
8232    size = PyUnicode_GET_LENGTH(unicode);
8233
8234    /* Default to Latin-1 */
8235    if (mapping == NULL)
8236        return unicode_encode_ucs1(unicode, errors, 256);
8237
8238    /* allocate enough for a simple encoding without
8239       replacements, if we need more, we'll resize */
8240    res = PyBytes_FromStringAndSize(NULL, size);
8241    if (res == NULL)
8242        goto onError;
8243    if (size == 0)
8244        return res;
8245
8246    while (inpos<size) {
8247        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
8248        /* try to encode it */
8249        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8250        if (x==enc_EXCEPTION) /* error */
8251            goto onError;
8252        if (x==enc_FAILED) { /* unencodable character */
8253            if (charmap_encoding_error(unicode, &inpos, mapping,
8254                                       &exc,
8255                                       &known_errorHandler, &errorHandler, errors,
8256                                       &res, &respos)) {
8257                goto onError;
8258            }
8259        }
8260        else
8261            /* done with this character => adjust input position */
8262            ++inpos;
8263    }
8264
8265    /* Resize if we allocated to much */
8266    if (respos<PyBytes_GET_SIZE(res))
8267        if (_PyBytes_Resize(&res, respos) < 0)
8268            goto onError;
8269
8270    Py_XDECREF(exc);
8271    Py_XDECREF(errorHandler);
8272    return res;
8273
8274  onError:
8275    Py_XDECREF(res);
8276    Py_XDECREF(exc);
8277    Py_XDECREF(errorHandler);
8278    return NULL;
8279}
8280
8281/* Deprecated */
8282PyObject *
8283PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8284                        Py_ssize_t size,
8285                        PyObject *mapping,
8286                        const char *errors)
8287{
8288    PyObject *result;
8289    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8290    if (unicode == NULL)
8291        return NULL;
8292    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8293    Py_DECREF(unicode);
8294    return result;
8295}
8296
8297PyObject *
8298PyUnicode_AsCharmapString(PyObject *unicode,
8299                          PyObject *mapping)
8300{
8301    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8302        PyErr_BadArgument();
8303        return NULL;
8304    }
8305    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8306}
8307
8308/* create or adjust a UnicodeTranslateError */
8309static void
8310make_translate_exception(PyObject **exceptionObject,
8311                         PyObject *unicode,
8312                         Py_ssize_t startpos, Py_ssize_t endpos,
8313                         const char *reason)
8314{
8315    if (*exceptionObject == NULL) {
8316        *exceptionObject = _PyUnicodeTranslateError_Create(
8317            unicode, startpos, endpos, reason);
8318    }
8319    else {
8320        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8321            goto onError;
8322        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8323            goto onError;
8324        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8325            goto onError;
8326        return;
8327      onError:
8328        Py_DECREF(*exceptionObject);
8329        *exceptionObject = NULL;
8330    }
8331}
8332
8333/* raises a UnicodeTranslateError */
8334static void
8335raise_translate_exception(PyObject **exceptionObject,
8336                          PyObject *unicode,
8337                          Py_ssize_t startpos, Py_ssize_t endpos,
8338                          const char *reason)
8339{
8340    make_translate_exception(exceptionObject,
8341                             unicode, startpos, endpos, reason);
8342    if (*exceptionObject != NULL)
8343        PyCodec_StrictErrors(*exceptionObject);
8344}
8345
8346/* error handling callback helper:
8347   build arguments, call the callback and check the arguments,
8348   put the result into newpos and return the replacement string, which
8349   has to be freed by the caller */
8350static PyObject *
8351unicode_translate_call_errorhandler(const char *errors,
8352                                    PyObject **errorHandler,
8353                                    const char *reason,
8354                                    PyObject *unicode, PyObject **exceptionObject,
8355                                    Py_ssize_t startpos, Py_ssize_t endpos,
8356                                    Py_ssize_t *newpos)
8357{
8358    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8359
8360    Py_ssize_t i_newpos;
8361    PyObject *restuple;
8362    PyObject *resunicode;
8363
8364    if (*errorHandler == NULL) {
8365        *errorHandler = PyCodec_LookupError(errors);
8366        if (*errorHandler == NULL)
8367            return NULL;
8368    }
8369
8370    make_translate_exception(exceptionObject,
8371                             unicode, startpos, endpos, reason);
8372    if (*exceptionObject == NULL)
8373        return NULL;
8374
8375    restuple = PyObject_CallFunctionObjArgs(
8376        *errorHandler, *exceptionObject, NULL);
8377    if (restuple == NULL)
8378        return NULL;
8379    if (!PyTuple_Check(restuple)) {
8380        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8381        Py_DECREF(restuple);
8382        return NULL;
8383    }
8384    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8385                          &resunicode, &i_newpos)) {
8386        Py_DECREF(restuple);
8387        return NULL;
8388    }
8389    if (i_newpos<0)
8390        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8391    else
8392        *newpos = i_newpos;
8393    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8394        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8395        Py_DECREF(restuple);
8396        return NULL;
8397    }
8398    Py_INCREF(resunicode);
8399    Py_DECREF(restuple);
8400    return resunicode;
8401}
8402
8403/* Lookup the character ch in the mapping and put the result in result,
8404   which must be decrefed by the caller.
8405   Return 0 on success, -1 on error */
8406static int
8407charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8408{
8409    PyObject *w = PyLong_FromLong((long)c);
8410    PyObject *x;
8411
8412    if (w == NULL)
8413        return -1;
8414    x = PyObject_GetItem(mapping, w);
8415    Py_DECREF(w);
8416    if (x == NULL) {
8417        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8418            /* No mapping found means: use 1:1 mapping. */
8419            PyErr_Clear();
8420            *result = NULL;
8421            return 0;
8422        } else
8423            return -1;
8424    }
8425    else if (x == Py_None) {
8426        *result = x;
8427        return 0;
8428    }
8429    else if (PyLong_Check(x)) {
8430        long value = PyLong_AS_LONG(x);
8431        long max = PyUnicode_GetMax();
8432        if (value < 0 || value > max) {
8433            PyErr_Format(PyExc_TypeError,
8434                         "character mapping must be in range(0x%x)", max+1);
8435            Py_DECREF(x);
8436            return -1;
8437        }
8438        *result = x;
8439        return 0;
8440    }
8441    else if (PyUnicode_Check(x)) {
8442        *result = x;
8443        return 0;
8444    }
8445    else {
8446        /* wrong return value */
8447        PyErr_SetString(PyExc_TypeError,
8448                        "character mapping must return integer, None or str");
8449        Py_DECREF(x);
8450        return -1;
8451    }
8452}
8453/* ensure that *outobj is at least requiredsize characters long,
8454   if not reallocate and adjust various state variables.
8455   Return 0 on success, -1 on error */
8456static int
8457charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8458                               Py_ssize_t requiredsize)
8459{
8460    Py_ssize_t oldsize = *psize;
8461    if (requiredsize > oldsize) {
8462        /* exponentially overallocate to minimize reallocations */
8463        if (requiredsize < 2 * oldsize)
8464            requiredsize = 2 * oldsize;
8465        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8466        if (*outobj == 0)
8467            return -1;
8468        *psize = requiredsize;
8469    }
8470    return 0;
8471}
8472/* lookup the character, put the result in the output string and adjust
8473   various state variables. Return a new reference to the object that
8474   was put in the output buffer in *result, or Py_None, if the mapping was
8475   undefined (in which case no character was written).
8476   The called must decref result.
8477   Return 0 on success, -1 on error. */
8478static int
8479charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8480                        PyObject *mapping, Py_UCS4 **output,
8481                        Py_ssize_t *osize, Py_ssize_t *opos,
8482                        PyObject **res)
8483{
8484    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8485    if (charmaptranslate_lookup(curinp, mapping, res))
8486        return -1;
8487    if (*res==NULL) {
8488        /* not found => default to 1:1 mapping */
8489        (*output)[(*opos)++] = curinp;
8490    }
8491    else if (*res==Py_None)
8492        ;
8493    else if (PyLong_Check(*res)) {
8494        /* no overflow check, because we know that the space is enough */
8495        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8496    }
8497    else if (PyUnicode_Check(*res)) {
8498        Py_ssize_t repsize;
8499        if (PyUnicode_READY(*res) == -1)
8500            return -1;
8501        repsize = PyUnicode_GET_LENGTH(*res);
8502        if (repsize==1) {
8503            /* no overflow check, because we know that the space is enough */
8504            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8505        }
8506        else if (repsize!=0) {
8507            /* more than one character */
8508            Py_ssize_t requiredsize = *opos +
8509                (PyUnicode_GET_LENGTH(input) - ipos) +
8510                repsize - 1;
8511            Py_ssize_t i;
8512            if (charmaptranslate_makespace(output, osize, requiredsize))
8513                return -1;
8514            for(i = 0; i < repsize; i++)
8515                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8516        }
8517    }
8518    else
8519        return -1;
8520    return 0;
8521}
8522
8523PyObject *
8524_PyUnicode_TranslateCharmap(PyObject *input,
8525                            PyObject *mapping,
8526                            const char *errors)
8527{
8528    /* input object */
8529    char *idata;
8530    Py_ssize_t size, i;
8531    int kind;
8532    /* output buffer */
8533    Py_UCS4 *output = NULL;
8534    Py_ssize_t osize;
8535    PyObject *res;
8536    /* current output position */
8537    Py_ssize_t opos;
8538    char *reason = "character maps to <undefined>";
8539    PyObject *errorHandler = NULL;
8540    PyObject *exc = NULL;
8541    /* the following variable is used for caching string comparisons
8542     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8543     * 3=ignore, 4=xmlcharrefreplace */
8544    int known_errorHandler = -1;
8545
8546    if (mapping == NULL) {
8547        PyErr_BadArgument();
8548        return NULL;
8549    }
8550
8551    if (PyUnicode_READY(input) == -1)
8552        return NULL;
8553    idata = (char*)PyUnicode_DATA(input);
8554    kind = PyUnicode_KIND(input);
8555    size = PyUnicode_GET_LENGTH(input);
8556    i = 0;
8557
8558    if (size == 0) {
8559        Py_INCREF(input);
8560        return input;
8561    }
8562
8563    /* allocate enough for a simple 1:1 translation without
8564       replacements, if we need more, we'll resize */
8565    osize = size;
8566    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8567    opos = 0;
8568    if (output == NULL) {
8569        PyErr_NoMemory();
8570        goto onError;
8571    }
8572
8573    while (i<size) {
8574        /* try to encode it */
8575        PyObject *x = NULL;
8576        if (charmaptranslate_output(input, i, mapping,
8577                                    &output, &osize, &opos, &x)) {
8578            Py_XDECREF(x);
8579            goto onError;
8580        }
8581        Py_XDECREF(x);
8582        if (x!=Py_None) /* it worked => adjust input pointer */
8583            ++i;
8584        else { /* untranslatable character */
8585            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8586            Py_ssize_t repsize;
8587            Py_ssize_t newpos;
8588            Py_ssize_t uni2;
8589            /* startpos for collecting untranslatable chars */
8590            Py_ssize_t collstart = i;
8591            Py_ssize_t collend = i+1;
8592            Py_ssize_t coll;
8593
8594            /* find all untranslatable characters */
8595            while (collend < size) {
8596                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8597                    goto onError;
8598                Py_XDECREF(x);
8599                if (x!=Py_None)
8600                    break;
8601                ++collend;
8602            }
8603            /* cache callback name lookup
8604             * (if not done yet, i.e. it's the first error) */
8605            if (known_errorHandler==-1) {
8606                if ((errors==NULL) || (!strcmp(errors, "strict")))
8607                    known_errorHandler = 1;
8608                else if (!strcmp(errors, "replace"))
8609                    known_errorHandler = 2;
8610                else if (!strcmp(errors, "ignore"))
8611                    known_errorHandler = 3;
8612                else if (!strcmp(errors, "xmlcharrefreplace"))
8613                    known_errorHandler = 4;
8614                else
8615                    known_errorHandler = 0;
8616            }
8617            switch (known_errorHandler) {
8618            case 1: /* strict */
8619                raise_translate_exception(&exc, input, collstart,
8620                                          collend, reason);
8621                goto onError;
8622            case 2: /* replace */
8623                /* No need to check for space, this is a 1:1 replacement */
8624                for (coll = collstart; coll<collend; coll++)
8625                    output[opos++] = '?';
8626                /* fall through */
8627            case 3: /* ignore */
8628                i = collend;
8629                break;
8630            case 4: /* xmlcharrefreplace */
8631                /* generate replacement (temporarily (mis)uses i) */
8632                for (i = collstart; i < collend; ++i) {
8633                    char buffer[2+29+1+1];
8634                    char *cp;
8635                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8636                    if (charmaptranslate_makespace(&output, &osize,
8637                                                   opos+strlen(buffer)+(size-collend)))
8638                        goto onError;
8639                    for (cp = buffer; *cp; ++cp)
8640                        output[opos++] = *cp;
8641                }
8642                i = collend;
8643                break;
8644            default:
8645                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8646                                                                 reason, input, &exc,
8647                                                                 collstart, collend, &newpos);
8648                if (repunicode == NULL)
8649                    goto onError;
8650                if (PyUnicode_READY(repunicode) == -1) {
8651                    Py_DECREF(repunicode);
8652                    goto onError;
8653                }
8654                /* generate replacement  */
8655                repsize = PyUnicode_GET_LENGTH(repunicode);
8656                if (charmaptranslate_makespace(&output, &osize,
8657                                               opos+repsize+(size-collend))) {
8658                    Py_DECREF(repunicode);
8659                    goto onError;
8660                }
8661                for (uni2 = 0; repsize-->0; ++uni2)
8662                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8663                i = newpos;
8664                Py_DECREF(repunicode);
8665            }
8666        }
8667    }
8668    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8669    if (!res)
8670        goto onError;
8671    PyMem_Free(output);
8672    Py_XDECREF(exc);
8673    Py_XDECREF(errorHandler);
8674    return res;
8675
8676  onError:
8677    PyMem_Free(output);
8678    Py_XDECREF(exc);
8679    Py_XDECREF(errorHandler);
8680    return NULL;
8681}
8682
8683/* Deprecated. Use PyUnicode_Translate instead. */
8684PyObject *
8685PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8686                           Py_ssize_t size,
8687                           PyObject *mapping,
8688                           const char *errors)
8689{
8690    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8691    if (!unicode)
8692        return NULL;
8693    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8694}
8695
8696PyObject *
8697PyUnicode_Translate(PyObject *str,
8698                    PyObject *mapping,
8699                    const char *errors)
8700{
8701    PyObject *result;
8702
8703    str = PyUnicode_FromObject(str);
8704    if (str == NULL)
8705        goto onError;
8706    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8707    Py_DECREF(str);
8708    return result;
8709
8710  onError:
8711    Py_XDECREF(str);
8712    return NULL;
8713}
8714
8715static Py_UCS4
8716fix_decimal_and_space_to_ascii(PyObject *self)
8717{
8718    /* No need to call PyUnicode_READY(self) because this function is only
8719       called as a callback from fixup() which does it already. */
8720    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8721    const int kind = PyUnicode_KIND(self);
8722    void *data = PyUnicode_DATA(self);
8723    Py_UCS4 maxchar = 127, ch, fixed;
8724    int modified = 0;
8725    Py_ssize_t i;
8726
8727    for (i = 0; i < len; ++i) {
8728        ch = PyUnicode_READ(kind, data, i);
8729        fixed = 0;
8730        if (ch > 127) {
8731            if (Py_UNICODE_ISSPACE(ch))
8732                fixed = ' ';
8733            else {
8734                const int decimal = Py_UNICODE_TODECIMAL(ch);
8735                if (decimal >= 0)
8736                    fixed = '0' + decimal;
8737            }
8738            if (fixed != 0) {
8739                modified = 1;
8740                maxchar = MAX_MAXCHAR(maxchar, fixed);
8741                PyUnicode_WRITE(kind, data, i, fixed);
8742            }
8743            else
8744                maxchar = MAX_MAXCHAR(maxchar, ch);
8745        }
8746    }
8747
8748    return (modified) ? maxchar : 0;
8749}
8750
8751PyObject *
8752_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8753{
8754    if (!PyUnicode_Check(unicode)) {
8755        PyErr_BadInternalCall();
8756        return NULL;
8757    }
8758    if (PyUnicode_READY(unicode) == -1)
8759        return NULL;
8760    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8761        /* If the string is already ASCII, just return the same string */
8762        Py_INCREF(unicode);
8763        return unicode;
8764    }
8765    return fixup(unicode, fix_decimal_and_space_to_ascii);
8766}
8767
8768PyObject *
8769PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8770                                  Py_ssize_t length)
8771{
8772    PyObject *decimal;
8773    Py_ssize_t i;
8774    Py_UCS4 maxchar;
8775    enum PyUnicode_Kind kind;
8776    void *data;
8777
8778    maxchar = 127;
8779    for (i = 0; i < length; i++) {
8780        Py_UNICODE ch = s[i];
8781        if (ch > 127) {
8782            int decimal = Py_UNICODE_TODECIMAL(ch);
8783            if (decimal >= 0)
8784                ch = '0' + decimal;
8785            maxchar = MAX_MAXCHAR(maxchar, ch);
8786        }
8787    }
8788
8789    /* Copy to a new string */
8790    decimal = PyUnicode_New(length, maxchar);
8791    if (decimal == NULL)
8792        return decimal;
8793    kind = PyUnicode_KIND(decimal);
8794    data = PyUnicode_DATA(decimal);
8795    /* Iterate over code points */
8796    for (i = 0; i < length; i++) {
8797        Py_UNICODE ch = s[i];
8798        if (ch > 127) {
8799            int decimal = Py_UNICODE_TODECIMAL(ch);
8800            if (decimal >= 0)
8801                ch = '0' + decimal;
8802        }
8803        PyUnicode_WRITE(kind, data, i, ch);
8804    }
8805    return unicode_result(decimal);
8806}
8807/* --- Decimal Encoder ---------------------------------------------------- */
8808
8809int
8810PyUnicode_EncodeDecimal(Py_UNICODE *s,
8811                        Py_ssize_t length,
8812                        char *output,
8813                        const char *errors)
8814{
8815    PyObject *unicode;
8816    Py_ssize_t i;
8817    enum PyUnicode_Kind kind;
8818    void *data;
8819
8820    if (output == NULL) {
8821        PyErr_BadArgument();
8822        return -1;
8823    }
8824
8825    unicode = PyUnicode_FromUnicode(s, length);
8826    if (unicode == NULL)
8827        return -1;
8828
8829    if (PyUnicode_READY(unicode) == -1) {
8830        Py_DECREF(unicode);
8831        return -1;
8832    }
8833    kind = PyUnicode_KIND(unicode);
8834    data = PyUnicode_DATA(unicode);
8835
8836    for (i=0; i < length; ) {
8837        PyObject *exc;
8838        Py_UCS4 ch;
8839        int decimal;
8840        Py_ssize_t startpos;
8841
8842        ch = PyUnicode_READ(kind, data, i);
8843
8844        if (Py_UNICODE_ISSPACE(ch)) {
8845            *output++ = ' ';
8846            i++;
8847            continue;
8848        }
8849        decimal = Py_UNICODE_TODECIMAL(ch);
8850        if (decimal >= 0) {
8851            *output++ = '0' + decimal;
8852            i++;
8853            continue;
8854        }
8855        if (0 < ch && ch < 256) {
8856            *output++ = (char)ch;
8857            i++;
8858            continue;
8859        }
8860
8861        startpos = i;
8862        exc = NULL;
8863        raise_encode_exception(&exc, "decimal", unicode,
8864                               startpos, startpos+1,
8865                               "invalid decimal Unicode string");
8866        Py_XDECREF(exc);
8867        Py_DECREF(unicode);
8868        return -1;
8869    }
8870    /* 0-terminate the output string */
8871    *output++ = '\0';
8872    Py_DECREF(unicode);
8873    return 0;
8874}
8875
8876/* --- Helpers ------------------------------------------------------------ */
8877
8878static Py_ssize_t
8879any_find_slice(int direction, PyObject* s1, PyObject* s2,
8880               Py_ssize_t start,
8881               Py_ssize_t end)
8882{
8883    int kind1, kind2, kind;
8884    void *buf1, *buf2;
8885    Py_ssize_t len1, len2, result;
8886
8887    kind1 = PyUnicode_KIND(s1);
8888    kind2 = PyUnicode_KIND(s2);
8889    kind = kind1 > kind2 ? kind1 : kind2;
8890    buf1 = PyUnicode_DATA(s1);
8891    buf2 = PyUnicode_DATA(s2);
8892    if (kind1 != kind)
8893        buf1 = _PyUnicode_AsKind(s1, kind);
8894    if (!buf1)
8895        return -2;
8896    if (kind2 != kind)
8897        buf2 = _PyUnicode_AsKind(s2, kind);
8898    if (!buf2) {
8899        if (kind1 != kind) PyMem_Free(buf1);
8900        return -2;
8901    }
8902    len1 = PyUnicode_GET_LENGTH(s1);
8903    len2 = PyUnicode_GET_LENGTH(s2);
8904
8905    if (direction > 0) {
8906        switch (kind) {
8907        case PyUnicode_1BYTE_KIND:
8908            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8909                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8910            else
8911                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8912            break;
8913        case PyUnicode_2BYTE_KIND:
8914            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8915            break;
8916        case PyUnicode_4BYTE_KIND:
8917            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8918            break;
8919        default:
8920            assert(0); result = -2;
8921        }
8922    }
8923    else {
8924        switch (kind) {
8925        case PyUnicode_1BYTE_KIND:
8926            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8927                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8928            else
8929                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8930            break;
8931        case PyUnicode_2BYTE_KIND:
8932            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8933            break;
8934        case PyUnicode_4BYTE_KIND:
8935            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8936            break;
8937        default:
8938            assert(0); result = -2;
8939        }
8940    }
8941
8942    if (kind1 != kind)
8943        PyMem_Free(buf1);
8944    if (kind2 != kind)
8945        PyMem_Free(buf2);
8946
8947    return result;
8948}
8949
8950Py_ssize_t
8951_PyUnicode_InsertThousandsGrouping(
8952    PyObject *unicode, Py_ssize_t index,
8953    Py_ssize_t n_buffer,
8954    void *digits, Py_ssize_t n_digits,
8955    Py_ssize_t min_width,
8956    const char *grouping, PyObject *thousands_sep,
8957    Py_UCS4 *maxchar)
8958{
8959    unsigned int kind, thousands_sep_kind;
8960    char *data, *thousands_sep_data;
8961    Py_ssize_t thousands_sep_len;
8962    Py_ssize_t len;
8963
8964    if (unicode != NULL) {
8965        kind = PyUnicode_KIND(unicode);
8966        data = (char *) PyUnicode_DATA(unicode) + index * kind;
8967    }
8968    else {
8969        kind = PyUnicode_1BYTE_KIND;
8970        data = NULL;
8971    }
8972    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8973    thousands_sep_data = PyUnicode_DATA(thousands_sep);
8974    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8975    if (unicode != NULL && thousands_sep_kind != kind) {
8976        if (thousands_sep_kind < kind) {
8977            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8978            if (!thousands_sep_data)
8979                return -1;
8980        }
8981        else {
8982            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8983            if (!data)
8984                return -1;
8985        }
8986    }
8987
8988    switch (kind) {
8989    case PyUnicode_1BYTE_KIND:
8990        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8991            len = asciilib_InsertThousandsGrouping(
8992                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
8993                min_width, grouping,
8994                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
8995        else
8996            len = ucs1lib_InsertThousandsGrouping(
8997                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8998                min_width, grouping,
8999                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9000        break;
9001    case PyUnicode_2BYTE_KIND:
9002        len = ucs2lib_InsertThousandsGrouping(
9003            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9004            min_width, grouping,
9005            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9006        break;
9007    case PyUnicode_4BYTE_KIND:
9008        len = ucs4lib_InsertThousandsGrouping(
9009            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9010            min_width, grouping,
9011            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9012        break;
9013    default:
9014        assert(0);
9015        return -1;
9016    }
9017    if (unicode != NULL && thousands_sep_kind != kind) {
9018        if (thousands_sep_kind < kind)
9019            PyMem_Free(thousands_sep_data);
9020        else
9021            PyMem_Free(data);
9022    }
9023    if (unicode == NULL) {
9024        *maxchar = 127;
9025        if (len != n_digits) {
9026            *maxchar = MAX_MAXCHAR(*maxchar,
9027                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9028        }
9029    }
9030    return len;
9031}
9032
9033
/* Helper macro to fix up start/end slice values for a sequence of length len.

   end:   a value greater than len becomes len; a negative value has len
          added to it and is then raised to 0 if it is still negative.
   start: a negative value has len added to it and is then raised to 0 if it
          is still negative.  start is not limited to len here.

   The macro expands to bare if statements (it is not wrapped in
   do { } while (0)), assigns to start and end in place and evaluates its
   arguments more than once, so use it only as a standalone statement with
   side-effect-free arguments. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
9048
9049Py_ssize_t
9050PyUnicode_Count(PyObject *str,
9051                PyObject *substr,
9052                Py_ssize_t start,
9053                Py_ssize_t end)
9054{
9055    Py_ssize_t result;
9056    PyObject* str_obj;
9057    PyObject* sub_obj;
9058    int kind1, kind2, kind;
9059    void *buf1 = NULL, *buf2 = NULL;
9060    Py_ssize_t len1, len2;
9061
9062    str_obj = PyUnicode_FromObject(str);
9063    if (!str_obj)
9064        return -1;
9065    sub_obj = PyUnicode_FromObject(substr);
9066    if (!sub_obj) {
9067        Py_DECREF(str_obj);
9068        return -1;
9069    }
9070    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
9071        Py_DECREF(sub_obj);
9072        Py_DECREF(str_obj);
9073        return -1;
9074    }
9075
9076    kind1 = PyUnicode_KIND(str_obj);
9077    kind2 = PyUnicode_KIND(sub_obj);
9078    kind = kind1;
9079    buf1 = PyUnicode_DATA(str_obj);
9080    buf2 = PyUnicode_DATA(sub_obj);
9081    if (kind2 != kind) {
9082        if (kind2 > kind)
9083            return 0;
9084        buf2 = _PyUnicode_AsKind(sub_obj, kind);
9085    }
9086    if (!buf2)
9087        goto onError;
9088    len1 = PyUnicode_GET_LENGTH(str_obj);
9089    len2 = PyUnicode_GET_LENGTH(sub_obj);
9090
9091    ADJUST_INDICES(start, end, len1);
9092    switch (kind) {
9093    case PyUnicode_1BYTE_KIND:
9094        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9095            result = asciilib_count(
9096                ((Py_UCS1*)buf1) + start, end - start,
9097                buf2, len2, PY_SSIZE_T_MAX
9098                );
9099        else
9100            result = ucs1lib_count(
9101                ((Py_UCS1*)buf1) + start, end - start,
9102                buf2, len2, PY_SSIZE_T_MAX
9103                );
9104        break;
9105    case PyUnicode_2BYTE_KIND:
9106        result = ucs2lib_count(
9107            ((Py_UCS2*)buf1) + start, end - start,
9108            buf2, len2, PY_SSIZE_T_MAX
9109            );
9110        break;
9111    case PyUnicode_4BYTE_KIND:
9112        result = ucs4lib_count(
9113            ((Py_UCS4*)buf1) + start, end - start,
9114            buf2, len2, PY_SSIZE_T_MAX
9115            );
9116        break;
9117    default:
9118        assert(0); result = 0;
9119    }
9120
9121    Py_DECREF(sub_obj);
9122    Py_DECREF(str_obj);
9123
9124    if (kind2 != kind)
9125        PyMem_Free(buf2);
9126
9127    return result;
9128  onError:
9129    Py_DECREF(sub_obj);
9130    Py_DECREF(str_obj);
9131    if (kind2 != kind && buf2)
9132        PyMem_Free(buf2);
9133    return -1;
9134}
9135
9136Py_ssize_t
9137PyUnicode_Find(PyObject *str,
9138               PyObject *sub,
9139               Py_ssize_t start,
9140               Py_ssize_t end,
9141               int direction)
9142{
9143    Py_ssize_t result;
9144
9145    str = PyUnicode_FromObject(str);
9146    if (!str)
9147        return -2;
9148    sub = PyUnicode_FromObject(sub);
9149    if (!sub) {
9150        Py_DECREF(str);
9151        return -2;
9152    }
9153    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9154        Py_DECREF(sub);
9155        Py_DECREF(str);
9156        return -2;
9157    }
9158
9159    result = any_find_slice(direction,
9160        str, sub, start, end
9161        );
9162
9163    Py_DECREF(str);
9164    Py_DECREF(sub);
9165
9166    return result;
9167}
9168
9169Py_ssize_t
9170PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9171                   Py_ssize_t start, Py_ssize_t end,
9172                   int direction)
9173{
9174    int kind;
9175    Py_ssize_t result;
9176    if (PyUnicode_READY(str) == -1)
9177        return -2;
9178    if (start < 0 || end < 0) {
9179        PyErr_SetString(PyExc_IndexError, "string index out of range");
9180        return -2;
9181    }
9182    if (end > PyUnicode_GET_LENGTH(str))
9183        end = PyUnicode_GET_LENGTH(str);
9184    kind = PyUnicode_KIND(str);
9185    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9186                      kind, end-start, ch, direction);
9187    if (result == -1)
9188        return -1;
9189    else
9190        return start + result;
9191}
9192
9193static int
9194tailmatch(PyObject *self,
9195          PyObject *substring,
9196          Py_ssize_t start,
9197          Py_ssize_t end,
9198          int direction)
9199{
9200    int kind_self;
9201    int kind_sub;
9202    void *data_self;
9203    void *data_sub;
9204    Py_ssize_t offset;
9205    Py_ssize_t i;
9206    Py_ssize_t end_sub;
9207
9208    if (PyUnicode_READY(self) == -1 ||
9209        PyUnicode_READY(substring) == -1)
9210        return 0;
9211
9212    if (PyUnicode_GET_LENGTH(substring) == 0)
9213        return 1;
9214
9215    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9216    end -= PyUnicode_GET_LENGTH(substring);
9217    if (end < start)
9218        return 0;
9219
9220    kind_self = PyUnicode_KIND(self);
9221    data_self = PyUnicode_DATA(self);
9222    kind_sub = PyUnicode_KIND(substring);
9223    data_sub = PyUnicode_DATA(substring);
9224    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9225
9226    if (direction > 0)
9227        offset = end;
9228    else
9229        offset = start;
9230
9231    if (PyUnicode_READ(kind_self, data_self, offset) ==
9232        PyUnicode_READ(kind_sub, data_sub, 0) &&
9233        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9234        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9235        /* If both are of the same kind, memcmp is sufficient */
9236        if (kind_self == kind_sub) {
9237            return ! memcmp((char *)data_self +
9238                                (offset * PyUnicode_KIND(substring)),
9239                            data_sub,
9240                            PyUnicode_GET_LENGTH(substring) *
9241                                PyUnicode_KIND(substring));
9242        }
9243        /* otherwise we have to compare each character by first accesing it */
9244        else {
9245            /* We do not need to compare 0 and len(substring)-1 because
9246               the if statement above ensured already that they are equal
9247               when we end up here. */
9248            // TODO: honor direction and do a forward or backwards search
9249            for (i = 1; i < end_sub; ++i) {
9250                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9251                    PyUnicode_READ(kind_sub, data_sub, i))
9252                    return 0;
9253            }
9254            return 1;
9255        }
9256    }
9257
9258    return 0;
9259}
9260
9261Py_ssize_t
9262PyUnicode_Tailmatch(PyObject *str,
9263                    PyObject *substr,
9264                    Py_ssize_t start,
9265                    Py_ssize_t end,
9266                    int direction)
9267{
9268    Py_ssize_t result;
9269
9270    str = PyUnicode_FromObject(str);
9271    if (str == NULL)
9272        return -1;
9273    substr = PyUnicode_FromObject(substr);
9274    if (substr == NULL) {
9275        Py_DECREF(str);
9276        return -1;
9277    }
9278
9279    result = tailmatch(str, substr,
9280                       start, end, direction);
9281    Py_DECREF(str);
9282    Py_DECREF(substr);
9283    return result;
9284}
9285
9286/* Apply fixfct filter to the Unicode object self and return a
9287   reference to the modified object */
9288
9289static PyObject *
9290fixup(PyObject *self,
9291      Py_UCS4 (*fixfct)(PyObject *s))
9292{
9293    PyObject *u;
9294    Py_UCS4 maxchar_old, maxchar_new = 0;
9295    PyObject *v;
9296
9297    u = _PyUnicode_Copy(self);
9298    if (u == NULL)
9299        return NULL;
9300    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9301
9302    /* fix functions return the new maximum character in a string,
9303       if the kind of the resulting unicode object does not change,
9304       everything is fine.  Otherwise we need to change the string kind
9305       and re-run the fix function. */
9306    maxchar_new = fixfct(u);
9307
9308    if (maxchar_new == 0) {
9309        /* no changes */;
9310        if (PyUnicode_CheckExact(self)) {
9311            Py_DECREF(u);
9312            Py_INCREF(self);
9313            return self;
9314        }
9315        else
9316            return u;
9317    }
9318
9319    maxchar_new = align_maxchar(maxchar_new);
9320
9321    if (maxchar_new == maxchar_old)
9322        return u;
9323
9324    /* In case the maximum character changed, we need to
9325       convert the string to the new category. */
9326    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9327    if (v == NULL) {
9328        Py_DECREF(u);
9329        return NULL;
9330    }
9331    if (maxchar_new > maxchar_old) {
9332        /* If the maxchar increased so that the kind changed, not all
9333           characters are representable anymore and we need to fix the
9334           string again. This only happens in very few cases. */
9335        copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9336        maxchar_old = fixfct(v);
9337        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9338    }
9339    else {
9340        copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
9341    }
9342    Py_DECREF(u);
9343    assert(_PyUnicode_CheckConsistency(v, 1));
9344    return v;
9345}
9346
9347static PyObject *
9348ascii_upper_or_lower(PyObject *self, int lower)
9349{
9350    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9351    char *resdata, *data = PyUnicode_DATA(self);
9352    PyObject *res;
9353
9354    res = PyUnicode_New(len, 127);
9355    if (res == NULL)
9356        return NULL;
9357    resdata = PyUnicode_DATA(res);
9358    if (lower)
9359        _Py_bytes_lower(resdata, data, len);
9360    else
9361        _Py_bytes_upper(resdata, data, len);
9362    return res;
9363}
9364
9365static Py_UCS4
9366handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9367{
9368    Py_ssize_t j;
9369    int final_sigma;
9370    Py_UCS4 c;
9371    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9372
9373     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9374
9375    where ! is a negation and \p{xxx} is a character with property xxx.
9376    */
9377    for (j = i - 1; j >= 0; j--) {
9378        c = PyUnicode_READ(kind, data, j);
9379        if (!_PyUnicode_IsCaseIgnorable(c))
9380            break;
9381    }
9382    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9383    if (final_sigma) {
9384        for (j = i + 1; j < length; j++) {
9385            c = PyUnicode_READ(kind, data, j);
9386            if (!_PyUnicode_IsCaseIgnorable(c))
9387                break;
9388        }
9389        final_sigma = j == length || !_PyUnicode_IsCased(c);
9390    }
9391    return (final_sigma) ? 0x3C2 : 0x3C3;
9392}
9393
9394static int
9395lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9396           Py_UCS4 c, Py_UCS4 *mapped)
9397{
9398    /* Obscure special case. */
9399    if (c == 0x3A3) {
9400        mapped[0] = handle_capital_sigma(kind, data, length, i);
9401        return 1;
9402    }
9403    return _PyUnicode_ToLowerFull(c, mapped);
9404}
9405
9406static Py_ssize_t
9407do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9408{
9409    Py_ssize_t i, k = 0;
9410    int n_res, j;
9411    Py_UCS4 c, mapped[3];
9412
9413    c = PyUnicode_READ(kind, data, 0);
9414    n_res = _PyUnicode_ToUpperFull(c, mapped);
9415    for (j = 0; j < n_res; j++) {
9416        *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9417        res[k++] = mapped[j];
9418    }
9419    for (i = 1; i < length; i++) {
9420        c = PyUnicode_READ(kind, data, i);
9421        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9422        for (j = 0; j < n_res; j++) {
9423            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9424            res[k++] = mapped[j];
9425        }
9426    }
9427    return k;
9428}
9429
9430static Py_ssize_t
9431do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9432    Py_ssize_t i, k = 0;
9433
9434    for (i = 0; i < length; i++) {
9435        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9436        int n_res, j;
9437        if (Py_UNICODE_ISUPPER(c)) {
9438            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9439        }
9440        else if (Py_UNICODE_ISLOWER(c)) {
9441            n_res = _PyUnicode_ToUpperFull(c, mapped);
9442        }
9443        else {
9444            n_res = 1;
9445            mapped[0] = c;
9446        }
9447        for (j = 0; j < n_res; j++) {
9448            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9449            res[k++] = mapped[j];
9450        }
9451    }
9452    return k;
9453}
9454
9455static Py_ssize_t
9456do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9457                  Py_UCS4 *maxchar, int lower)
9458{
9459    Py_ssize_t i, k = 0;
9460
9461    for (i = 0; i < length; i++) {
9462        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9463        int n_res, j;
9464        if (lower)
9465            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9466        else
9467            n_res = _PyUnicode_ToUpperFull(c, mapped);
9468        for (j = 0; j < n_res; j++) {
9469            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9470            res[k++] = mapped[j];
9471        }
9472    }
9473    return k;
9474}
9475
9476static Py_ssize_t
9477do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9478{
9479    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9480}
9481
9482static Py_ssize_t
9483do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9484{
9485    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9486}
9487
9488static Py_ssize_t
9489do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9490{
9491    Py_ssize_t i, k = 0;
9492
9493    for (i = 0; i < length; i++) {
9494        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9495        Py_UCS4 mapped[3];
9496        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9497        for (j = 0; j < n_res; j++) {
9498            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9499            res[k++] = mapped[j];
9500        }
9501    }
9502    return k;
9503}
9504
9505static Py_ssize_t
9506do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9507{
9508    Py_ssize_t i, k = 0;
9509    int previous_is_cased;
9510
9511    previous_is_cased = 0;
9512    for (i = 0; i < length; i++) {
9513        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9514        Py_UCS4 mapped[3];
9515        int n_res, j;
9516
9517        if (previous_is_cased)
9518            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9519        else
9520            n_res = _PyUnicode_ToTitleFull(c, mapped);
9521
9522        for (j = 0; j < n_res; j++) {
9523            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9524            res[k++] = mapped[j];
9525        }
9526
9527        previous_is_cased = _PyUnicode_IsCased(c);
9528    }
9529    return k;
9530}
9531
9532static PyObject *
9533case_operation(PyObject *self,
9534               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9535{
9536    PyObject *res = NULL;
9537    Py_ssize_t length, newlength = 0;
9538    int kind, outkind;
9539    void *data, *outdata;
9540    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9541
9542    assert(PyUnicode_IS_READY(self));
9543
9544    kind = PyUnicode_KIND(self);
9545    data = PyUnicode_DATA(self);
9546    length = PyUnicode_GET_LENGTH(self);
9547    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9548    if (tmp == NULL)
9549        return PyErr_NoMemory();
9550    newlength = perform(kind, data, length, tmp, &maxchar);
9551    res = PyUnicode_New(newlength, maxchar);
9552    if (res == NULL)
9553        goto leave;
9554    tmpend = tmp + newlength;
9555    outdata = PyUnicode_DATA(res);
9556    outkind = PyUnicode_KIND(res);
9557    switch (outkind) {
9558    case PyUnicode_1BYTE_KIND:
9559        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9560        break;
9561    case PyUnicode_2BYTE_KIND:
9562        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9563        break;
9564    case PyUnicode_4BYTE_KIND:
9565        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9566        break;
9567    default:
9568        assert(0);
9569        break;
9570    }
9571  leave:
9572    PyMem_FREE(tmp);
9573    return res;
9574}
9575
9576PyObject *
9577PyUnicode_Join(PyObject *separator, PyObject *seq)
9578{
9579    PyObject *sep = NULL;
9580    Py_ssize_t seplen;
9581    PyObject *res = NULL; /* the result */
9582    PyObject *fseq;          /* PySequence_Fast(seq) */
9583    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9584    PyObject **items;
9585    PyObject *item;
9586    Py_ssize_t sz, i, res_offset;
9587    Py_UCS4 maxchar;
9588    Py_UCS4 item_maxchar;
9589    int use_memcpy;
9590    unsigned char *res_data = NULL, *sep_data = NULL;
9591    PyObject *last_obj;
9592    unsigned int kind = 0;
9593
9594    fseq = PySequence_Fast(seq, "");
9595    if (fseq == NULL) {
9596        return NULL;
9597    }
9598
9599    /* NOTE: the following code can't call back into Python code,
9600     * so we are sure that fseq won't be mutated.
9601     */
9602
9603    seqlen = PySequence_Fast_GET_SIZE(fseq);
9604    /* If empty sequence, return u"". */
9605    if (seqlen == 0) {
9606        Py_DECREF(fseq);
9607        Py_INCREF(unicode_empty);
9608        res = unicode_empty;
9609        return res;
9610    }
9611
9612    /* If singleton sequence with an exact Unicode, return that. */
9613    last_obj = NULL;
9614    items = PySequence_Fast_ITEMS(fseq);
9615    if (seqlen == 1) {
9616        if (PyUnicode_CheckExact(items[0])) {
9617            res = items[0];
9618            Py_INCREF(res);
9619            Py_DECREF(fseq);
9620            return res;
9621        }
9622        seplen = 0;
9623        maxchar = 0;
9624    }
9625    else {
9626        /* Set up sep and seplen */
9627        if (separator == NULL) {
9628            /* fall back to a blank space separator */
9629            sep = PyUnicode_FromOrdinal(' ');
9630            if (!sep)
9631                goto onError;
9632            seplen = 1;
9633            maxchar = 32;
9634        }
9635        else {
9636            if (!PyUnicode_Check(separator)) {
9637                PyErr_Format(PyExc_TypeError,
9638                             "separator: expected str instance,"
9639                             " %.80s found",
9640                             Py_TYPE(separator)->tp_name);
9641                goto onError;
9642            }
9643            if (PyUnicode_READY(separator))
9644                goto onError;
9645            sep = separator;
9646            seplen = PyUnicode_GET_LENGTH(separator);
9647            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9648            /* inc refcount to keep this code path symmetric with the
9649               above case of a blank separator */
9650            Py_INCREF(sep);
9651        }
9652        last_obj = sep;
9653    }
9654
9655    /* There are at least two things to join, or else we have a subclass
9656     * of str in the sequence.
9657     * Do a pre-pass to figure out the total amount of space we'll
9658     * need (sz), and see whether all argument are strings.
9659     */
9660    sz = 0;
9661#ifdef Py_DEBUG
9662    use_memcpy = 0;
9663#else
9664    use_memcpy = 1;
9665#endif
9666    for (i = 0; i < seqlen; i++) {
9667        const Py_ssize_t old_sz = sz;
9668        item = items[i];
9669        if (!PyUnicode_Check(item)) {
9670            PyErr_Format(PyExc_TypeError,
9671                         "sequence item %zd: expected str instance,"
9672                         " %.80s found",
9673                         i, Py_TYPE(item)->tp_name);
9674            goto onError;
9675        }
9676        if (PyUnicode_READY(item) == -1)
9677            goto onError;
9678        sz += PyUnicode_GET_LENGTH(item);
9679        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9680        maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
9681        if (i != 0)
9682            sz += seplen;
9683        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9684            PyErr_SetString(PyExc_OverflowError,
9685                            "join() result is too long for a Python string");
9686            goto onError;
9687        }
9688        if (use_memcpy && last_obj != NULL) {
9689            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9690                use_memcpy = 0;
9691        }
9692        last_obj = item;
9693    }
9694
9695    res = PyUnicode_New(sz, maxchar);
9696    if (res == NULL)
9697        goto onError;
9698
9699    /* Catenate everything. */
9700#ifdef Py_DEBUG
9701    use_memcpy = 0;
9702#else
9703    if (use_memcpy) {
9704        res_data = PyUnicode_1BYTE_DATA(res);
9705        kind = PyUnicode_KIND(res);
9706        if (seplen != 0)
9707            sep_data = PyUnicode_1BYTE_DATA(sep);
9708    }
9709#endif
9710    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9711        Py_ssize_t itemlen;
9712        item = items[i];
9713        /* Copy item, and maybe the separator. */
9714        if (i && seplen != 0) {
9715            if (use_memcpy) {
9716                Py_MEMCPY(res_data,
9717                          sep_data,
9718                          kind * seplen);
9719                res_data += kind * seplen;
9720            }
9721            else {
9722                copy_characters(res, res_offset, sep, 0, seplen);
9723                res_offset += seplen;
9724            }
9725        }
9726        itemlen = PyUnicode_GET_LENGTH(item);
9727        if (itemlen != 0) {
9728            if (use_memcpy) {
9729                Py_MEMCPY(res_data,
9730                          PyUnicode_DATA(item),
9731                          kind * itemlen);
9732                res_data += kind * itemlen;
9733            }
9734            else {
9735                copy_characters(res, res_offset, item, 0, itemlen);
9736                res_offset += itemlen;
9737            }
9738        }
9739    }
9740    if (use_memcpy)
9741        assert(res_data == PyUnicode_1BYTE_DATA(res)
9742                           + kind * PyUnicode_GET_LENGTH(res));
9743    else
9744        assert(res_offset == PyUnicode_GET_LENGTH(res));
9745
9746    Py_DECREF(fseq);
9747    Py_XDECREF(sep);
9748    assert(_PyUnicode_CheckConsistency(res, 1));
9749    return res;
9750
9751  onError:
9752    Py_DECREF(fseq);
9753    Py_XDECREF(sep);
9754    Py_XDECREF(res);
9755    return NULL;
9756}
9757
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   element width (1, 2 or 4 byte kind); the wchar kind is not
   supported.  For the 1 byte kind the value is truncated to
   unsigned char.  Arguments may be evaluated more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9781
9782Py_ssize_t
9783PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9784               Py_UCS4 fill_char)
9785{
9786    Py_ssize_t maxlen;
9787    enum PyUnicode_Kind kind;
9788    void *data;
9789
9790    if (!PyUnicode_Check(unicode)) {
9791        PyErr_BadInternalCall();
9792        return -1;
9793    }
9794    if (PyUnicode_READY(unicode) == -1)
9795        return -1;
9796    if (unicode_check_modifiable(unicode))
9797        return -1;
9798
9799    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9800        PyErr_SetString(PyExc_ValueError,
9801                         "fill character is bigger than "
9802                         "the string maximum character");
9803        return -1;
9804    }
9805
9806    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9807    length = Py_MIN(maxlen, length);
9808    if (length <= 0)
9809        return 0;
9810
9811    kind = PyUnicode_KIND(unicode);
9812    data = PyUnicode_DATA(unicode);
9813    FILL(kind, data, fill_char, start, length);
9814    return length;
9815}
9816
9817static PyObject *
9818pad(PyObject *self,
9819    Py_ssize_t left,
9820    Py_ssize_t right,
9821    Py_UCS4 fill)
9822{
9823    PyObject *u;
9824    Py_UCS4 maxchar;
9825    int kind;
9826    void *data;
9827
9828    if (left < 0)
9829        left = 0;
9830    if (right < 0)
9831        right = 0;
9832
9833    if (left == 0 && right == 0)
9834        return unicode_result_unchanged(self);
9835
9836    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9837        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9838        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9839        return NULL;
9840    }
9841    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9842    maxchar = MAX_MAXCHAR(maxchar, fill);
9843    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9844    if (!u)
9845        return NULL;
9846
9847    kind = PyUnicode_KIND(u);
9848    data = PyUnicode_DATA(u);
9849    if (left)
9850        FILL(kind, data, fill, 0, left);
9851    if (right)
9852        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9853    copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
9854    assert(_PyUnicode_CheckConsistency(u, 1));
9855    return u;
9856}
9857
9858PyObject *
9859PyUnicode_Splitlines(PyObject *string, int keepends)
9860{
9861    PyObject *list;
9862
9863    string = PyUnicode_FromObject(string);
9864    if (string == NULL)
9865        return NULL;
9866    if (PyUnicode_READY(string) == -1) {
9867        Py_DECREF(string);
9868        return NULL;
9869    }
9870
9871    switch (PyUnicode_KIND(string)) {
9872    case PyUnicode_1BYTE_KIND:
9873        if (PyUnicode_IS_ASCII(string))
9874            list = asciilib_splitlines(
9875                string, PyUnicode_1BYTE_DATA(string),
9876                PyUnicode_GET_LENGTH(string), keepends);
9877        else
9878            list = ucs1lib_splitlines(
9879                string, PyUnicode_1BYTE_DATA(string),
9880                PyUnicode_GET_LENGTH(string), keepends);
9881        break;
9882    case PyUnicode_2BYTE_KIND:
9883        list = ucs2lib_splitlines(
9884            string, PyUnicode_2BYTE_DATA(string),
9885            PyUnicode_GET_LENGTH(string), keepends);
9886        break;
9887    case PyUnicode_4BYTE_KIND:
9888        list = ucs4lib_splitlines(
9889            string, PyUnicode_4BYTE_DATA(string),
9890            PyUnicode_GET_LENGTH(string), keepends);
9891        break;
9892    default:
9893        assert(0);
9894        list = 0;
9895    }
9896    Py_DECREF(string);
9897    return list;
9898}
9899
9900static PyObject *
9901split(PyObject *self,
9902      PyObject *substring,
9903      Py_ssize_t maxcount)
9904{
9905    int kind1, kind2, kind;
9906    void *buf1, *buf2;
9907    Py_ssize_t len1, len2;
9908    PyObject* out;
9909
9910    if (maxcount < 0)
9911        maxcount = PY_SSIZE_T_MAX;
9912
9913    if (PyUnicode_READY(self) == -1)
9914        return NULL;
9915
9916    if (substring == NULL)
9917        switch (PyUnicode_KIND(self)) {
9918        case PyUnicode_1BYTE_KIND:
9919            if (PyUnicode_IS_ASCII(self))
9920                return asciilib_split_whitespace(
9921                    self,  PyUnicode_1BYTE_DATA(self),
9922                    PyUnicode_GET_LENGTH(self), maxcount
9923                    );
9924            else
9925                return ucs1lib_split_whitespace(
9926                    self,  PyUnicode_1BYTE_DATA(self),
9927                    PyUnicode_GET_LENGTH(self), maxcount
9928                    );
9929        case PyUnicode_2BYTE_KIND:
9930            return ucs2lib_split_whitespace(
9931                self,  PyUnicode_2BYTE_DATA(self),
9932                PyUnicode_GET_LENGTH(self), maxcount
9933                );
9934        case PyUnicode_4BYTE_KIND:
9935            return ucs4lib_split_whitespace(
9936                self,  PyUnicode_4BYTE_DATA(self),
9937                PyUnicode_GET_LENGTH(self), maxcount
9938                );
9939        default:
9940            assert(0);
9941            return NULL;
9942        }
9943
9944    if (PyUnicode_READY(substring) == -1)
9945        return NULL;
9946
9947    kind1 = PyUnicode_KIND(self);
9948    kind2 = PyUnicode_KIND(substring);
9949    kind = kind1 > kind2 ? kind1 : kind2;
9950    buf1 = PyUnicode_DATA(self);
9951    buf2 = PyUnicode_DATA(substring);
9952    if (kind1 != kind)
9953        buf1 = _PyUnicode_AsKind(self, kind);
9954    if (!buf1)
9955        return NULL;
9956    if (kind2 != kind)
9957        buf2 = _PyUnicode_AsKind(substring, kind);
9958    if (!buf2) {
9959        if (kind1 != kind) PyMem_Free(buf1);
9960        return NULL;
9961    }
9962    len1 = PyUnicode_GET_LENGTH(self);
9963    len2 = PyUnicode_GET_LENGTH(substring);
9964
9965    switch (kind) {
9966    case PyUnicode_1BYTE_KIND:
9967        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9968            out = asciilib_split(
9969                self,  buf1, len1, buf2, len2, maxcount);
9970        else
9971            out = ucs1lib_split(
9972                self,  buf1, len1, buf2, len2, maxcount);
9973        break;
9974    case PyUnicode_2BYTE_KIND:
9975        out = ucs2lib_split(
9976            self,  buf1, len1, buf2, len2, maxcount);
9977        break;
9978    case PyUnicode_4BYTE_KIND:
9979        out = ucs4lib_split(
9980            self,  buf1, len1, buf2, len2, maxcount);
9981        break;
9982    default:
9983        out = NULL;
9984    }
9985    if (kind1 != kind)
9986        PyMem_Free(buf1);
9987    if (kind2 != kind)
9988        PyMem_Free(buf2);
9989    return out;
9990}
9991
9992static PyObject *
9993rsplit(PyObject *self,
9994       PyObject *substring,
9995       Py_ssize_t maxcount)
9996{
9997    int kind1, kind2, kind;
9998    void *buf1, *buf2;
9999    Py_ssize_t len1, len2;
10000    PyObject* out;
10001
10002    if (maxcount < 0)
10003        maxcount = PY_SSIZE_T_MAX;
10004
10005    if (PyUnicode_READY(self) == -1)
10006        return NULL;
10007
10008    if (substring == NULL)
10009        switch (PyUnicode_KIND(self)) {
10010        case PyUnicode_1BYTE_KIND:
10011            if (PyUnicode_IS_ASCII(self))
10012                return asciilib_rsplit_whitespace(
10013                    self,  PyUnicode_1BYTE_DATA(self),
10014                    PyUnicode_GET_LENGTH(self), maxcount
10015                    );
10016            else
10017                return ucs1lib_rsplit_whitespace(
10018                    self,  PyUnicode_1BYTE_DATA(self),
10019                    PyUnicode_GET_LENGTH(self), maxcount
10020                    );
10021        case PyUnicode_2BYTE_KIND:
10022            return ucs2lib_rsplit_whitespace(
10023                self,  PyUnicode_2BYTE_DATA(self),
10024                PyUnicode_GET_LENGTH(self), maxcount
10025                );
10026        case PyUnicode_4BYTE_KIND:
10027            return ucs4lib_rsplit_whitespace(
10028                self,  PyUnicode_4BYTE_DATA(self),
10029                PyUnicode_GET_LENGTH(self), maxcount
10030                );
10031        default:
10032            assert(0);
10033            return NULL;
10034        }
10035
10036    if (PyUnicode_READY(substring) == -1)
10037        return NULL;
10038
10039    kind1 = PyUnicode_KIND(self);
10040    kind2 = PyUnicode_KIND(substring);
10041    kind = kind1 > kind2 ? kind1 : kind2;
10042    buf1 = PyUnicode_DATA(self);
10043    buf2 = PyUnicode_DATA(substring);
10044    if (kind1 != kind)
10045        buf1 = _PyUnicode_AsKind(self, kind);
10046    if (!buf1)
10047        return NULL;
10048    if (kind2 != kind)
10049        buf2 = _PyUnicode_AsKind(substring, kind);
10050    if (!buf2) {
10051        if (kind1 != kind) PyMem_Free(buf1);
10052        return NULL;
10053    }
10054    len1 = PyUnicode_GET_LENGTH(self);
10055    len2 = PyUnicode_GET_LENGTH(substring);
10056
10057    switch (kind) {
10058    case PyUnicode_1BYTE_KIND:
10059        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10060            out = asciilib_rsplit(
10061                self,  buf1, len1, buf2, len2, maxcount);
10062        else
10063            out = ucs1lib_rsplit(
10064                self,  buf1, len1, buf2, len2, maxcount);
10065        break;
10066    case PyUnicode_2BYTE_KIND:
10067        out = ucs2lib_rsplit(
10068            self,  buf1, len1, buf2, len2, maxcount);
10069        break;
10070    case PyUnicode_4BYTE_KIND:
10071        out = ucs4lib_rsplit(
10072            self,  buf1, len1, buf2, len2, maxcount);
10073        break;
10074    default:
10075        out = NULL;
10076    }
10077    if (kind1 != kind)
10078        PyMem_Free(buf1);
10079    if (kind2 != kind)
10080        PyMem_Free(buf2);
10081    return out;
10082}
10083
10084static Py_ssize_t
10085anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10086            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10087{
10088    switch (kind) {
10089    case PyUnicode_1BYTE_KIND:
10090        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10091            return asciilib_find(buf1, len1, buf2, len2, offset);
10092        else
10093            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10094    case PyUnicode_2BYTE_KIND:
10095        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10096    case PyUnicode_4BYTE_KIND:
10097        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10098    }
10099    assert(0);
10100    return -1;
10101}
10102
10103static Py_ssize_t
10104anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10105             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10106{
10107    switch (kind) {
10108    case PyUnicode_1BYTE_KIND:
10109        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10110            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10111        else
10112            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10113    case PyUnicode_2BYTE_KIND:
10114        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10115    case PyUnicode_4BYTE_KIND:
10116        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10117    }
10118    assert(0);
10119    return 0;
10120}
10121
10122static PyObject *
10123replace(PyObject *self, PyObject *str1,
10124        PyObject *str2, Py_ssize_t maxcount)
10125{
10126    PyObject *u;
10127    char *sbuf = PyUnicode_DATA(self);
10128    char *buf1 = PyUnicode_DATA(str1);
10129    char *buf2 = PyUnicode_DATA(str2);
10130    int srelease = 0, release1 = 0, release2 = 0;
10131    int skind = PyUnicode_KIND(self);
10132    int kind1 = PyUnicode_KIND(str1);
10133    int kind2 = PyUnicode_KIND(str2);
10134    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10135    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10136    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10137    int mayshrink;
10138    Py_UCS4 maxchar, maxchar_str2;
10139
10140    if (maxcount < 0)
10141        maxcount = PY_SSIZE_T_MAX;
10142    else if (maxcount == 0 || slen == 0)
10143        goto nothing;
10144
10145    if (str1 == str2)
10146        goto nothing;
10147    if (skind < kind1)
10148        /* substring too wide to be present */
10149        goto nothing;
10150
10151    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10152    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10153    /* Replacing str1 with str2 may cause a maxchar reduction in the
10154       result string. */
10155    mayshrink = (maxchar_str2 < maxchar);
10156    maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
10157
10158    if (len1 == len2) {
10159        /* same length */
10160        if (len1 == 0)
10161            goto nothing;
10162        if (len1 == 1) {
10163            /* replace characters */
10164            Py_UCS4 u1, u2;
10165            int rkind;
10166            Py_ssize_t index, pos;
10167            char *src;
10168
10169            u1 = PyUnicode_READ_CHAR(str1, 0);
10170            pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10171            if (pos < 0)
10172                goto nothing;
10173            u2 = PyUnicode_READ_CHAR(str2, 0);
10174            u = PyUnicode_New(slen, maxchar);
10175            if (!u)
10176                goto error;
10177            copy_characters(u, 0, self, 0, slen);
10178            rkind = PyUnicode_KIND(u);
10179
10180            PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10181            index = 0;
10182            src = sbuf;
10183            while (--maxcount)
10184            {
10185                pos++;
10186                src += pos * PyUnicode_KIND(self);
10187                slen -= pos;
10188                index += pos;
10189                pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10190                if (pos < 0)
10191                    break;
10192                PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10193            }
10194        }
10195        else {
10196            int rkind = skind;
10197            char *res;
10198            Py_ssize_t i;
10199
10200            if (kind1 < rkind) {
10201                /* widen substring */
10202                buf1 = _PyUnicode_AsKind(str1, rkind);
10203                if (!buf1) goto error;
10204                release1 = 1;
10205            }
10206            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10207            if (i < 0)
10208                goto nothing;
10209            if (rkind > kind2) {
10210                /* widen replacement */
10211                buf2 = _PyUnicode_AsKind(str2, rkind);
10212                if (!buf2) goto error;
10213                release2 = 1;
10214            }
10215            else if (rkind < kind2) {
10216                /* widen self and buf1 */
10217                rkind = kind2;
10218                if (release1) PyMem_Free(buf1);
10219                sbuf = _PyUnicode_AsKind(self, rkind);
10220                if (!sbuf) goto error;
10221                srelease = 1;
10222                buf1 = _PyUnicode_AsKind(str1, rkind);
10223                if (!buf1) goto error;
10224                release1 = 1;
10225            }
10226            u = PyUnicode_New(slen, maxchar);
10227            if (!u)
10228                goto error;
10229            assert(PyUnicode_KIND(u) == rkind);
10230            res = PyUnicode_DATA(u);
10231
10232            memcpy(res, sbuf, rkind * slen);
10233            /* change everything in-place, starting with this one */
10234            memcpy(res + rkind * i,
10235                   buf2,
10236                   rkind * len2);
10237            i += len1;
10238
10239            while ( --maxcount > 0) {
10240                i = anylib_find(rkind, self,
10241                                sbuf+rkind*i, slen-i,
10242                                str1, buf1, len1, i);
10243                if (i == -1)
10244                    break;
10245                memcpy(res + rkind * i,
10246                       buf2,
10247                       rkind * len2);
10248                i += len1;
10249            }
10250        }
10251    }
10252    else {
10253        Py_ssize_t n, i, j, ires;
10254        Py_ssize_t product, new_size;
10255        int rkind = skind;
10256        char *res;
10257
10258        if (kind1 < rkind) {
10259            /* widen substring */
10260            buf1 = _PyUnicode_AsKind(str1, rkind);
10261            if (!buf1) goto error;
10262            release1 = 1;
10263        }
10264        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10265        if (n == 0)
10266            goto nothing;
10267        if (kind2 < rkind) {
10268            /* widen replacement */
10269            buf2 = _PyUnicode_AsKind(str2, rkind);
10270            if (!buf2) goto error;
10271            release2 = 1;
10272        }
10273        else if (kind2 > rkind) {
10274            /* widen self and buf1 */
10275            rkind = kind2;
10276            sbuf = _PyUnicode_AsKind(self, rkind);
10277            if (!sbuf) goto error;
10278            srelease = 1;
10279            if (release1) PyMem_Free(buf1);
10280            buf1 = _PyUnicode_AsKind(str1, rkind);
10281            if (!buf1) goto error;
10282            release1 = 1;
10283        }
10284        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10285           PyUnicode_GET_LENGTH(str1))); */
10286        product = n * (len2-len1);
10287        if ((product / (len2-len1)) != n) {
10288                PyErr_SetString(PyExc_OverflowError,
10289                                "replace string is too long");
10290                goto error;
10291        }
10292        new_size = slen + product;
10293        if (new_size == 0) {
10294            Py_INCREF(unicode_empty);
10295            u = unicode_empty;
10296            goto done;
10297        }
10298        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10299            PyErr_SetString(PyExc_OverflowError,
10300                            "replace string is too long");
10301            goto error;
10302        }
10303        u = PyUnicode_New(new_size, maxchar);
10304        if (!u)
10305            goto error;
10306        assert(PyUnicode_KIND(u) == rkind);
10307        res = PyUnicode_DATA(u);
10308        ires = i = 0;
10309        if (len1 > 0) {
10310            while (n-- > 0) {
10311                /* look for next match */
10312                j = anylib_find(rkind, self,
10313                                sbuf + rkind * i, slen-i,
10314                                str1, buf1, len1, i);
10315                if (j == -1)
10316                    break;
10317                else if (j > i) {
10318                    /* copy unchanged part [i:j] */
10319                    memcpy(res + rkind * ires,
10320                           sbuf + rkind * i,
10321                           rkind * (j-i));
10322                    ires += j - i;
10323                }
10324                /* copy substitution string */
10325                if (len2 > 0) {
10326                    memcpy(res + rkind * ires,
10327                           buf2,
10328                           rkind * len2);
10329                    ires += len2;
10330                }
10331                i = j + len1;
10332            }
10333            if (i < slen)
10334                /* copy tail [i:] */
10335                memcpy(res + rkind * ires,
10336                       sbuf + rkind * i,
10337                       rkind * (slen-i));
10338        }
10339        else {
10340            /* interleave */
10341            while (n > 0) {
10342                memcpy(res + rkind * ires,
10343                       buf2,
10344                       rkind * len2);
10345                ires += len2;
10346                if (--n <= 0)
10347                    break;
10348                memcpy(res + rkind * ires,
10349                       sbuf + rkind * i,
10350                       rkind);
10351                ires++;
10352                i++;
10353            }
10354            memcpy(res + rkind * ires,
10355                   sbuf + rkind * i,
10356                   rkind * (slen-i));
10357        }
10358    }
10359
10360    if (mayshrink) {
10361        unicode_adjust_maxchar(&u);
10362        if (u == NULL)
10363            goto error;
10364    }
10365
10366  done:
10367    if (srelease)
10368        PyMem_FREE(sbuf);
10369    if (release1)
10370        PyMem_FREE(buf1);
10371    if (release2)
10372        PyMem_FREE(buf2);
10373    assert(_PyUnicode_CheckConsistency(u, 1));
10374    return u;
10375
10376  nothing:
10377    /* nothing to replace; return original string (when possible) */
10378    if (srelease)
10379        PyMem_FREE(sbuf);
10380    if (release1)
10381        PyMem_FREE(buf1);
10382    if (release2)
10383        PyMem_FREE(buf2);
10384    return unicode_result_unchanged(self);
10385
10386  error:
10387    if (srelease && sbuf)
10388        PyMem_FREE(sbuf);
10389    if (release1 && buf1)
10390        PyMem_FREE(buf1);
10391    if (release2 && buf2)
10392        PyMem_FREE(buf2);
10393    return NULL;
10394}
10395
10396/* --- Unicode Object Methods --------------------------------------------- */
10397
10398PyDoc_STRVAR(title__doc__,
10399             "S.title() -> str\n\
10400\n\
10401Return a titlecased version of S, i.e. words start with title case\n\
10402characters, all remaining cased characters have lower case.");
10403
10404static PyObject*
10405unicode_title(PyObject *self)
10406{
10407    if (PyUnicode_READY(self) == -1)
10408        return NULL;
10409    return case_operation(self, do_title);
10410}
10411
10412PyDoc_STRVAR(capitalize__doc__,
10413             "S.capitalize() -> str\n\
10414\n\
10415Return a capitalized version of S, i.e. make the first character\n\
10416have upper case and the rest lower case.");
10417
10418static PyObject*
10419unicode_capitalize(PyObject *self)
10420{
10421    if (PyUnicode_READY(self) == -1)
10422        return NULL;
10423    if (PyUnicode_GET_LENGTH(self) == 0)
10424        return unicode_result_unchanged(self);
10425    return case_operation(self, do_capitalize);
10426}
10427
10428PyDoc_STRVAR(casefold__doc__,
10429             "S.casefold() -> str\n\
10430\n\
10431Return a version of S suitable for caseless comparisons.");
10432
10433static PyObject *
10434unicode_casefold(PyObject *self)
10435{
10436    if (PyUnicode_READY(self) == -1)
10437        return NULL;
10438    if (PyUnicode_IS_ASCII(self))
10439        return ascii_upper_or_lower(self, 1);
10440    return case_operation(self, do_casefold);
10441}
10442
10443
10444/* Argument converter.  Coerces to a single unicode character */
10445
10446static int
10447convert_uc(PyObject *obj, void *addr)
10448{
10449    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10450    PyObject *uniobj;
10451
10452    uniobj = PyUnicode_FromObject(obj);
10453    if (uniobj == NULL) {
10454        PyErr_SetString(PyExc_TypeError,
10455                        "The fill character cannot be converted to Unicode");
10456        return 0;
10457    }
10458    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10459        PyErr_SetString(PyExc_TypeError,
10460                        "The fill character must be exactly one character long");
10461        Py_DECREF(uniobj);
10462        return 0;
10463    }
10464    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10465    Py_DECREF(uniobj);
10466    return 1;
10467}
10468
10469PyDoc_STRVAR(center__doc__,
10470             "S.center(width[, fillchar]) -> str\n\
10471\n\
10472Return S centered in a string of length width. Padding is\n\
10473done using the specified fill character (default is a space)");
10474
10475static PyObject *
10476unicode_center(PyObject *self, PyObject *args)
10477{
10478    Py_ssize_t marg, left;
10479    Py_ssize_t width;
10480    Py_UCS4 fillchar = ' ';
10481
10482    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10483        return NULL;
10484
10485    if (PyUnicode_READY(self) == -1)
10486        return NULL;
10487
10488    if (PyUnicode_GET_LENGTH(self) >= width)
10489        return unicode_result_unchanged(self);
10490
10491    marg = width - PyUnicode_GET_LENGTH(self);
10492    left = marg / 2 + (marg & width & 1);
10493
10494    return pad(self, left, marg - left, fillchar);
10495}
10496
10497/* This function assumes that str1 and str2 are readied by the caller. */
10498
10499static int
10500unicode_compare(PyObject *str1, PyObject *str2)
10501{
10502    int kind1, kind2;
10503    void *data1, *data2;
10504    Py_ssize_t len1, len2, i;
10505
10506    kind1 = PyUnicode_KIND(str1);
10507    kind2 = PyUnicode_KIND(str2);
10508    data1 = PyUnicode_DATA(str1);
10509    data2 = PyUnicode_DATA(str2);
10510    len1 = PyUnicode_GET_LENGTH(str1);
10511    len2 = PyUnicode_GET_LENGTH(str2);
10512
10513    for (i = 0; i < len1 && i < len2; ++i) {
10514        Py_UCS4 c1, c2;
10515        c1 = PyUnicode_READ(kind1, data1, i);
10516        c2 = PyUnicode_READ(kind2, data2, i);
10517
10518        if (c1 != c2)
10519            return (c1 < c2) ? -1 : 1;
10520    }
10521
10522    return (len1 < len2) ? -1 : (len1 != len2);
10523}
10524
10525int
10526PyUnicode_Compare(PyObject *left, PyObject *right)
10527{
10528    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10529        if (PyUnicode_READY(left) == -1 ||
10530            PyUnicode_READY(right) == -1)
10531            return -1;
10532        return unicode_compare(left, right);
10533    }
10534    PyErr_Format(PyExc_TypeError,
10535                 "Can't compare %.100s and %.100s",
10536                 left->ob_type->tp_name,
10537                 right->ob_type->tp_name);
10538    return -1;
10539}
10540
10541int
10542PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10543{
10544    Py_ssize_t i;
10545    int kind;
10546    void *data;
10547    Py_UCS4 chr;
10548
10549    assert(_PyUnicode_CHECK(uni));
10550    if (PyUnicode_READY(uni) == -1)
10551        return -1;
10552    kind = PyUnicode_KIND(uni);
10553    data = PyUnicode_DATA(uni);
10554    /* Compare Unicode string and source character set string */
10555    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10556        if (chr != str[i])
10557            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10558    /* This check keeps Python strings that end in '\0' from comparing equal
10559     to C strings identical up to that point. */
10560    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10561        return 1; /* uni is longer */
10562    if (str[i])
10563        return -1; /* str is longer */
10564    return 0;
10565}
10566
10567
10568#define TEST_COND(cond)                         \
10569    ((cond) ? Py_True : Py_False)
10570
10571PyObject *
10572PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10573{
10574    int result;
10575
10576    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10577        PyObject *v;
10578        if (PyUnicode_READY(left) == -1 ||
10579            PyUnicode_READY(right) == -1)
10580            return NULL;
10581        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10582            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
10583            if (op == Py_EQ) {
10584                Py_INCREF(Py_False);
10585                return Py_False;
10586            }
10587            if (op == Py_NE) {
10588                Py_INCREF(Py_True);
10589                return Py_True;
10590            }
10591        }
10592        if (left == right)
10593            result = 0;
10594        else
10595            result = unicode_compare(left, right);
10596
10597        /* Convert the return value to a Boolean */
10598        switch (op) {
10599        case Py_EQ:
10600            v = TEST_COND(result == 0);
10601            break;
10602        case Py_NE:
10603            v = TEST_COND(result != 0);
10604            break;
10605        case Py_LE:
10606            v = TEST_COND(result <= 0);
10607            break;
10608        case Py_GE:
10609            v = TEST_COND(result >= 0);
10610            break;
10611        case Py_LT:
10612            v = TEST_COND(result == -1);
10613            break;
10614        case Py_GT:
10615            v = TEST_COND(result == 1);
10616            break;
10617        default:
10618            PyErr_BadArgument();
10619            return NULL;
10620        }
10621        Py_INCREF(v);
10622        return v;
10623    }
10624
10625    Py_RETURN_NOTIMPLEMENTED;
10626}
10627
10628int
10629PyUnicode_Contains(PyObject *container, PyObject *element)
10630{
10631    PyObject *str, *sub;
10632    int kind1, kind2, kind;
10633    void *buf1, *buf2;
10634    Py_ssize_t len1, len2;
10635    int result;
10636
10637    /* Coerce the two arguments */
10638    sub = PyUnicode_FromObject(element);
10639    if (!sub) {
10640        PyErr_Format(PyExc_TypeError,
10641                     "'in <string>' requires string as left operand, not %s",
10642                     element->ob_type->tp_name);
10643        return -1;
10644    }
10645
10646    str = PyUnicode_FromObject(container);
10647    if (!str) {
10648        Py_DECREF(sub);
10649        return -1;
10650    }
10651    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10652        Py_DECREF(sub);
10653        Py_DECREF(str);
10654    }
10655
10656    kind1 = PyUnicode_KIND(str);
10657    kind2 = PyUnicode_KIND(sub);
10658    kind = kind1;
10659    buf1 = PyUnicode_DATA(str);
10660    buf2 = PyUnicode_DATA(sub);
10661    if (kind2 != kind) {
10662        if (kind2 > kind)
10663            return 0;
10664        buf2 = _PyUnicode_AsKind(sub, kind);
10665    }
10666    if (!buf2) {
10667        Py_DECREF(sub);
10668        Py_DECREF(str);
10669        return -1;
10670    }
10671    len1 = PyUnicode_GET_LENGTH(str);
10672    len2 = PyUnicode_GET_LENGTH(sub);
10673
10674    switch (kind) {
10675    case PyUnicode_1BYTE_KIND:
10676        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10677        break;
10678    case PyUnicode_2BYTE_KIND:
10679        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10680        break;
10681    case PyUnicode_4BYTE_KIND:
10682        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10683        break;
10684    default:
10685        result = -1;
10686        assert(0);
10687    }
10688
10689    Py_DECREF(str);
10690    Py_DECREF(sub);
10691
10692    if (kind2 != kind)
10693        PyMem_Free(buf2);
10694
10695    return result;
10696}
10697
10698/* Concat to string or Unicode object giving a new Unicode object. */
10699
10700PyObject *
10701PyUnicode_Concat(PyObject *left, PyObject *right)
10702{
10703    PyObject *u = NULL, *v = NULL, *w;
10704    Py_UCS4 maxchar, maxchar2;
10705    Py_ssize_t u_len, v_len, new_len;
10706
10707    /* Coerce the two arguments */
10708    u = PyUnicode_FromObject(left);
10709    if (u == NULL)
10710        goto onError;
10711    v = PyUnicode_FromObject(right);
10712    if (v == NULL)
10713        goto onError;
10714
10715    /* Shortcuts */
10716    if (v == unicode_empty) {
10717        Py_DECREF(v);
10718        return u;
10719    }
10720    if (u == unicode_empty) {
10721        Py_DECREF(u);
10722        return v;
10723    }
10724
10725    u_len = PyUnicode_GET_LENGTH(u);
10726    v_len = PyUnicode_GET_LENGTH(v);
10727    if (u_len > PY_SSIZE_T_MAX - v_len) {
10728        PyErr_SetString(PyExc_OverflowError,
10729                        "strings are too large to concat");
10730        goto onError;
10731    }
10732    new_len = u_len + v_len;
10733
10734    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10735    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10736    maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10737
10738    /* Concat the two Unicode strings */
10739    w = PyUnicode_New(new_len, maxchar);
10740    if (w == NULL)
10741        goto onError;
10742    copy_characters(w, 0, u, 0, u_len);
10743    copy_characters(w, u_len, v, 0, v_len);
10744    Py_DECREF(u);
10745    Py_DECREF(v);
10746    assert(_PyUnicode_CheckConsistency(w, 1));
10747    return w;
10748
10749  onError:
10750    Py_XDECREF(u);
10751    Py_XDECREF(v);
10752    return NULL;
10753}
10754
10755void
10756PyUnicode_Append(PyObject **p_left, PyObject *right)
10757{
10758    PyObject *left, *res;
10759    Py_UCS4 maxchar, maxchar2;
10760    Py_ssize_t left_len, right_len, new_len;
10761
10762    if (p_left == NULL) {
10763        if (!PyErr_Occurred())
10764            PyErr_BadInternalCall();
10765        return;
10766    }
10767    left = *p_left;
10768    if (right == NULL || !PyUnicode_Check(left)) {
10769        if (!PyErr_Occurred())
10770            PyErr_BadInternalCall();
10771        goto error;
10772    }
10773
10774    if (PyUnicode_READY(left) == -1)
10775        goto error;
10776    if (PyUnicode_READY(right) == -1)
10777        goto error;
10778
10779    /* Shortcuts */
10780    if (left == unicode_empty) {
10781        Py_DECREF(left);
10782        Py_INCREF(right);
10783        *p_left = right;
10784        return;
10785    }
10786    if (right == unicode_empty)
10787        return;
10788
10789    left_len = PyUnicode_GET_LENGTH(left);
10790    right_len = PyUnicode_GET_LENGTH(right);
10791    if (left_len > PY_SSIZE_T_MAX - right_len) {
10792        PyErr_SetString(PyExc_OverflowError,
10793                        "strings are too large to concat");
10794        goto error;
10795    }
10796    new_len = left_len + right_len;
10797
10798    if (unicode_modifiable(left)
10799        && PyUnicode_CheckExact(right)
10800        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
10801        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10802           to change the structure size, but characters are stored just after
10803           the structure, and so it requires to move all characters which is
10804           not so different than duplicating the string. */
10805        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10806    {
10807        /* append inplace */
10808        if (unicode_resize(p_left, new_len) != 0) {
10809            /* XXX if _PyUnicode_Resize() fails, 'left' has been
10810             * deallocated so it cannot be put back into
10811             * 'variable'.  The MemoryError is raised when there
10812             * is no value in 'variable', which might (very
10813             * remotely) be a cause of incompatibilities.
10814             */
10815            goto error;
10816        }
10817        /* copy 'right' into the newly allocated area of 'left' */
10818        copy_characters(*p_left, left_len, right, 0, right_len);
10819    }
10820    else {
10821        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10822        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10823        maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10824
10825        /* Concat the two Unicode strings */
10826        res = PyUnicode_New(new_len, maxchar);
10827        if (res == NULL)
10828            goto error;
10829        copy_characters(res, 0, left, 0, left_len);
10830        copy_characters(res, left_len, right, 0, right_len);
10831        Py_DECREF(left);
10832        *p_left = res;
10833    }
10834    assert(_PyUnicode_CheckConsistency(*p_left, 1));
10835    return;
10836
10837error:
10838    Py_CLEAR(*p_left);
10839}
10840
10841void
10842PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10843{
10844    PyUnicode_Append(pleft, right);
10845    Py_XDECREF(right);
10846}
10847
10848PyDoc_STRVAR(count__doc__,
10849             "S.count(sub[, start[, end]]) -> int\n\
10850\n\
10851Return the number of non-overlapping occurrences of substring sub in\n\
10852string S[start:end].  Optional arguments start and end are\n\
10853interpreted as in slice notation.");
10854
10855static PyObject *
10856unicode_count(PyObject *self, PyObject *args)
10857{
10858    PyObject *substring;
10859    Py_ssize_t start = 0;
10860    Py_ssize_t end = PY_SSIZE_T_MAX;
10861    PyObject *result;
10862    int kind1, kind2, kind;
10863    void *buf1, *buf2;
10864    Py_ssize_t len1, len2, iresult;
10865
10866    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10867                                            &start, &end))
10868        return NULL;
10869
10870    kind1 = PyUnicode_KIND(self);
10871    kind2 = PyUnicode_KIND(substring);
10872    if (kind2 > kind1)
10873        return PyLong_FromLong(0);
10874    kind = kind1;
10875    buf1 = PyUnicode_DATA(self);
10876    buf2 = PyUnicode_DATA(substring);
10877    if (kind2 != kind)
10878        buf2 = _PyUnicode_AsKind(substring, kind);
10879    if (!buf2) {
10880        Py_DECREF(substring);
10881        return NULL;
10882    }
10883    len1 = PyUnicode_GET_LENGTH(self);
10884    len2 = PyUnicode_GET_LENGTH(substring);
10885
10886    ADJUST_INDICES(start, end, len1);
10887    switch (kind) {
10888    case PyUnicode_1BYTE_KIND:
10889        iresult = ucs1lib_count(
10890            ((Py_UCS1*)buf1) + start, end - start,
10891            buf2, len2, PY_SSIZE_T_MAX
10892            );
10893        break;
10894    case PyUnicode_2BYTE_KIND:
10895        iresult = ucs2lib_count(
10896            ((Py_UCS2*)buf1) + start, end - start,
10897            buf2, len2, PY_SSIZE_T_MAX
10898            );
10899        break;
10900    case PyUnicode_4BYTE_KIND:
10901        iresult = ucs4lib_count(
10902            ((Py_UCS4*)buf1) + start, end - start,
10903            buf2, len2, PY_SSIZE_T_MAX
10904            );
10905        break;
10906    default:
10907        assert(0); iresult = 0;
10908    }
10909
10910    result = PyLong_FromSsize_t(iresult);
10911
10912    if (kind2 != kind)
10913        PyMem_Free(buf2);
10914
10915    Py_DECREF(substring);
10916
10917    return result;
10918}
10919
10920PyDoc_STRVAR(encode__doc__,
10921             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10922\n\
10923Encode S using the codec registered for encoding. Default encoding\n\
10924is 'utf-8'. errors may be given to set a different error\n\
10925handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10926a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10927'xmlcharrefreplace' as well as any other name registered with\n\
10928codecs.register_error that can handle UnicodeEncodeErrors.");
10929
10930static PyObject *
10931unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10932{
10933    static char *kwlist[] = {"encoding", "errors", 0};
10934    char *encoding = NULL;
10935    char *errors = NULL;
10936
10937    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10938                                     kwlist, &encoding, &errors))
10939        return NULL;
10940    return PyUnicode_AsEncodedString(self, encoding, errors);
10941}
10942
10943PyDoc_STRVAR(expandtabs__doc__,
10944             "S.expandtabs([tabsize]) -> str\n\
10945\n\
10946Return a copy of S where all tab characters are expanded using spaces.\n\
10947If tabsize is not given, a tab size of 8 characters is assumed.");
10948
10949static PyObject*
10950unicode_expandtabs(PyObject *self, PyObject *args)
10951{
10952    Py_ssize_t i, j, line_pos, src_len, incr;
10953    Py_UCS4 ch;
10954    PyObject *u;
10955    void *src_data, *dest_data;
10956    int tabsize = 8;
10957    int kind;
10958    int found;
10959
10960    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10961        return NULL;
10962
10963    if (PyUnicode_READY(self) == -1)
10964        return NULL;
10965
10966    /* First pass: determine size of output string */
10967    src_len = PyUnicode_GET_LENGTH(self);
10968    i = j = line_pos = 0;
10969    kind = PyUnicode_KIND(self);
10970    src_data = PyUnicode_DATA(self);
10971    found = 0;
10972    for (; i < src_len; i++) {
10973        ch = PyUnicode_READ(kind, src_data, i);
10974        if (ch == '\t') {
10975            found = 1;
10976            if (tabsize > 0) {
10977                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10978                if (j > PY_SSIZE_T_MAX - incr)
10979                    goto overflow;
10980                line_pos += incr;
10981                j += incr;
10982            }
10983        }
10984        else {
10985            if (j > PY_SSIZE_T_MAX - 1)
10986                goto overflow;
10987            line_pos++;
10988            j++;
10989            if (ch == '\n' || ch == '\r')
10990                line_pos = 0;
10991        }
10992    }
10993    if (!found)
10994        return unicode_result_unchanged(self);
10995
10996    /* Second pass: create output string and fill it */
10997    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10998    if (!u)
10999        return NULL;
11000    dest_data = PyUnicode_DATA(u);
11001
11002    i = j = line_pos = 0;
11003
11004    for (; i < src_len; i++) {
11005        ch = PyUnicode_READ(kind, src_data, i);
11006        if (ch == '\t') {
11007            if (tabsize > 0) {
11008                incr = tabsize - (line_pos % tabsize);
11009                line_pos += incr;
11010                FILL(kind, dest_data, ' ', j, incr);
11011                j += incr;
11012            }
11013        }
11014        else {
11015            line_pos++;
11016            PyUnicode_WRITE(kind, dest_data, j, ch);
11017            j++;
11018            if (ch == '\n' || ch == '\r')
11019                line_pos = 0;
11020        }
11021    }
11022    assert (j == PyUnicode_GET_LENGTH(u));
11023    return unicode_result(u);
11024
11025  overflow:
11026    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11027    return NULL;
11028}
11029
11030PyDoc_STRVAR(find__doc__,
11031             "S.find(sub[, start[, end]]) -> int\n\
11032\n\
11033Return the lowest index in S where substring sub is found,\n\
11034such that sub is contained within S[start:end].  Optional\n\
11035arguments start and end are interpreted as in slice notation.\n\
11036\n\
11037Return -1 on failure.");
11038
11039static PyObject *
11040unicode_find(PyObject *self, PyObject *args)
11041{
11042    PyObject *substring;
11043    Py_ssize_t start;
11044    Py_ssize_t end;
11045    Py_ssize_t result;
11046
11047    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11048                                            &start, &end))
11049        return NULL;
11050
11051    if (PyUnicode_READY(self) == -1)
11052        return NULL;
11053    if (PyUnicode_READY(substring) == -1)
11054        return NULL;
11055
11056    result = any_find_slice(1, self, substring, start, end);
11057
11058    Py_DECREF(substring);
11059
11060    if (result == -2)
11061        return NULL;
11062
11063    return PyLong_FromSsize_t(result);
11064}
11065
11066static PyObject *
11067unicode_getitem(PyObject *self, Py_ssize_t index)
11068{
11069    void *data;
11070    enum PyUnicode_Kind kind;
11071    Py_UCS4 ch;
11072    PyObject *res;
11073
11074    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11075        PyErr_BadArgument();
11076        return NULL;
11077    }
11078    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11079        PyErr_SetString(PyExc_IndexError, "string index out of range");
11080        return NULL;
11081    }
11082    kind = PyUnicode_KIND(self);
11083    data = PyUnicode_DATA(self);
11084    ch = PyUnicode_READ(kind, data, index);
11085    if (ch < 256)
11086        return get_latin1_char(ch);
11087
11088    res = PyUnicode_New(1, ch);
11089    if (res == NULL)
11090        return NULL;
11091    kind = PyUnicode_KIND(res);
11092    data = PyUnicode_DATA(res);
11093    PyUnicode_WRITE(kind, data, 0, ch);
11094    assert(_PyUnicode_CheckConsistency(res, 1));
11095    return res;
11096}
11097
11098/* Believe it or not, this produces the same value for ASCII strings
11099   as bytes_hash(). */
11100static Py_hash_t
11101unicode_hash(PyObject *self)
11102{
11103    Py_ssize_t len;
11104    Py_uhash_t x;
11105
11106#ifdef Py_DEBUG
11107    assert(_Py_HashSecret_Initialized);
11108#endif
11109    if (_PyUnicode_HASH(self) != -1)
11110        return _PyUnicode_HASH(self);
11111    if (PyUnicode_READY(self) == -1)
11112        return -1;
11113    len = PyUnicode_GET_LENGTH(self);
11114    /*
11115      We make the hash of the empty string be 0, rather than using
11116      (prefix ^ suffix), since this slightly obfuscates the hash secret
11117    */
11118    if (len == 0) {
11119        _PyUnicode_HASH(self) = 0;
11120        return 0;
11121    }
11122
11123    /* The hash function as a macro, gets expanded three times below. */
11124#define HASH(P)                                            \
11125    x ^= (Py_uhash_t) *P << 7;                             \
11126    while (--len >= 0)                                     \
11127        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;  \
11128
11129    x = (Py_uhash_t) _Py_HashSecret.prefix;
11130    switch (PyUnicode_KIND(self)) {
11131    case PyUnicode_1BYTE_KIND: {
11132        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11133        HASH(c);
11134        break;
11135    }
11136    case PyUnicode_2BYTE_KIND: {
11137        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11138        HASH(s);
11139        break;
11140    }
11141    default: {
11142        Py_UCS4 *l;
11143        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11144               "Impossible switch case in unicode_hash");
11145        l = PyUnicode_4BYTE_DATA(self);
11146        HASH(l);
11147        break;
11148    }
11149    }
11150    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11151    x ^= (Py_uhash_t) _Py_HashSecret.suffix;
11152
11153    if (x == -1)
11154        x = -2;
11155    _PyUnicode_HASH(self) = x;
11156    return x;
11157}
11158#undef HASH
11159
11160PyDoc_STRVAR(index__doc__,
11161             "S.index(sub[, start[, end]]) -> int\n\
11162\n\
11163Like S.find() but raise ValueError when the substring is not found.");
11164
11165static PyObject *
11166unicode_index(PyObject *self, PyObject *args)
11167{
11168    Py_ssize_t result;
11169    PyObject *substring;
11170    Py_ssize_t start;
11171    Py_ssize_t end;
11172
11173    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11174                                            &start, &end))
11175        return NULL;
11176
11177    if (PyUnicode_READY(self) == -1)
11178        return NULL;
11179    if (PyUnicode_READY(substring) == -1)
11180        return NULL;
11181
11182    result = any_find_slice(1, self, substring, start, end);
11183
11184    Py_DECREF(substring);
11185
11186    if (result == -2)
11187        return NULL;
11188
11189    if (result < 0) {
11190        PyErr_SetString(PyExc_ValueError, "substring not found");
11191        return NULL;
11192    }
11193
11194    return PyLong_FromSsize_t(result);
11195}
11196
11197PyDoc_STRVAR(islower__doc__,
11198             "S.islower() -> bool\n\
11199\n\
11200Return True if all cased characters in S are lowercase and there is\n\
11201at least one cased character in S, False otherwise.");
11202
11203static PyObject*
11204unicode_islower(PyObject *self)
11205{
11206    Py_ssize_t i, length;
11207    int kind;
11208    void *data;
11209    int cased;
11210
11211    if (PyUnicode_READY(self) == -1)
11212        return NULL;
11213    length = PyUnicode_GET_LENGTH(self);
11214    kind = PyUnicode_KIND(self);
11215    data = PyUnicode_DATA(self);
11216
11217    /* Shortcut for single character strings */
11218    if (length == 1)
11219        return PyBool_FromLong(
11220            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11221
11222    /* Special case for empty strings */
11223    if (length == 0)
11224        return PyBool_FromLong(0);
11225
11226    cased = 0;
11227    for (i = 0; i < length; i++) {
11228        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11229
11230        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11231            return PyBool_FromLong(0);
11232        else if (!cased && Py_UNICODE_ISLOWER(ch))
11233            cased = 1;
11234    }
11235    return PyBool_FromLong(cased);
11236}
11237
11238PyDoc_STRVAR(isupper__doc__,
11239             "S.isupper() -> bool\n\
11240\n\
11241Return True if all cased characters in S are uppercase and there is\n\
11242at least one cased character in S, False otherwise.");
11243
11244static PyObject*
11245unicode_isupper(PyObject *self)
11246{
11247    Py_ssize_t i, length;
11248    int kind;
11249    void *data;
11250    int cased;
11251
11252    if (PyUnicode_READY(self) == -1)
11253        return NULL;
11254    length = PyUnicode_GET_LENGTH(self);
11255    kind = PyUnicode_KIND(self);
11256    data = PyUnicode_DATA(self);
11257
11258    /* Shortcut for single character strings */
11259    if (length == 1)
11260        return PyBool_FromLong(
11261            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11262
11263    /* Special case for empty strings */
11264    if (length == 0)
11265        return PyBool_FromLong(0);
11266
11267    cased = 0;
11268    for (i = 0; i < length; i++) {
11269        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11270
11271        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11272            return PyBool_FromLong(0);
11273        else if (!cased && Py_UNICODE_ISUPPER(ch))
11274            cased = 1;
11275    }
11276    return PyBool_FromLong(cased);
11277}
11278
11279PyDoc_STRVAR(istitle__doc__,
11280             "S.istitle() -> bool\n\
11281\n\
11282Return True if S is a titlecased string and there is at least one\n\
11283character in S, i.e. upper- and titlecase characters may only\n\
11284follow uncased characters and lowercase characters only cased ones.\n\
11285Return False otherwise.");
11286
11287static PyObject*
11288unicode_istitle(PyObject *self)
11289{
11290    Py_ssize_t i, length;
11291    int kind;
11292    void *data;
11293    int cased, previous_is_cased;
11294
11295    if (PyUnicode_READY(self) == -1)
11296        return NULL;
11297    length = PyUnicode_GET_LENGTH(self);
11298    kind = PyUnicode_KIND(self);
11299    data = PyUnicode_DATA(self);
11300
11301    /* Shortcut for single character strings */
11302    if (length == 1) {
11303        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11304        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11305                               (Py_UNICODE_ISUPPER(ch) != 0));
11306    }
11307
11308    /* Special case for empty strings */
11309    if (length == 0)
11310        return PyBool_FromLong(0);
11311
11312    cased = 0;
11313    previous_is_cased = 0;
11314    for (i = 0; i < length; i++) {
11315        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11316
11317        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11318            if (previous_is_cased)
11319                return PyBool_FromLong(0);
11320            previous_is_cased = 1;
11321            cased = 1;
11322        }
11323        else if (Py_UNICODE_ISLOWER(ch)) {
11324            if (!previous_is_cased)
11325                return PyBool_FromLong(0);
11326            previous_is_cased = 1;
11327            cased = 1;
11328        }
11329        else
11330            previous_is_cased = 0;
11331    }
11332    return PyBool_FromLong(cased);
11333}
11334
11335PyDoc_STRVAR(isspace__doc__,
11336             "S.isspace() -> bool\n\
11337\n\
11338Return True if all characters in S are whitespace\n\
11339and there is at least one character in S, False otherwise.");
11340
11341static PyObject*
11342unicode_isspace(PyObject *self)
11343{
11344    Py_ssize_t i, length;
11345    int kind;
11346    void *data;
11347
11348    if (PyUnicode_READY(self) == -1)
11349        return NULL;
11350    length = PyUnicode_GET_LENGTH(self);
11351    kind = PyUnicode_KIND(self);
11352    data = PyUnicode_DATA(self);
11353
11354    /* Shortcut for single character strings */
11355    if (length == 1)
11356        return PyBool_FromLong(
11357            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11358
11359    /* Special case for empty strings */
11360    if (length == 0)
11361        return PyBool_FromLong(0);
11362
11363    for (i = 0; i < length; i++) {
11364        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11365        if (!Py_UNICODE_ISSPACE(ch))
11366            return PyBool_FromLong(0);
11367    }
11368    return PyBool_FromLong(1);
11369}
11370
11371PyDoc_STRVAR(isalpha__doc__,
11372             "S.isalpha() -> bool\n\
11373\n\
11374Return True if all characters in S are alphabetic\n\
11375and there is at least one character in S, False otherwise.");
11376
11377static PyObject*
11378unicode_isalpha(PyObject *self)
11379{
11380    Py_ssize_t i, length;
11381    int kind;
11382    void *data;
11383
11384    if (PyUnicode_READY(self) == -1)
11385        return NULL;
11386    length = PyUnicode_GET_LENGTH(self);
11387    kind = PyUnicode_KIND(self);
11388    data = PyUnicode_DATA(self);
11389
11390    /* Shortcut for single character strings */
11391    if (length == 1)
11392        return PyBool_FromLong(
11393            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11394
11395    /* Special case for empty strings */
11396    if (length == 0)
11397        return PyBool_FromLong(0);
11398
11399    for (i = 0; i < length; i++) {
11400        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11401            return PyBool_FromLong(0);
11402    }
11403    return PyBool_FromLong(1);
11404}
11405
11406PyDoc_STRVAR(isalnum__doc__,
11407             "S.isalnum() -> bool\n\
11408\n\
11409Return True if all characters in S are alphanumeric\n\
11410and there is at least one character in S, False otherwise.");
11411
11412static PyObject*
11413unicode_isalnum(PyObject *self)
11414{
11415    int kind;
11416    void *data;
11417    Py_ssize_t len, i;
11418
11419    if (PyUnicode_READY(self) == -1)
11420        return NULL;
11421
11422    kind = PyUnicode_KIND(self);
11423    data = PyUnicode_DATA(self);
11424    len = PyUnicode_GET_LENGTH(self);
11425
11426    /* Shortcut for single character strings */
11427    if (len == 1) {
11428        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11429        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11430    }
11431
11432    /* Special case for empty strings */
11433    if (len == 0)
11434        return PyBool_FromLong(0);
11435
11436    for (i = 0; i < len; i++) {
11437        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11438        if (!Py_UNICODE_ISALNUM(ch))
11439            return PyBool_FromLong(0);
11440    }
11441    return PyBool_FromLong(1);
11442}
11443
11444PyDoc_STRVAR(isdecimal__doc__,
11445             "S.isdecimal() -> bool\n\
11446\n\
11447Return True if there are only decimal characters in S,\n\
11448False otherwise.");
11449
11450static PyObject*
11451unicode_isdecimal(PyObject *self)
11452{
11453    Py_ssize_t i, length;
11454    int kind;
11455    void *data;
11456
11457    if (PyUnicode_READY(self) == -1)
11458        return NULL;
11459    length = PyUnicode_GET_LENGTH(self);
11460    kind = PyUnicode_KIND(self);
11461    data = PyUnicode_DATA(self);
11462
11463    /* Shortcut for single character strings */
11464    if (length == 1)
11465        return PyBool_FromLong(
11466            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11467
11468    /* Special case for empty strings */
11469    if (length == 0)
11470        return PyBool_FromLong(0);
11471
11472    for (i = 0; i < length; i++) {
11473        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11474            return PyBool_FromLong(0);
11475    }
11476    return PyBool_FromLong(1);
11477}
11478
11479PyDoc_STRVAR(isdigit__doc__,
11480             "S.isdigit() -> bool\n\
11481\n\
11482Return True if all characters in S are digits\n\
11483and there is at least one character in S, False otherwise.");
11484
11485static PyObject*
11486unicode_isdigit(PyObject *self)
11487{
11488    Py_ssize_t i, length;
11489    int kind;
11490    void *data;
11491
11492    if (PyUnicode_READY(self) == -1)
11493        return NULL;
11494    length = PyUnicode_GET_LENGTH(self);
11495    kind = PyUnicode_KIND(self);
11496    data = PyUnicode_DATA(self);
11497
11498    /* Shortcut for single character strings */
11499    if (length == 1) {
11500        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11501        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11502    }
11503
11504    /* Special case for empty strings */
11505    if (length == 0)
11506        return PyBool_FromLong(0);
11507
11508    for (i = 0; i < length; i++) {
11509        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11510            return PyBool_FromLong(0);
11511    }
11512    return PyBool_FromLong(1);
11513}
11514
11515PyDoc_STRVAR(isnumeric__doc__,
11516             "S.isnumeric() -> bool\n\
11517\n\
11518Return True if there are only numeric characters in S,\n\
11519False otherwise.");
11520
11521static PyObject*
11522unicode_isnumeric(PyObject *self)
11523{
11524    Py_ssize_t i, length;
11525    int kind;
11526    void *data;
11527
11528    if (PyUnicode_READY(self) == -1)
11529        return NULL;
11530    length = PyUnicode_GET_LENGTH(self);
11531    kind = PyUnicode_KIND(self);
11532    data = PyUnicode_DATA(self);
11533
11534    /* Shortcut for single character strings */
11535    if (length == 1)
11536        return PyBool_FromLong(
11537            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11538
11539    /* Special case for empty strings */
11540    if (length == 0)
11541        return PyBool_FromLong(0);
11542
11543    for (i = 0; i < length; i++) {
11544        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11545            return PyBool_FromLong(0);
11546    }
11547    return PyBool_FromLong(1);
11548}
11549
11550int
11551PyUnicode_IsIdentifier(PyObject *self)
11552{
11553    int kind;
11554    void *data;
11555    Py_ssize_t i;
11556    Py_UCS4 first;
11557
11558    if (PyUnicode_READY(self) == -1) {
11559        Py_FatalError("identifier not ready");
11560        return 0;
11561    }
11562
11563    /* Special case for empty strings */
11564    if (PyUnicode_GET_LENGTH(self) == 0)
11565        return 0;
11566    kind = PyUnicode_KIND(self);
11567    data = PyUnicode_DATA(self);
11568
11569    /* PEP 3131 says that the first character must be in
11570       XID_Start and subsequent characters in XID_Continue,
11571       and for the ASCII range, the 2.x rules apply (i.e
11572       start with letters and underscore, continue with
11573       letters, digits, underscore). However, given the current
11574       definition of XID_Start and XID_Continue, it is sufficient
11575       to check just for these, except that _ must be allowed
11576       as starting an identifier.  */
11577    first = PyUnicode_READ(kind, data, 0);
11578    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11579        return 0;
11580
11581    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11582        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11583            return 0;
11584    return 1;
11585}
11586
11587PyDoc_STRVAR(isidentifier__doc__,
11588             "S.isidentifier() -> bool\n\
11589\n\
11590Return True if S is a valid identifier according\n\
11591to the language definition.");
11592
11593static PyObject*
11594unicode_isidentifier(PyObject *self)
11595{
11596    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11597}
11598
11599PyDoc_STRVAR(isprintable__doc__,
11600             "S.isprintable() -> bool\n\
11601\n\
11602Return True if all characters in S are considered\n\
11603printable in repr() or S is empty, False otherwise.");
11604
11605static PyObject*
11606unicode_isprintable(PyObject *self)
11607{
11608    Py_ssize_t i, length;
11609    int kind;
11610    void *data;
11611
11612    if (PyUnicode_READY(self) == -1)
11613        return NULL;
11614    length = PyUnicode_GET_LENGTH(self);
11615    kind = PyUnicode_KIND(self);
11616    data = PyUnicode_DATA(self);
11617
11618    /* Shortcut for single character strings */
11619    if (length == 1)
11620        return PyBool_FromLong(
11621            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11622
11623    for (i = 0; i < length; i++) {
11624        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11625            Py_RETURN_FALSE;
11626        }
11627    }
11628    Py_RETURN_TRUE;
11629}
11630
11631PyDoc_STRVAR(join__doc__,
11632             "S.join(iterable) -> str\n\
11633\n\
11634Return a string which is the concatenation of the strings in the\n\
11635iterable.  The separator between elements is S.");
11636
11637static PyObject*
11638unicode_join(PyObject *self, PyObject *data)
11639{
11640    return PyUnicode_Join(self, data);
11641}
11642
11643static Py_ssize_t
11644unicode_length(PyObject *self)
11645{
11646    if (PyUnicode_READY(self) == -1)
11647        return -1;
11648    return PyUnicode_GET_LENGTH(self);
11649}
11650
11651PyDoc_STRVAR(ljust__doc__,
11652             "S.ljust(width[, fillchar]) -> str\n\
11653\n\
11654Return S left-justified in a Unicode string of length width. Padding is\n\
11655done using the specified fill character (default is a space).");
11656
11657static PyObject *
11658unicode_ljust(PyObject *self, PyObject *args)
11659{
11660    Py_ssize_t width;
11661    Py_UCS4 fillchar = ' ';
11662
11663    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11664        return NULL;
11665
11666    if (PyUnicode_READY(self) == -1)
11667        return NULL;
11668
11669    if (PyUnicode_GET_LENGTH(self) >= width)
11670        return unicode_result_unchanged(self);
11671
11672    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11673}
11674
11675PyDoc_STRVAR(lower__doc__,
11676             "S.lower() -> str\n\
11677\n\
11678Return a copy of the string S converted to lowercase.");
11679
11680static PyObject*
11681unicode_lower(PyObject *self)
11682{
11683    if (PyUnicode_READY(self) == -1)
11684        return NULL;
11685    if (PyUnicode_IS_ASCII(self))
11686        return ascii_upper_or_lower(self, 1);
11687    return case_operation(self, do_lower);
11688}
11689
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip mode.  The text after
   the ':' is the method name used in error messages. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: skips the 3-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11698
11699/* externally visible for str.strip(unicode) */
11700PyObject *
11701_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11702{
11703    void *data;
11704    int kind;
11705    Py_ssize_t i, j, len;
11706    BLOOM_MASK sepmask;
11707
11708    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11709        return NULL;
11710
11711    kind = PyUnicode_KIND(self);
11712    data = PyUnicode_DATA(self);
11713    len = PyUnicode_GET_LENGTH(self);
11714    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11715                              PyUnicode_DATA(sepobj),
11716                              PyUnicode_GET_LENGTH(sepobj));
11717
11718    i = 0;
11719    if (striptype != RIGHTSTRIP) {
11720        while (i < len &&
11721               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11722            i++;
11723        }
11724    }
11725
11726    j = len;
11727    if (striptype != LEFTSTRIP) {
11728        do {
11729            j--;
11730        } while (j >= i &&
11731                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11732        j++;
11733    }
11734
11735    return PyUnicode_Substring(self, i, j);
11736}
11737
11738PyObject*
11739PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11740{
11741    unsigned char *data;
11742    int kind;
11743    Py_ssize_t length;
11744
11745    if (PyUnicode_READY(self) == -1)
11746        return NULL;
11747
11748    length = PyUnicode_GET_LENGTH(self);
11749    end = Py_MIN(end, length);
11750
11751    if (start == 0 && end == length)
11752        return unicode_result_unchanged(self);
11753
11754    if (start < 0 || end < 0) {
11755        PyErr_SetString(PyExc_IndexError, "string index out of range");
11756        return NULL;
11757    }
11758    if (start >= length || end < start) {
11759        Py_INCREF(unicode_empty);
11760        return unicode_empty;
11761    }
11762
11763    length = end - start;
11764    if (PyUnicode_IS_ASCII(self)) {
11765        data = PyUnicode_1BYTE_DATA(self);
11766        return unicode_fromascii(data + start, length);
11767    }
11768    else {
11769        kind = PyUnicode_KIND(self);
11770        data = PyUnicode_1BYTE_DATA(self);
11771        return PyUnicode_FromKindAndData(kind,
11772                                         data + kind * start,
11773                                         length);
11774    }
11775}
11776
11777static PyObject *
11778do_strip(PyObject *self, int striptype)
11779{
11780    int kind;
11781    void *data;
11782    Py_ssize_t len, i, j;
11783
11784    if (PyUnicode_READY(self) == -1)
11785        return NULL;
11786
11787    kind = PyUnicode_KIND(self);
11788    data = PyUnicode_DATA(self);
11789    len = PyUnicode_GET_LENGTH(self);
11790
11791    i = 0;
11792    if (striptype != RIGHTSTRIP) {
11793        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11794            i++;
11795        }
11796    }
11797
11798    j = len;
11799    if (striptype != LEFTSTRIP) {
11800        do {
11801            j--;
11802        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11803        j++;
11804    }
11805
11806    return PyUnicode_Substring(self, i, j);
11807}
11808
11809
11810static PyObject *
11811do_argstrip(PyObject *self, int striptype, PyObject *args)
11812{
11813    PyObject *sep = NULL;
11814
11815    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11816        return NULL;
11817
11818    if (sep != NULL && sep != Py_None) {
11819        if (PyUnicode_Check(sep))
11820            return _PyUnicode_XStrip(self, striptype, sep);
11821        else {
11822            PyErr_Format(PyExc_TypeError,
11823                         "%s arg must be None or str",
11824                         STRIPNAME(striptype));
11825            return NULL;
11826        }
11827    }
11828
11829    return do_strip(self, striptype);
11830}
11831
11832
11833PyDoc_STRVAR(strip__doc__,
11834             "S.strip([chars]) -> str\n\
11835\n\
11836Return a copy of the string S with leading and trailing\n\
11837whitespace removed.\n\
11838If chars is given and not None, remove characters in chars instead.");
11839
11840static PyObject *
11841unicode_strip(PyObject *self, PyObject *args)
11842{
11843    if (PyTuple_GET_SIZE(args) == 0)
11844        return do_strip(self, BOTHSTRIP); /* Common case */
11845    else
11846        return do_argstrip(self, BOTHSTRIP, args);
11847}
11848
11849
11850PyDoc_STRVAR(lstrip__doc__,
11851             "S.lstrip([chars]) -> str\n\
11852\n\
11853Return a copy of the string S with leading whitespace removed.\n\
11854If chars is given and not None, remove characters in chars instead.");
11855
11856static PyObject *
11857unicode_lstrip(PyObject *self, PyObject *args)
11858{
11859    if (PyTuple_GET_SIZE(args) == 0)
11860        return do_strip(self, LEFTSTRIP); /* Common case */
11861    else
11862        return do_argstrip(self, LEFTSTRIP, args);
11863}
11864
11865
11866PyDoc_STRVAR(rstrip__doc__,
11867             "S.rstrip([chars]) -> str\n\
11868\n\
11869Return a copy of the string S with trailing whitespace removed.\n\
11870If chars is given and not None, remove characters in chars instead.");
11871
11872static PyObject *
11873unicode_rstrip(PyObject *self, PyObject *args)
11874{
11875    if (PyTuple_GET_SIZE(args) == 0)
11876        return do_strip(self, RIGHTSTRIP); /* Common case */
11877    else
11878        return do_argstrip(self, RIGHTSTRIP, args);
11879}
11880
11881
11882static PyObject*
11883unicode_repeat(PyObject *str, Py_ssize_t len)
11884{
11885    PyObject *u;
11886    Py_ssize_t nchars, n;
11887
11888    if (len < 1) {
11889        Py_INCREF(unicode_empty);
11890        return unicode_empty;
11891    }
11892
11893    /* no repeat, return original string */
11894    if (len == 1)
11895        return unicode_result_unchanged(str);
11896
11897    if (PyUnicode_READY(str) == -1)
11898        return NULL;
11899
11900    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11901        PyErr_SetString(PyExc_OverflowError,
11902                        "repeated string is too long");
11903        return NULL;
11904    }
11905    nchars = len * PyUnicode_GET_LENGTH(str);
11906
11907    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11908    if (!u)
11909        return NULL;
11910    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11911
11912    if (PyUnicode_GET_LENGTH(str) == 1) {
11913        const int kind = PyUnicode_KIND(str);
11914        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11915        if (kind == PyUnicode_1BYTE_KIND) {
11916            void *to = PyUnicode_DATA(u);
11917            memset(to, (unsigned char)fill_char, len);
11918        }
11919        else if (kind == PyUnicode_2BYTE_KIND) {
11920            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
11921            for (n = 0; n < len; ++n)
11922                ucs2[n] = fill_char;
11923        } else {
11924            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11925            assert(kind == PyUnicode_4BYTE_KIND);
11926            for (n = 0; n < len; ++n)
11927                ucs4[n] = fill_char;
11928        }
11929    }
11930    else {
11931        /* number of characters copied this far */
11932        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11933        const Py_ssize_t char_size = PyUnicode_KIND(str);
11934        char *to = (char *) PyUnicode_DATA(u);
11935        Py_MEMCPY(to, PyUnicode_DATA(str),
11936                  PyUnicode_GET_LENGTH(str) * char_size);
11937        while (done < nchars) {
11938            n = (done <= nchars-done) ? done : nchars-done;
11939            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11940            done += n;
11941        }
11942    }
11943
11944    assert(_PyUnicode_CheckConsistency(u, 1));
11945    return u;
11946}
11947
11948PyObject *
11949PyUnicode_Replace(PyObject *obj,
11950                  PyObject *subobj,
11951                  PyObject *replobj,
11952                  Py_ssize_t maxcount)
11953{
11954    PyObject *self;
11955    PyObject *str1;
11956    PyObject *str2;
11957    PyObject *result;
11958
11959    self = PyUnicode_FromObject(obj);
11960    if (self == NULL)
11961        return NULL;
11962    str1 = PyUnicode_FromObject(subobj);
11963    if (str1 == NULL) {
11964        Py_DECREF(self);
11965        return NULL;
11966    }
11967    str2 = PyUnicode_FromObject(replobj);
11968    if (str2 == NULL) {
11969        Py_DECREF(self);
11970        Py_DECREF(str1);
11971        return NULL;
11972    }
11973    if (PyUnicode_READY(self) == -1 ||
11974        PyUnicode_READY(str1) == -1 ||
11975        PyUnicode_READY(str2) == -1)
11976        result = NULL;
11977    else
11978        result = replace(self, str1, str2, maxcount);
11979    Py_DECREF(self);
11980    Py_DECREF(str1);
11981    Py_DECREF(str2);
11982    return result;
11983}
11984
11985PyDoc_STRVAR(replace__doc__,
11986             "S.replace(old, new[, count]) -> str\n\
11987\n\
11988Return a copy of S with all occurrences of substring\n\
11989old replaced by new.  If the optional argument count is\n\
11990given, only the first count occurrences are replaced.");
11991
11992static PyObject*
11993unicode_replace(PyObject *self, PyObject *args)
11994{
11995    PyObject *str1;
11996    PyObject *str2;
11997    Py_ssize_t maxcount = -1;
11998    PyObject *result;
11999
12000    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
12001        return NULL;
12002    if (PyUnicode_READY(self) == -1)
12003        return NULL;
12004    str1 = PyUnicode_FromObject(str1);
12005    if (str1 == NULL)
12006        return NULL;
12007    str2 = PyUnicode_FromObject(str2);
12008    if (str2 == NULL) {
12009        Py_DECREF(str1);
12010        return NULL;
12011    }
12012    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12013        result = NULL;
12014    else
12015        result = replace(self, str1, str2, maxcount);
12016
12017    Py_DECREF(str1);
12018    Py_DECREF(str2);
12019    return result;
12020}
12021
12022static PyObject *
12023unicode_repr(PyObject *unicode)
12024{
12025    PyObject *repr;
12026    Py_ssize_t isize;
12027    Py_ssize_t osize, squote, dquote, i, o;
12028    Py_UCS4 max, quote;
12029    int ikind, okind;
12030    void *idata, *odata;
12031
12032    if (PyUnicode_READY(unicode) == -1)
12033        return NULL;
12034
12035    isize = PyUnicode_GET_LENGTH(unicode);
12036    idata = PyUnicode_DATA(unicode);
12037
12038    /* Compute length of output, quote characters, and
12039       maximum character */
12040    osize = 2; /* quotes */
12041    max = 127;
12042    squote = dquote = 0;
12043    ikind = PyUnicode_KIND(unicode);
12044    for (i = 0; i < isize; i++) {
12045        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12046        switch (ch) {
12047        case '\'': squote++; osize++; break;
12048        case '"':  dquote++; osize++; break;
12049        case '\\': case '\t': case '\r': case '\n':
12050            osize += 2; break;
12051        default:
12052            /* Fast-path ASCII */
12053            if (ch < ' ' || ch == 0x7f)
12054                osize += 4; /* \xHH */
12055            else if (ch < 0x7f)
12056                osize++;
12057            else if (Py_UNICODE_ISPRINTABLE(ch)) {
12058                osize++;
12059                max = ch > max ? ch : max;
12060            }
12061            else if (ch < 0x100)
12062                osize += 4; /* \xHH */
12063            else if (ch < 0x10000)
12064                osize += 6; /* \uHHHH */
12065            else
12066                osize += 10; /* \uHHHHHHHH */
12067        }
12068    }
12069
12070    quote = '\'';
12071    if (squote) {
12072        if (dquote)
12073            /* Both squote and dquote present. Use squote,
12074               and escape them */
12075            osize += squote;
12076        else
12077            quote = '"';
12078    }
12079
12080    repr = PyUnicode_New(osize, max);
12081    if (repr == NULL)
12082        return NULL;
12083    okind = PyUnicode_KIND(repr);
12084    odata = PyUnicode_DATA(repr);
12085
12086    PyUnicode_WRITE(okind, odata, 0, quote);
12087    PyUnicode_WRITE(okind, odata, osize-1, quote);
12088
12089    for (i = 0, o = 1; i < isize; i++) {
12090        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12091
12092        /* Escape quotes and backslashes */
12093        if ((ch == quote) || (ch == '\\')) {
12094            PyUnicode_WRITE(okind, odata, o++, '\\');
12095            PyUnicode_WRITE(okind, odata, o++, ch);
12096            continue;
12097        }
12098
12099        /* Map special whitespace to '\t', \n', '\r' */
12100        if (ch == '\t') {
12101            PyUnicode_WRITE(okind, odata, o++, '\\');
12102            PyUnicode_WRITE(okind, odata, o++, 't');
12103        }
12104        else if (ch == '\n') {
12105            PyUnicode_WRITE(okind, odata, o++, '\\');
12106            PyUnicode_WRITE(okind, odata, o++, 'n');
12107        }
12108        else if (ch == '\r') {
12109            PyUnicode_WRITE(okind, odata, o++, '\\');
12110            PyUnicode_WRITE(okind, odata, o++, 'r');
12111        }
12112
12113        /* Map non-printable US ASCII to '\xhh' */
12114        else if (ch < ' ' || ch == 0x7F) {
12115            PyUnicode_WRITE(okind, odata, o++, '\\');
12116            PyUnicode_WRITE(okind, odata, o++, 'x');
12117            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12118            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12119        }
12120
12121        /* Copy ASCII characters as-is */
12122        else if (ch < 0x7F) {
12123            PyUnicode_WRITE(okind, odata, o++, ch);
12124        }
12125
12126        /* Non-ASCII characters */
12127        else {
12128            /* Map Unicode whitespace and control characters
12129               (categories Z* and C* except ASCII space)
12130            */
12131            if (!Py_UNICODE_ISPRINTABLE(ch)) {
12132                /* Map 8-bit characters to '\xhh' */
12133                if (ch <= 0xff) {
12134                    PyUnicode_WRITE(okind, odata, o++, '\\');
12135                    PyUnicode_WRITE(okind, odata, o++, 'x');
12136                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12137                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12138                }
12139                /* Map 21-bit characters to '\U00xxxxxx' */
12140                else if (ch >= 0x10000) {
12141                    PyUnicode_WRITE(okind, odata, o++, '\\');
12142                    PyUnicode_WRITE(okind, odata, o++, 'U');
12143                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12144                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12145                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12146                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12147                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12148                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12149                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12150                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12151                }
12152                /* Map 16-bit characters to '\uxxxx' */
12153                else {
12154                    PyUnicode_WRITE(okind, odata, o++, '\\');
12155                    PyUnicode_WRITE(okind, odata, o++, 'u');
12156                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12157                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12158                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12159                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12160                }
12161            }
12162            /* Copy characters as-is */
12163            else {
12164                PyUnicode_WRITE(okind, odata, o++, ch);
12165            }
12166        }
12167    }
12168    /* Closing quote already added at the beginning */
12169    assert(_PyUnicode_CheckConsistency(repr, 1));
12170    return repr;
12171}
12172
12173PyDoc_STRVAR(rfind__doc__,
12174             "S.rfind(sub[, start[, end]]) -> int\n\
12175\n\
12176Return the highest index in S where substring sub is found,\n\
12177such that sub is contained within S[start:end].  Optional\n\
12178arguments start and end are interpreted as in slice notation.\n\
12179\n\
12180Return -1 on failure.");
12181
12182static PyObject *
12183unicode_rfind(PyObject *self, PyObject *args)
12184{
12185    PyObject *substring;
12186    Py_ssize_t start;
12187    Py_ssize_t end;
12188    Py_ssize_t result;
12189
12190    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12191                                            &start, &end))
12192        return NULL;
12193
12194    if (PyUnicode_READY(self) == -1)
12195        return NULL;
12196    if (PyUnicode_READY(substring) == -1)
12197        return NULL;
12198
12199    result = any_find_slice(-1, self, substring, start, end);
12200
12201    Py_DECREF(substring);
12202
12203    if (result == -2)
12204        return NULL;
12205
12206    return PyLong_FromSsize_t(result);
12207}
12208
12209PyDoc_STRVAR(rindex__doc__,
12210             "S.rindex(sub[, start[, end]]) -> int\n\
12211\n\
12212Like S.rfind() but raise ValueError when the substring is not found.");
12213
12214static PyObject *
12215unicode_rindex(PyObject *self, PyObject *args)
12216{
12217    PyObject *substring;
12218    Py_ssize_t start;
12219    Py_ssize_t end;
12220    Py_ssize_t result;
12221
12222    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12223                                            &start, &end))
12224        return NULL;
12225
12226    if (PyUnicode_READY(self) == -1)
12227        return NULL;
12228    if (PyUnicode_READY(substring) == -1)
12229        return NULL;
12230
12231    result = any_find_slice(-1, self, substring, start, end);
12232
12233    Py_DECREF(substring);
12234
12235    if (result == -2)
12236        return NULL;
12237
12238    if (result < 0) {
12239        PyErr_SetString(PyExc_ValueError, "substring not found");
12240        return NULL;
12241    }
12242
12243    return PyLong_FromSsize_t(result);
12244}
12245
12246PyDoc_STRVAR(rjust__doc__,
12247             "S.rjust(width[, fillchar]) -> str\n\
12248\n\
12249Return S right-justified in a string of length width. Padding is\n\
12250done using the specified fill character (default is a space).");
12251
12252static PyObject *
12253unicode_rjust(PyObject *self, PyObject *args)
12254{
12255    Py_ssize_t width;
12256    Py_UCS4 fillchar = ' ';
12257
12258    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12259        return NULL;
12260
12261    if (PyUnicode_READY(self) == -1)
12262        return NULL;
12263
12264    if (PyUnicode_GET_LENGTH(self) >= width)
12265        return unicode_result_unchanged(self);
12266
12267    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12268}
12269
12270PyObject *
12271PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12272{
12273    PyObject *result;
12274
12275    s = PyUnicode_FromObject(s);
12276    if (s == NULL)
12277        return NULL;
12278    if (sep != NULL) {
12279        sep = PyUnicode_FromObject(sep);
12280        if (sep == NULL) {
12281            Py_DECREF(s);
12282            return NULL;
12283        }
12284    }
12285
12286    result = split(s, sep, maxsplit);
12287
12288    Py_DECREF(s);
12289    Py_XDECREF(sep);
12290    return result;
12291}
12292
12293PyDoc_STRVAR(split__doc__,
12294             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12295\n\
12296Return a list of the words in S, using sep as the\n\
12297delimiter string.  If maxsplit is given, at most maxsplit\n\
12298splits are done. If sep is not specified or is None, any\n\
12299whitespace string is a separator and empty strings are\n\
12300removed from the result.");
12301
12302static PyObject*
12303unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12304{
12305    static char *kwlist[] = {"sep", "maxsplit", 0};
12306    PyObject *substring = Py_None;
12307    Py_ssize_t maxcount = -1;
12308
12309    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12310                                     kwlist, &substring, &maxcount))
12311        return NULL;
12312
12313    if (substring == Py_None)
12314        return split(self, NULL, maxcount);
12315    else if (PyUnicode_Check(substring))
12316        return split(self, substring, maxcount);
12317    else
12318        return PyUnicode_Split(self, substring, maxcount);
12319}
12320
12321PyObject *
12322PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12323{
12324    PyObject* str_obj;
12325    PyObject* sep_obj;
12326    PyObject* out;
12327    int kind1, kind2, kind;
12328    void *buf1 = NULL, *buf2 = NULL;
12329    Py_ssize_t len1, len2;
12330
12331    str_obj = PyUnicode_FromObject(str_in);
12332    if (!str_obj)
12333        return NULL;
12334    sep_obj = PyUnicode_FromObject(sep_in);
12335    if (!sep_obj) {
12336        Py_DECREF(str_obj);
12337        return NULL;
12338    }
12339    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12340        Py_DECREF(sep_obj);
12341        Py_DECREF(str_obj);
12342        return NULL;
12343    }
12344
12345    kind1 = PyUnicode_KIND(str_obj);
12346    kind2 = PyUnicode_KIND(sep_obj);
12347    kind = Py_MAX(kind1, kind2);
12348    buf1 = PyUnicode_DATA(str_obj);
12349    if (kind1 != kind)
12350        buf1 = _PyUnicode_AsKind(str_obj, kind);
12351    if (!buf1)
12352        goto onError;
12353    buf2 = PyUnicode_DATA(sep_obj);
12354    if (kind2 != kind)
12355        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12356    if (!buf2)
12357        goto onError;
12358    len1 = PyUnicode_GET_LENGTH(str_obj);
12359    len2 = PyUnicode_GET_LENGTH(sep_obj);
12360
12361    switch (PyUnicode_KIND(str_obj)) {
12362    case PyUnicode_1BYTE_KIND:
12363        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12364            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12365        else
12366            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12367        break;
12368    case PyUnicode_2BYTE_KIND:
12369        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12370        break;
12371    case PyUnicode_4BYTE_KIND:
12372        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12373        break;
12374    default:
12375        assert(0);
12376        out = 0;
12377    }
12378
12379    Py_DECREF(sep_obj);
12380    Py_DECREF(str_obj);
12381    if (kind1 != kind)
12382        PyMem_Free(buf1);
12383    if (kind2 != kind)
12384        PyMem_Free(buf2);
12385
12386    return out;
12387  onError:
12388    Py_DECREF(sep_obj);
12389    Py_DECREF(str_obj);
12390    if (kind1 != kind && buf1)
12391        PyMem_Free(buf1);
12392    if (kind2 != kind && buf2)
12393        PyMem_Free(buf2);
12394    return NULL;
12395}
12396
12397
12398PyObject *
12399PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12400{
12401    PyObject* str_obj;
12402    PyObject* sep_obj;
12403    PyObject* out;
12404    int kind1, kind2, kind;
12405    void *buf1 = NULL, *buf2 = NULL;
12406    Py_ssize_t len1, len2;
12407
12408    str_obj = PyUnicode_FromObject(str_in);
12409    if (!str_obj)
12410        return NULL;
12411    sep_obj = PyUnicode_FromObject(sep_in);
12412    if (!sep_obj) {
12413        Py_DECREF(str_obj);
12414        return NULL;
12415    }
12416
12417    kind1 = PyUnicode_KIND(str_in);
12418    kind2 = PyUnicode_KIND(sep_obj);
12419    kind = Py_MAX(kind1, kind2);
12420    buf1 = PyUnicode_DATA(str_in);
12421    if (kind1 != kind)
12422        buf1 = _PyUnicode_AsKind(str_in, kind);
12423    if (!buf1)
12424        goto onError;
12425    buf2 = PyUnicode_DATA(sep_obj);
12426    if (kind2 != kind)
12427        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12428    if (!buf2)
12429        goto onError;
12430    len1 = PyUnicode_GET_LENGTH(str_obj);
12431    len2 = PyUnicode_GET_LENGTH(sep_obj);
12432
12433    switch (PyUnicode_KIND(str_in)) {
12434    case PyUnicode_1BYTE_KIND:
12435        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12436            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12437        else
12438            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12439        break;
12440    case PyUnicode_2BYTE_KIND:
12441        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12442        break;
12443    case PyUnicode_4BYTE_KIND:
12444        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12445        break;
12446    default:
12447        assert(0);
12448        out = 0;
12449    }
12450
12451    Py_DECREF(sep_obj);
12452    Py_DECREF(str_obj);
12453    if (kind1 != kind)
12454        PyMem_Free(buf1);
12455    if (kind2 != kind)
12456        PyMem_Free(buf2);
12457
12458    return out;
12459  onError:
12460    Py_DECREF(sep_obj);
12461    Py_DECREF(str_obj);
12462    if (kind1 != kind && buf1)
12463        PyMem_Free(buf1);
12464    if (kind2 != kind && buf2)
12465        PyMem_Free(buf2);
12466    return NULL;
12467}
12468
12469PyDoc_STRVAR(partition__doc__,
12470             "S.partition(sep) -> (head, sep, tail)\n\
12471\n\
12472Search for the separator sep in S, and return the part before it,\n\
12473the separator itself, and the part after it.  If the separator is not\n\
12474found, return S and two empty strings.");
12475
12476static PyObject*
12477unicode_partition(PyObject *self, PyObject *separator)
12478{
12479    return PyUnicode_Partition(self, separator);
12480}
12481
12482PyDoc_STRVAR(rpartition__doc__,
12483             "S.rpartition(sep) -> (head, sep, tail)\n\
12484\n\
12485Search for the separator sep in S, starting at the end of S, and return\n\
12486the part before it, the separator itself, and the part after it.  If the\n\
12487separator is not found, return two empty strings and S.");
12488
12489static PyObject*
12490unicode_rpartition(PyObject *self, PyObject *separator)
12491{
12492    return PyUnicode_RPartition(self, separator);
12493}
12494
12495PyObject *
12496PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12497{
12498    PyObject *result;
12499
12500    s = PyUnicode_FromObject(s);
12501    if (s == NULL)
12502        return NULL;
12503    if (sep != NULL) {
12504        sep = PyUnicode_FromObject(sep);
12505        if (sep == NULL) {
12506            Py_DECREF(s);
12507            return NULL;
12508        }
12509    }
12510
12511    result = rsplit(s, sep, maxsplit);
12512
12513    Py_DECREF(s);
12514    Py_XDECREF(sep);
12515    return result;
12516}
12517
12518PyDoc_STRVAR(rsplit__doc__,
12519             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12520\n\
12521Return a list of the words in S, using sep as the\n\
12522delimiter string, starting at the end of the string and\n\
12523working to the front.  If maxsplit is given, at most maxsplit\n\
12524splits are done. If sep is not specified, any whitespace string\n\
12525is a separator.");
12526
12527static PyObject*
12528unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12529{
12530    static char *kwlist[] = {"sep", "maxsplit", 0};
12531    PyObject *substring = Py_None;
12532    Py_ssize_t maxcount = -1;
12533
12534    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12535                                     kwlist, &substring, &maxcount))
12536        return NULL;
12537
12538    if (substring == Py_None)
12539        return rsplit(self, NULL, maxcount);
12540    else if (PyUnicode_Check(substring))
12541        return rsplit(self, substring, maxcount);
12542    else
12543        return PyUnicode_RSplit(self, substring, maxcount);
12544}
12545
12546PyDoc_STRVAR(splitlines__doc__,
12547             "S.splitlines([keepends]) -> list of strings\n\
12548\n\
12549Return a list of the lines in S, breaking at line boundaries.\n\
12550Line breaks are not included in the resulting list unless keepends\n\
12551is given and true.");
12552
12553static PyObject*
12554unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12555{
12556    static char *kwlist[] = {"keepends", 0};
12557    int keepends = 0;
12558
12559    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12560                                     kwlist, &keepends))
12561        return NULL;
12562
12563    return PyUnicode_Splitlines(self, keepends);
12564}
12565
12566static
12567PyObject *unicode_str(PyObject *self)
12568{
12569    return unicode_result_unchanged(self);
12570}
12571
12572PyDoc_STRVAR(swapcase__doc__,
12573             "S.swapcase() -> str\n\
12574\n\
12575Return a copy of S with uppercase characters converted to lowercase\n\
12576and vice versa.");
12577
12578static PyObject*
12579unicode_swapcase(PyObject *self)
12580{
12581    if (PyUnicode_READY(self) == -1)
12582        return NULL;
12583    return case_operation(self, do_swapcase);
12584}
12585
12586PyDoc_STRVAR(maketrans__doc__,
12587             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12588\n\
12589Return a translation table usable for str.translate().\n\
12590If there is only one argument, it must be a dictionary mapping Unicode\n\
12591ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12592Character keys will be then converted to ordinals.\n\
12593If there are two arguments, they must be strings of equal length, and\n\
12594in the resulting dictionary, each character in x will be mapped to the\n\
12595character at the same position in y. If there is a third argument, it\n\
12596must be a string, whose characters will be mapped to None in the result.");
12597
12598static PyObject*
12599unicode_maketrans(PyObject *null, PyObject *args)
12600{
12601    PyObject *x, *y = NULL, *z = NULL;
12602    PyObject *new = NULL, *key, *value;
12603    Py_ssize_t i = 0;
12604    int res;
12605
12606    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12607        return NULL;
12608    new = PyDict_New();
12609    if (!new)
12610        return NULL;
12611    if (y != NULL) {
12612        int x_kind, y_kind, z_kind;
12613        void *x_data, *y_data, *z_data;
12614
12615        /* x must be a string too, of equal length */
12616        if (!PyUnicode_Check(x)) {
12617            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12618                            "be a string if there is a second argument");
12619            goto err;
12620        }
12621        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12622            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12623                            "arguments must have equal length");
12624            goto err;
12625        }
12626        /* create entries for translating chars in x to those in y */
12627        x_kind = PyUnicode_KIND(x);
12628        y_kind = PyUnicode_KIND(y);
12629        x_data = PyUnicode_DATA(x);
12630        y_data = PyUnicode_DATA(y);
12631        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12632            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12633            if (!key)
12634                goto err;
12635            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12636            if (!value) {
12637                Py_DECREF(key);
12638                goto err;
12639            }
12640            res = PyDict_SetItem(new, key, value);
12641            Py_DECREF(key);
12642            Py_DECREF(value);
12643            if (res < 0)
12644                goto err;
12645        }
12646        /* create entries for deleting chars in z */
12647        if (z != NULL) {
12648            z_kind = PyUnicode_KIND(z);
12649            z_data = PyUnicode_DATA(z);
12650            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12651                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12652                if (!key)
12653                    goto err;
12654                res = PyDict_SetItem(new, key, Py_None);
12655                Py_DECREF(key);
12656                if (res < 0)
12657                    goto err;
12658            }
12659        }
12660    } else {
12661        int kind;
12662        void *data;
12663
12664        /* x must be a dict */
12665        if (!PyDict_CheckExact(x)) {
12666            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12667                            "to maketrans it must be a dict");
12668            goto err;
12669        }
12670        /* copy entries into the new dict, converting string keys to int keys */
12671        while (PyDict_Next(x, &i, &key, &value)) {
12672            if (PyUnicode_Check(key)) {
12673                /* convert string keys to integer keys */
12674                PyObject *newkey;
12675                if (PyUnicode_GET_LENGTH(key) != 1) {
12676                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12677                                    "table must be of length 1");
12678                    goto err;
12679                }
12680                kind = PyUnicode_KIND(key);
12681                data = PyUnicode_DATA(key);
12682                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12683                if (!newkey)
12684                    goto err;
12685                res = PyDict_SetItem(new, newkey, value);
12686                Py_DECREF(newkey);
12687                if (res < 0)
12688                    goto err;
12689            } else if (PyLong_Check(key)) {
12690                /* just keep integer keys */
12691                if (PyDict_SetItem(new, key, value) < 0)
12692                    goto err;
12693            } else {
12694                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12695                                "be strings or integers");
12696                goto err;
12697            }
12698        }
12699    }
12700    return new;
12701  err:
12702    Py_DECREF(new);
12703    return NULL;
12704}
12705
12706PyDoc_STRVAR(translate__doc__,
12707             "S.translate(table) -> str\n\
12708\n\
12709Return a copy of the string S, where all characters have been mapped\n\
12710through the given translation table, which must be a mapping of\n\
12711Unicode ordinals to Unicode ordinals, strings, or None.\n\
12712Unmapped characters are left untouched. Characters mapped to None\n\
12713are deleted.");
12714
12715static PyObject*
12716unicode_translate(PyObject *self, PyObject *table)
12717{
12718    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12719}
12720
12721PyDoc_STRVAR(upper__doc__,
12722             "S.upper() -> str\n\
12723\n\
12724Return a copy of S converted to uppercase.");
12725
12726static PyObject*
12727unicode_upper(PyObject *self)
12728{
12729    if (PyUnicode_READY(self) == -1)
12730        return NULL;
12731    if (PyUnicode_IS_ASCII(self))
12732        return ascii_upper_or_lower(self, 0);
12733    return case_operation(self, do_upper);
12734}
12735
12736PyDoc_STRVAR(zfill__doc__,
12737             "S.zfill(width) -> str\n\
12738\n\
12739Pad a numeric string S with zeros on the left, to fill a field\n\
12740of the specified width. The string S is never truncated.");
12741
12742static PyObject *
12743unicode_zfill(PyObject *self, PyObject *args)
12744{
12745    Py_ssize_t fill;
12746    PyObject *u;
12747    Py_ssize_t width;
12748    int kind;
12749    void *data;
12750    Py_UCS4 chr;
12751
12752    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12753        return NULL;
12754
12755    if (PyUnicode_READY(self) == -1)
12756        return NULL;
12757
12758    if (PyUnicode_GET_LENGTH(self) >= width)
12759        return unicode_result_unchanged(self);
12760
12761    fill = width - PyUnicode_GET_LENGTH(self);
12762
12763    u = pad(self, fill, 0, '0');
12764
12765    if (u == NULL)
12766        return NULL;
12767
12768    kind = PyUnicode_KIND(u);
12769    data = PyUnicode_DATA(u);
12770    chr = PyUnicode_READ(kind, data, fill);
12771
12772    if (chr == '+' || chr == '-') {
12773        /* move sign to beginning of string */
12774        PyUnicode_WRITE(kind, data, 0, chr);
12775        PyUnicode_WRITE(kind, data, fill, '0');
12776    }
12777
12778    assert(_PyUnicode_CheckConsistency(u, 1));
12779    return u;
12780}
12781
#if 0
/* Disabled at compile time by the surrounding #if 0.
   Forwards to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12789
12790PyDoc_STRVAR(startswith__doc__,
12791             "S.startswith(prefix[, start[, end]]) -> bool\n\
12792\n\
12793Return True if S starts with the specified prefix, False otherwise.\n\
12794With optional start, test S beginning at that position.\n\
12795With optional end, stop comparing S at that position.\n\
12796prefix can also be a tuple of strings to try.");
12797
12798static PyObject *
12799unicode_startswith(PyObject *self,
12800                   PyObject *args)
12801{
12802    PyObject *subobj;
12803    PyObject *substring;
12804    Py_ssize_t start = 0;
12805    Py_ssize_t end = PY_SSIZE_T_MAX;
12806    int result;
12807
12808    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12809        return NULL;
12810    if (PyTuple_Check(subobj)) {
12811        Py_ssize_t i;
12812        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12813            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12814            if (substring == NULL)
12815                return NULL;
12816            result = tailmatch(self, substring, start, end, -1);
12817            Py_DECREF(substring);
12818            if (result) {
12819                Py_RETURN_TRUE;
12820            }
12821        }
12822        /* nothing matched */
12823        Py_RETURN_FALSE;
12824    }
12825    substring = PyUnicode_FromObject(subobj);
12826    if (substring == NULL) {
12827        if (PyErr_ExceptionMatches(PyExc_TypeError))
12828            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12829                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12830        return NULL;
12831    }
12832    result = tailmatch(self, substring, start, end, -1);
12833    Py_DECREF(substring);
12834    return PyBool_FromLong(result);
12835}
12836
12837
12838PyDoc_STRVAR(endswith__doc__,
12839             "S.endswith(suffix[, start[, end]]) -> bool\n\
12840\n\
12841Return True if S ends with the specified suffix, False otherwise.\n\
12842With optional start, test S beginning at that position.\n\
12843With optional end, stop comparing S at that position.\n\
12844suffix can also be a tuple of strings to try.");
12845
12846static PyObject *
12847unicode_endswith(PyObject *self,
12848                 PyObject *args)
12849{
12850    PyObject *subobj;
12851    PyObject *substring;
12852    Py_ssize_t start = 0;
12853    Py_ssize_t end = PY_SSIZE_T_MAX;
12854    int result;
12855
12856    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12857        return NULL;
12858    if (PyTuple_Check(subobj)) {
12859        Py_ssize_t i;
12860        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12861            substring = PyUnicode_FromObject(
12862                PyTuple_GET_ITEM(subobj, i));
12863            if (substring == NULL)
12864                return NULL;
12865            result = tailmatch(self, substring, start, end, +1);
12866            Py_DECREF(substring);
12867            if (result) {
12868                Py_RETURN_TRUE;
12869            }
12870        }
12871        Py_RETURN_FALSE;
12872    }
12873    substring = PyUnicode_FromObject(subobj);
12874    if (substring == NULL) {
12875        if (PyErr_ExceptionMatches(PyExc_TypeError))
12876            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12877                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12878        return NULL;
12879    }
12880    result = tailmatch(self, substring, start, end, +1);
12881    Py_DECREF(substring);
12882    return PyBool_FromLong(result);
12883}
12884
12885typedef struct {
12886    PyObject *buffer;
12887    void *data;
12888    enum PyUnicode_Kind kind;
12889    Py_UCS4 maxchar;
12890    Py_ssize_t pos;
12891} _PyUnicodeWriter ;
12892
12893Py_LOCAL_INLINE(void)
12894_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
12895{
12896    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12897    writer->data = PyUnicode_DATA(writer->buffer);
12898    writer->kind = PyUnicode_KIND(writer->buffer);
12899}
12900
12901Py_LOCAL(int)
12902_PyUnicodeWriter_Init(_PyUnicodeWriter *writer,
12903                    Py_ssize_t length, Py_UCS4 maxchar)
12904{
12905    writer->pos = 0;
12906    writer->buffer = PyUnicode_New(length, maxchar);
12907    if (writer->buffer == NULL)
12908        return -1;
12909    _PyUnicodeWriter_Update(writer);
12910    return 0;
12911}
12912
12913Py_LOCAL_INLINE(int)
12914_PyUnicodeWriter_Prepare(_PyUnicodeWriter *writer,
12915                       Py_ssize_t length, Py_UCS4 maxchar)
12916{
12917    Py_ssize_t newlen;
12918    PyObject *newbuffer;
12919
12920    if (length > PY_SSIZE_T_MAX - writer->pos) {
12921        PyErr_NoMemory();
12922        return -1;
12923    }
12924    newlen = writer->pos + length;
12925
12926    if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) {
12927        /* overallocate 25% to limit the number of resize */
12928        if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12929            newlen += newlen / 4;
12930
12931        if (maxchar > writer->maxchar) {
12932            /* resize + widen */
12933            newbuffer = PyUnicode_New(newlen, maxchar);
12934            if (newbuffer == NULL)
12935                return -1;
12936            PyUnicode_CopyCharacters(newbuffer, 0,
12937                                     writer->buffer, 0, writer->pos);
12938            Py_DECREF(writer->buffer);
12939        }
12940        else {
12941            newbuffer = resize_compact(writer->buffer, newlen);
12942            if (newbuffer == NULL)
12943                return -1;
12944        }
12945        writer->buffer = newbuffer;
12946        _PyUnicodeWriter_Update(writer);
12947    }
12948    else if (maxchar > writer->maxchar) {
12949        if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
12950            return -1;
12951        _PyUnicodeWriter_Update(writer);
12952    }
12953    return 0;
12954}
12955
12956Py_LOCAL(PyObject *)
12957_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
12958{
12959    if (PyUnicode_Resize(&writer->buffer, writer->pos) < 0) {
12960        Py_DECREF(writer->buffer);
12961        return NULL;
12962    }
12963    assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
12964    return writer->buffer;
12965}
12966
12967Py_LOCAL(void)
12968_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
12969{
12970    Py_CLEAR(writer->buffer);
12971}
12972
12973#include "stringlib/unicode_format.h"
12974
12975PyDoc_STRVAR(format__doc__,
12976             "S.format(*args, **kwargs) -> str\n\
12977\n\
12978Return a formatted version of S, using substitutions from args and kwargs.\n\
12979The substitutions are identified by braces ('{' and '}').");
12980
12981PyDoc_STRVAR(format_map__doc__,
12982             "S.format_map(mapping) -> str\n\
12983\n\
12984Return a formatted version of S, using substitutions from mapping.\n\
12985The substitutions are identified by braces ('{' and '}').");
12986
12987static PyObject *
12988unicode__format__(PyObject* self, PyObject* args)
12989{
12990    PyObject *format_spec, *out;
12991
12992    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12993        return NULL;
12994
12995    out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
12996                                     PyUnicode_GET_LENGTH(format_spec));
12997    return out;
12998}
12999
13000PyDoc_STRVAR(p_format__doc__,
13001             "S.__format__(format_spec) -> str\n\
13002\n\
13003Return a formatted version of S as described by format_spec.");
13004
13005static PyObject *
13006unicode__sizeof__(PyObject *v)
13007{
13008    Py_ssize_t size;
13009
13010    /* If it's a compact object, account for base structure +
13011       character data. */
13012    if (PyUnicode_IS_COMPACT_ASCII(v))
13013        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13014    else if (PyUnicode_IS_COMPACT(v))
13015        size = sizeof(PyCompactUnicodeObject) +
13016            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13017    else {
13018        /* If it is a two-block object, account for base object, and
13019           for character block if present. */
13020        size = sizeof(PyUnicodeObject);
13021        if (_PyUnicode_DATA_ANY(v))
13022            size += (PyUnicode_GET_LENGTH(v) + 1) *
13023                PyUnicode_KIND(v);
13024    }
13025    /* If the wstr pointer is present, account for it unless it is shared
13026       with the data pointer. Check if the data is not shared. */
13027    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13028        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13029    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13030        size += PyUnicode_UTF8_LENGTH(v) + 1;
13031
13032    return PyLong_FromSsize_t(size);
13033}
13034
13035PyDoc_STRVAR(sizeof__doc__,
13036             "S.__sizeof__() -> size of S in memory, in bytes");
13037
13038static PyObject *
13039unicode_getnewargs(PyObject *v)
13040{
13041    PyObject *copy = _PyUnicode_Copy(v);
13042    if (!copy)
13043        return NULL;
13044    return Py_BuildValue("(N)", copy);
13045}
13046
13047static PyMethodDef unicode_methods[] = {
13048    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
13049    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13050    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13051    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
13052    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13053    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
13054    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
13055    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13056    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13057    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13058    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13059    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13060    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
13061    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13062    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13063    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
13064    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
13065    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13066    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13067    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
13068    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
13069    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
13070    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
13071    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
13072    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13073    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13074    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13075    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13076    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13077    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13078    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13079    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13080    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13081    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13082    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13083    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13084    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13085    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
13086    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
13087    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
13088    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
13089    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13090    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13091    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
13092    {"maketrans", (PyCFunction) unicode_maketrans,
13093     METH_VARARGS | METH_STATIC, maketrans__doc__},
13094    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
13095#if 0
13096    /* These methods are just used for debugging the implementation. */
13097    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13098#endif
13099
13100    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13101    {NULL, NULL}
13102};
13103
13104static PyObject *
13105unicode_mod(PyObject *v, PyObject *w)
13106{
13107    if (!PyUnicode_Check(v))
13108        Py_RETURN_NOTIMPLEMENTED;
13109    return PyUnicode_Format(v, w);
13110}
13111
13112static PyNumberMethods unicode_as_number = {
13113    0,              /*nb_add*/
13114    0,              /*nb_subtract*/
13115    0,              /*nb_multiply*/
13116    unicode_mod,            /*nb_remainder*/
13117};
13118
13119static PySequenceMethods unicode_as_sequence = {
13120    (lenfunc) unicode_length,       /* sq_length */
13121    PyUnicode_Concat,           /* sq_concat */
13122    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13123    (ssizeargfunc) unicode_getitem,     /* sq_item */
13124    0,                  /* sq_slice */
13125    0,                  /* sq_ass_item */
13126    0,                  /* sq_ass_slice */
13127    PyUnicode_Contains,         /* sq_contains */
13128};
13129
13130static PyObject*
13131unicode_subscript(PyObject* self, PyObject* item)
13132{
13133    if (PyUnicode_READY(self) == -1)
13134        return NULL;
13135
13136    if (PyIndex_Check(item)) {
13137        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13138        if (i == -1 && PyErr_Occurred())
13139            return NULL;
13140        if (i < 0)
13141            i += PyUnicode_GET_LENGTH(self);
13142        return unicode_getitem(self, i);
13143    } else if (PySlice_Check(item)) {
13144        Py_ssize_t start, stop, step, slicelength, cur, i;
13145        PyObject *result;
13146        void *src_data, *dest_data;
13147        int src_kind, dest_kind;
13148        Py_UCS4 ch, max_char, kind_limit;
13149
13150        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13151                                 &start, &stop, &step, &slicelength) < 0) {
13152            return NULL;
13153        }
13154
13155        if (slicelength <= 0) {
13156            Py_INCREF(unicode_empty);
13157            return unicode_empty;
13158        } else if (start == 0 && step == 1 &&
13159                   slicelength == PyUnicode_GET_LENGTH(self)) {
13160            return unicode_result_unchanged(self);
13161        } else if (step == 1) {
13162            return PyUnicode_Substring(self,
13163                                       start, start + slicelength);
13164        }
13165        /* General case */
13166        src_kind = PyUnicode_KIND(self);
13167        src_data = PyUnicode_DATA(self);
13168        if (!PyUnicode_IS_ASCII(self)) {
13169            kind_limit = kind_maxchar_limit(src_kind);
13170            max_char = 0;
13171            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13172                ch = PyUnicode_READ(src_kind, src_data, cur);
13173                if (ch > max_char) {
13174                    max_char = ch;
13175                    if (max_char >= kind_limit)
13176                        break;
13177                }
13178            }
13179        }
13180        else
13181            max_char = 127;
13182        result = PyUnicode_New(slicelength, max_char);
13183        if (result == NULL)
13184            return NULL;
13185        dest_kind = PyUnicode_KIND(result);
13186        dest_data = PyUnicode_DATA(result);
13187
13188        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13189            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13190            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13191        }
13192        assert(_PyUnicode_CheckConsistency(result, 1));
13193        return result;
13194    } else {
13195        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13196        return NULL;
13197    }
13198}
13199
13200static PyMappingMethods unicode_as_mapping = {
13201    (lenfunc)unicode_length,        /* mp_length */
13202    (binaryfunc)unicode_subscript,  /* mp_subscript */
13203    (objobjargproc)0,           /* mp_ass_subscript */
13204};
13205
13206
13207/* Helpers for PyUnicode_Format() */
13208
13209static PyObject *
13210getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
13211{
13212    Py_ssize_t argidx = *p_argidx;
13213    if (argidx < arglen) {
13214        (*p_argidx)++;
13215        if (arglen < 0)
13216            return args;
13217        else
13218            return PyTuple_GetItem(args, argidx);
13219    }
13220    PyErr_SetString(PyExc_TypeError,
13221                    "not enough arguments for format string");
13222    return NULL;
13223}
13224
13225/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13226
13227static PyObject *
13228formatfloat(PyObject *v, int flags, int prec, int type)
13229{
13230    char *p;
13231    PyObject *result;
13232    double x;
13233
13234    x = PyFloat_AsDouble(v);
13235    if (x == -1.0 && PyErr_Occurred())
13236        return NULL;
13237
13238    if (prec < 0)
13239        prec = 6;
13240
13241    p = PyOS_double_to_string(x, type, prec,
13242                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
13243    if (p == NULL)
13244        return NULL;
13245    result = unicode_fromascii((unsigned char*)p, strlen(p));
13246    PyMem_Free(p);
13247    return result;
13248}
13249
13250/* formatlong() emulates the format codes d, u, o, x and X, and
13251 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13252 * Python's regular ints.
13253 * Return value:  a new PyUnicodeObject*, or NULL if error.
13254 *     The output string is of the form
13255 *         "-"? ("0x" | "0X")? digit+
13256 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13257 *         set in flags.  The case of hex digits will be correct,
13258 *     There will be at least prec digits, zero-filled on the left if
13259 *         necessary to get that many.
13260 * val          object to be converted
13261 * flags        bitmask of format flags; only F_ALT is looked at
13262 * prec         minimum number of digits; 0-fill on left if needed
13263 * type         a character in [duoxX]; u acts the same as d
13264 *
13265 * CAUTION:  o, x and X conversions on regular ints can never
13266 * produce a '-' sign, but can for Python's unbounded ints.
13267 */
13268static PyObject*
13269formatlong(PyObject *val, int flags, int prec, int type)
13270{
13271    PyObject *result = NULL;
13272    char *buf;
13273    Py_ssize_t i;
13274    int sign;           /* 1 if '-', else 0 */
13275    int len;            /* number of characters */
13276    Py_ssize_t llen;
13277    int numdigits;      /* len == numnondigits + numdigits */
13278    int numnondigits = 0;
13279
13280    /* Avoid exceeding SSIZE_T_MAX */
13281    if (prec > INT_MAX-3) {
13282        PyErr_SetString(PyExc_OverflowError,
13283                        "precision too large");
13284        return NULL;
13285    }
13286
13287    assert(PyLong_Check(val));
13288
13289    switch (type) {
13290    case 'd':
13291    case 'u':
13292        /* Special-case boolean: we want 0/1 */
13293        if (PyBool_Check(val))
13294            result = PyNumber_ToBase(val, 10);
13295        else
13296            result = Py_TYPE(val)->tp_str(val);
13297        break;
13298    case 'o':
13299        numnondigits = 2;
13300        result = PyNumber_ToBase(val, 8);
13301        break;
13302    case 'x':
13303    case 'X':
13304        numnondigits = 2;
13305        result = PyNumber_ToBase(val, 16);
13306        break;
13307    default:
13308        assert(!"'type' not in [duoxX]");
13309    }
13310    if (!result)
13311        return NULL;
13312
13313    assert(unicode_modifiable(result));
13314    assert(PyUnicode_IS_READY(result));
13315    assert(PyUnicode_IS_ASCII(result));
13316
13317    /* To modify the string in-place, there can only be one reference. */
13318    if (Py_REFCNT(result) != 1) {
13319        PyErr_BadInternalCall();
13320        return NULL;
13321    }
13322    buf = PyUnicode_DATA(result);
13323    llen = PyUnicode_GET_LENGTH(result);
13324    if (llen > INT_MAX) {
13325        PyErr_SetString(PyExc_ValueError,
13326                        "string too large in _PyBytes_FormatLong");
13327        return NULL;
13328    }
13329    len = (int)llen;
13330    sign = buf[0] == '-';
13331    numnondigits += sign;
13332    numdigits = len - numnondigits;
13333    assert(numdigits > 0);
13334
13335    /* Get rid of base marker unless F_ALT */
13336    if (((flags & F_ALT) == 0 &&
13337        (type == 'o' || type == 'x' || type == 'X'))) {
13338        assert(buf[sign] == '0');
13339        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13340               buf[sign+1] == 'o');
13341        numnondigits -= 2;
13342        buf += 2;
13343        len -= 2;
13344        if (sign)
13345            buf[0] = '-';
13346        assert(len == numnondigits + numdigits);
13347        assert(numdigits > 0);
13348    }
13349
13350    /* Fill with leading zeroes to meet minimum width. */
13351    if (prec > numdigits) {
13352        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13353                                numnondigits + prec);
13354        char *b1;
13355        if (!r1) {
13356            Py_DECREF(result);
13357            return NULL;
13358        }
13359        b1 = PyBytes_AS_STRING(r1);
13360        for (i = 0; i < numnondigits; ++i)
13361            *b1++ = *buf++;
13362        for (i = 0; i < prec - numdigits; i++)
13363            *b1++ = '0';
13364        for (i = 0; i < numdigits; i++)
13365            *b1++ = *buf++;
13366        *b1 = '\0';
13367        Py_DECREF(result);
13368        result = r1;
13369        buf = PyBytes_AS_STRING(result);
13370        len = numnondigits + prec;
13371    }
13372
13373    /* Fix up case for hex conversions. */
13374    if (type == 'X') {
13375        /* Need to convert all lower case letters to upper case.
13376           and need to convert 0x to 0X (and -0x to -0X). */
13377        for (i = 0; i < len; i++)
13378            if (buf[i] >= 'a' && buf[i] <= 'x')
13379                buf[i] -= 'a'-'A';
13380    }
13381    if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13382        PyObject *unicode;
13383        unicode = unicode_fromascii((unsigned char *)buf, len);
13384        Py_DECREF(result);
13385        result = unicode;
13386    }
13387    return result;
13388}
13389
13390static Py_UCS4
13391formatchar(PyObject *v)
13392{
13393    /* presume that the buffer is at least 3 characters long */
13394    if (PyUnicode_Check(v)) {
13395        if (PyUnicode_GET_LENGTH(v) == 1) {
13396            return PyUnicode_READ_CHAR(v, 0);
13397        }
13398        goto onError;
13399    }
13400    else {
13401        /* Integer input truncated to a character */
13402        long x;
13403        x = PyLong_AsLong(v);
13404        if (x == -1 && PyErr_Occurred())
13405            goto onError;
13406
13407        if (x < 0 || x > MAX_UNICODE) {
13408            PyErr_SetString(PyExc_OverflowError,
13409                            "%c arg not in range(0x110000)");
13410            return (Py_UCS4) -1;
13411        }
13412
13413        return (Py_UCS4) x;
13414    }
13415
13416  onError:
13417    PyErr_SetString(PyExc_TypeError,
13418                    "%c requires int or char");
13419    return (Py_UCS4) -1;
13420}
13421
13422PyObject *
13423PyUnicode_Format(PyObject *format, PyObject *args)
13424{
13425    Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
13426    int args_owned = 0;
13427    PyObject *dict = NULL;
13428    PyObject *temp = NULL;
13429    PyObject *second = NULL;
13430    PyObject *uformat;
13431    void *fmt;
13432    enum PyUnicode_Kind kind, fmtkind;
13433    _PyUnicodeWriter writer;
13434    Py_ssize_t sublen;
13435    Py_UCS4 maxchar;
13436
13437    if (format == NULL || args == NULL) {
13438        PyErr_BadInternalCall();
13439        return NULL;
13440    }
13441    uformat = PyUnicode_FromObject(format);
13442    if (uformat == NULL)
13443        return NULL;
13444    if (PyUnicode_READY(uformat) == -1)
13445        Py_DECREF(uformat);
13446
13447    fmt = PyUnicode_DATA(uformat);
13448    fmtkind = PyUnicode_KIND(uformat);
13449    fmtcnt = PyUnicode_GET_LENGTH(uformat);
13450    fmtpos = 0;
13451
13452    if (_PyUnicodeWriter_Init(&writer, fmtcnt + 100, 127) < 0)
13453        goto onError;
13454
13455    if (PyTuple_Check(args)) {
13456        arglen = PyTuple_Size(args);
13457        argidx = 0;
13458    }
13459    else {
13460        arglen = -1;
13461        argidx = -2;
13462    }
13463    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
13464        !PyUnicode_Check(args))
13465        dict = args;
13466
13467    while (--fmtcnt >= 0) {
13468        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13469            Py_ssize_t nonfmtpos;
13470            nonfmtpos = fmtpos++;
13471            while (fmtcnt >= 0 &&
13472                   PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13473                fmtpos++;
13474                fmtcnt--;
13475            }
13476            if (fmtcnt < 0)
13477                fmtpos--;
13478            sublen = fmtpos - nonfmtpos;
13479            maxchar = _PyUnicode_FindMaxChar(uformat,
13480                                             nonfmtpos, nonfmtpos + sublen);
13481            if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
13482                goto onError;
13483
13484            copy_characters(writer.buffer, writer.pos,
13485                            uformat, nonfmtpos, sublen);
13486            writer.pos += sublen;
13487        }
13488        else {
13489            /* Got a format specifier */
13490            int flags = 0;
13491            Py_ssize_t width = -1;
13492            int prec = -1;
13493            Py_UCS4 c = '\0';
13494            Py_UCS4 fill;
13495            int sign;
13496            Py_UCS4 signchar;
13497            int isnumok;
13498            PyObject *v = NULL;
13499            void *pbuf = NULL;
13500            Py_ssize_t pindex, len;
13501            Py_UCS4 bufmaxchar;
13502            Py_ssize_t buflen;
13503
13504            fmtpos++;
13505            c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13506            if (c == '(') {
13507                Py_ssize_t keystart;
13508                Py_ssize_t keylen;
13509                PyObject *key;
13510                int pcount = 1;
13511
13512                if (dict == NULL) {
13513                    PyErr_SetString(PyExc_TypeError,
13514                                    "format requires a mapping");
13515                    goto onError;
13516                }
13517                ++fmtpos;
13518                --fmtcnt;
13519                keystart = fmtpos;
13520                /* Skip over balanced parentheses */
13521                while (pcount > 0 && --fmtcnt >= 0) {
13522                    c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13523                    if (c == ')')
13524                        --pcount;
13525                    else if (c == '(')
13526                        ++pcount;
13527                    fmtpos++;
13528                }
13529                keylen = fmtpos - keystart - 1;
13530                if (fmtcnt < 0 || pcount > 0) {
13531                    PyErr_SetString(PyExc_ValueError,
13532                                    "incomplete format key");
13533                    goto onError;
13534                }
13535                key = PyUnicode_Substring(uformat,
13536                                          keystart, keystart + keylen);
13537                if (key == NULL)
13538                    goto onError;
13539                if (args_owned) {
13540                    Py_DECREF(args);
13541                    args_owned = 0;
13542                }
13543                args = PyObject_GetItem(dict, key);
13544                Py_DECREF(key);
13545                if (args == NULL) {
13546                    goto onError;
13547                }
13548                args_owned = 1;
13549                arglen = -1;
13550                argidx = -2;
13551            }
13552            while (--fmtcnt >= 0) {
13553                c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13554                switch (c) {
13555                case '-': flags |= F_LJUST; continue;
13556                case '+': flags |= F_SIGN; continue;
13557                case ' ': flags |= F_BLANK; continue;
13558                case '#': flags |= F_ALT; continue;
13559                case '0': flags |= F_ZERO; continue;
13560                }
13561                break;
13562            }
13563            if (c == '*') {
13564                v = getnextarg(args, arglen, &argidx);
13565                if (v == NULL)
13566                    goto onError;
13567                if (!PyLong_Check(v)) {
13568                    PyErr_SetString(PyExc_TypeError,
13569                                    "* wants int");
13570                    goto onError;
13571                }
13572                width = PyLong_AsLong(v);
13573                if (width == -1 && PyErr_Occurred())
13574                    goto onError;
13575                if (width < 0) {
13576                    flags |= F_LJUST;
13577                    width = -width;
13578                }
13579                if (--fmtcnt >= 0)
13580                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13581            }
13582            else if (c >= '0' && c <= '9') {
13583                width = c - '0';
13584                while (--fmtcnt >= 0) {
13585                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13586                    if (c < '0' || c > '9')
13587                        break;
13588                    if (width > (PY_SSIZE_T_MAX - (c - '0')) / 10) {
13589                        PyErr_SetString(PyExc_ValueError,
13590                                        "width too big");
13591                        goto onError;
13592                    }
13593                    width = width*10 + (c - '0');
13594                }
13595            }
13596            if (c == '.') {
13597                prec = 0;
13598                if (--fmtcnt >= 0)
13599                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13600                if (c == '*') {
13601                    v = getnextarg(args, arglen, &argidx);
13602                    if (v == NULL)
13603                        goto onError;
13604                    if (!PyLong_Check(v)) {
13605                        PyErr_SetString(PyExc_TypeError,
13606                                        "* wants int");
13607                        goto onError;
13608                    }
13609                    prec = PyLong_AsLong(v);
13610                    if (prec == -1 && PyErr_Occurred())
13611                        goto onError;
13612                    if (prec < 0)
13613                        prec = 0;
13614                    if (--fmtcnt >= 0)
13615                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13616                }
13617                else if (c >= '0' && c <= '9') {
13618                    prec = c - '0';
13619                    while (--fmtcnt >= 0) {
13620                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13621                        if (c < '0' || c > '9')
13622                            break;
13623                        if (prec > (INT_MAX - (c - '0')) / 10) {
13624                            PyErr_SetString(PyExc_ValueError,
13625                                            "prec too big");
13626                            goto onError;
13627                        }
13628                        prec = prec*10 + (c - '0');
13629                    }
13630                }
13631            } /* prec */
13632            if (fmtcnt >= 0) {
13633                if (c == 'h' || c == 'l' || c == 'L') {
13634                    if (--fmtcnt >= 0)
13635                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13636                }
13637            }
13638            if (fmtcnt < 0) {
13639                PyErr_SetString(PyExc_ValueError,
13640                                "incomplete format");
13641                goto onError;
13642            }
13643
13644            if (c == '%') {
13645                if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
13646                    goto onError;
13647                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13648                writer.pos += 1;
13649                continue;
13650            }
13651
13652
13653            v = getnextarg(args, arglen, &argidx);
13654            if (v == NULL)
13655                goto onError;
13656
13657            sign = 0;
13658            signchar = '\0';
13659            fill = ' ';
13660            switch (c) {
13661
13662            case 's':
13663            case 'r':
13664            case 'a':
13665                if (PyUnicode_CheckExact(v) && c == 's') {
13666                    temp = v;
13667                    Py_INCREF(temp);
13668                }
13669                else {
13670                    if (c == 's')
13671                        temp = PyObject_Str(v);
13672                    else if (c == 'r')
13673                        temp = PyObject_Repr(v);
13674                    else
13675                        temp = PyObject_ASCII(v);
13676                }
13677                break;
13678
13679            case 'i':
13680            case 'd':
13681            case 'u':
13682            case 'o':
13683            case 'x':
13684            case 'X':
13685                isnumok = 0;
13686                if (PyNumber_Check(v)) {
13687                    PyObject *iobj=NULL;
13688
13689                    if (PyLong_Check(v)) {
13690                        iobj = v;
13691                        Py_INCREF(iobj);
13692                    }
13693                    else {
13694                        iobj = PyNumber_Long(v);
13695                    }
13696                    if (iobj!=NULL) {
13697                        if (PyLong_Check(iobj)) {
13698                            isnumok = 1;
13699                            sign = 1;
13700                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
13701                            Py_DECREF(iobj);
13702                        }
13703                        else {
13704                            Py_DECREF(iobj);
13705                        }
13706                    }
13707                }
13708                if (!isnumok) {
13709                    PyErr_Format(PyExc_TypeError,
13710                                 "%%%c format: a number is required, "
13711                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13712                    goto onError;
13713                }
13714                if (flags & F_ZERO)
13715                    fill = '0';
13716                break;
13717
13718            case 'e':
13719            case 'E':
13720            case 'f':
13721            case 'F':
13722            case 'g':
13723            case 'G':
13724                sign = 1;
13725                if (flags & F_ZERO)
13726                    fill = '0';
13727                temp = formatfloat(v, flags, prec, c);
13728                break;
13729
13730            case 'c':
13731            {
13732                Py_UCS4 ch = formatchar(v);
13733                if (ch == (Py_UCS4) -1)
13734                    goto onError;
13735                temp = PyUnicode_FromOrdinal(ch);
13736                break;
13737            }
13738
13739            default:
13740                PyErr_Format(PyExc_ValueError,
13741                             "unsupported format character '%c' (0x%x) "
13742                             "at index %zd",
13743                             (31<=c && c<=126) ? (char)c : '?',
13744                             (int)c,
13745                             fmtpos - 1);
13746                goto onError;
13747            }
13748            if (temp == NULL)
13749                goto onError;
13750            assert (PyUnicode_Check(temp));
13751            if (PyUnicode_READY(temp) == -1) {
13752                Py_CLEAR(temp);
13753                goto onError;
13754            }
13755            kind = PyUnicode_KIND(temp);
13756            pbuf = PyUnicode_DATA(temp);
13757            len = PyUnicode_GET_LENGTH(temp);
13758
13759            if (c == 's' || c == 'r' || c == 'a') {
13760                if (prec >= 0 && len > prec)
13761                    len = prec;
13762            }
13763
13764            /* pbuf is initialized here. */
13765            pindex = 0;
13766            if (sign) {
13767                Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13768                if (ch == '-' || ch == '+') {
13769                    signchar = ch;
13770                    len--;
13771                    pindex++;
13772                }
13773                else if (flags & F_SIGN)
13774                    signchar = '+';
13775                else if (flags & F_BLANK)
13776                    signchar = ' ';
13777                else
13778                    sign = 0;
13779            }
13780            if (width < len)
13781                width = len;
13782
13783            /* Compute the length and maximum character of the
13784               written characters */
13785            bufmaxchar = 127;
13786            if (!(flags & F_LJUST)) {
13787                if (sign) {
13788                    if ((width-1) > len)
13789                        bufmaxchar = Py_MAX(bufmaxchar, fill);
13790                }
13791                else {
13792                    if (width > len)
13793                        bufmaxchar = Py_MAX(bufmaxchar, fill);
13794                }
13795            }
13796            maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
13797            bufmaxchar = Py_MAX(bufmaxchar, maxchar);
13798
13799            buflen = width;
13800            if (sign && len == width)
13801                buflen++;
13802
13803            if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
13804                goto onError;
13805
13806            /* Write characters */
13807            if (sign) {
13808                if (fill != ' ') {
13809                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13810                    writer.pos += 1;
13811                }
13812                if (width > len)
13813                    width--;
13814            }
13815            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13816                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13817                assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
13818                if (fill != ' ') {
13819                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13820                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13821                    writer.pos += 2;
13822                    pindex += 2;
13823                }
13824                width -= 2;
13825                if (width < 0)
13826                    width = 0;
13827                len -= 2;
13828            }
13829            if (width > len && !(flags & F_LJUST)) {
13830                sublen = width - len;
13831                FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13832                writer.pos += sublen;
13833                width = len;
13834            }
13835            if (fill == ' ') {
13836                if (sign) {
13837                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13838                    writer.pos += 1;
13839                }
13840                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13841                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13842                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13843                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13844                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13845                    writer.pos += 2;
13846                    pindex += 2;
13847                }
13848            }
13849
13850            copy_characters(writer.buffer, writer.pos,
13851                            temp, pindex, len);
13852            writer.pos += len;
13853            if (width > len) {
13854                sublen = width - len;
13855                FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13856                writer.pos += sublen;
13857            }
13858
13859            if (dict && (argidx < arglen) && c != '%') {
13860                PyErr_SetString(PyExc_TypeError,
13861                                "not all arguments converted during string formatting");
13862                goto onError;
13863            }
13864            Py_CLEAR(temp);
13865        } /* '%' */
13866    } /* until end */
13867    if (argidx < arglen && !dict) {
13868        PyErr_SetString(PyExc_TypeError,
13869                        "not all arguments converted during string formatting");
13870        goto onError;
13871    }
13872
13873    if (args_owned) {
13874        Py_DECREF(args);
13875    }
13876    Py_DECREF(uformat);
13877    Py_XDECREF(temp);
13878    Py_XDECREF(second);
13879    return _PyUnicodeWriter_Finish(&writer);
13880
13881  onError:
13882    Py_DECREF(uformat);
13883    Py_XDECREF(temp);
13884    Py_XDECREF(second);
13885    _PyUnicodeWriter_Dealloc(&writer);
13886    if (args_owned) {
13887        Py_DECREF(args);
13888    }
13889    return NULL;
13890}
13891
13892static PyObject *
13893unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13894
13895static PyObject *
13896unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13897{
13898    PyObject *x = NULL;
13899    static char *kwlist[] = {"object", "encoding", "errors", 0};
13900    char *encoding = NULL;
13901    char *errors = NULL;
13902
13903    if (type != &PyUnicode_Type)
13904        return unicode_subtype_new(type, args, kwds);
13905    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
13906                                     kwlist, &x, &encoding, &errors))
13907        return NULL;
13908    if (x == NULL) {
13909        Py_INCREF(unicode_empty);
13910        return unicode_empty;
13911    }
13912    if (encoding == NULL && errors == NULL)
13913        return PyObject_Str(x);
13914    else
13915        return PyUnicode_FromEncodedObject(x, encoding, errors);
13916}
13917
13918static PyObject *
13919unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13920{
13921    PyObject *unicode, *self;
13922    Py_ssize_t length, char_size;
13923    int share_wstr, share_utf8;
13924    unsigned int kind;
13925    void *data;
13926
13927    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13928
13929    unicode = unicode_new(&PyUnicode_Type, args, kwds);
13930    if (unicode == NULL)
13931        return NULL;
13932    assert(_PyUnicode_CHECK(unicode));
13933    if (PyUnicode_READY(unicode) == -1) {
13934        Py_DECREF(unicode);
13935        return NULL;
13936    }
13937
13938    self = type->tp_alloc(type, 0);
13939    if (self == NULL) {
13940        Py_DECREF(unicode);
13941        return NULL;
13942    }
13943    kind = PyUnicode_KIND(unicode);
13944    length = PyUnicode_GET_LENGTH(unicode);
13945
13946    _PyUnicode_LENGTH(self) = length;
13947#ifdef Py_DEBUG
13948    _PyUnicode_HASH(self) = -1;
13949#else
13950    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13951#endif
13952    _PyUnicode_STATE(self).interned = 0;
13953    _PyUnicode_STATE(self).kind = kind;
13954    _PyUnicode_STATE(self).compact = 0;
13955    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13956    _PyUnicode_STATE(self).ready = 1;
13957    _PyUnicode_WSTR(self) = NULL;
13958    _PyUnicode_UTF8_LENGTH(self) = 0;
13959    _PyUnicode_UTF8(self) = NULL;
13960    _PyUnicode_WSTR_LENGTH(self) = 0;
13961    _PyUnicode_DATA_ANY(self) = NULL;
13962
13963    share_utf8 = 0;
13964    share_wstr = 0;
13965    if (kind == PyUnicode_1BYTE_KIND) {
13966        char_size = 1;
13967        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13968            share_utf8 = 1;
13969    }
13970    else if (kind == PyUnicode_2BYTE_KIND) {
13971        char_size = 2;
13972        if (sizeof(wchar_t) == 2)
13973            share_wstr = 1;
13974    }
13975    else {
13976        assert(kind == PyUnicode_4BYTE_KIND);
13977        char_size = 4;
13978        if (sizeof(wchar_t) == 4)
13979            share_wstr = 1;
13980    }
13981
13982    /* Ensure we won't overflow the length. */
13983    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13984        PyErr_NoMemory();
13985        goto onError;
13986    }
13987    data = PyObject_MALLOC((length + 1) * char_size);
13988    if (data == NULL) {
13989        PyErr_NoMemory();
13990        goto onError;
13991    }
13992
13993    _PyUnicode_DATA_ANY(self) = data;
13994    if (share_utf8) {
13995        _PyUnicode_UTF8_LENGTH(self) = length;
13996        _PyUnicode_UTF8(self) = data;
13997    }
13998    if (share_wstr) {
13999        _PyUnicode_WSTR_LENGTH(self) = length;
14000        _PyUnicode_WSTR(self) = (wchar_t *)data;
14001    }
14002
14003    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14004              kind * (length + 1));
14005    assert(_PyUnicode_CheckConsistency(self, 1));
14006#ifdef Py_DEBUG
14007    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14008#endif
14009    Py_DECREF(unicode);
14010    return self;
14011
14012onError:
14013    Py_DECREF(unicode);
14014    Py_DECREF(self);
14015    return NULL;
14016}
14017
14018PyDoc_STRVAR(unicode_doc,
14019             "str(string[, encoding[, errors]]) -> str\n\
14020\n\
14021Create a new string object from the given encoded string.\n\
14022encoding defaults to the current default string encoding.\n\
14023errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
14024
/* Defined further down, together with the iterator type; needed here for
   the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  Instances are allocated with a fixed basic size
   (tp_itemsize is 0), the type can be subclassed (Py_TPFLAGS_BASETYPE), and
   construction goes through unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14070
14071/* Initialize the Unicode implementation */
14072
14073int _PyUnicode_Init(void)
14074{
14075    int i;
14076
14077    /* XXX - move this array to unicodectype.c ? */
14078    Py_UCS2 linebreak[] = {
14079        0x000A, /* LINE FEED */
14080        0x000D, /* CARRIAGE RETURN */
14081        0x001C, /* FILE SEPARATOR */
14082        0x001D, /* GROUP SEPARATOR */
14083        0x001E, /* RECORD SEPARATOR */
14084        0x0085, /* NEXT LINE */
14085        0x2028, /* LINE SEPARATOR */
14086        0x2029, /* PARAGRAPH SEPARATOR */
14087    };
14088
14089    /* Init the implementation */
14090    unicode_empty = PyUnicode_New(0, 0);
14091    if (!unicode_empty)
14092        Py_FatalError("Can't create empty string");
14093    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
14094
14095    for (i = 0; i < 256; i++)
14096        unicode_latin1[i] = NULL;
14097    if (PyType_Ready(&PyUnicode_Type) < 0)
14098        Py_FatalError("Can't initialize 'unicode'");
14099
14100    /* initialize the linebreak bloom filter */
14101    bloom_linebreak = make_bloom_mask(
14102        PyUnicode_2BYTE_KIND, linebreak,
14103        Py_ARRAY_LENGTH(linebreak));
14104
14105    PyType_Ready(&EncodingMapType);
14106
14107#ifdef HAVE_MBCS
14108    winver.dwOSVersionInfoSize = sizeof(winver);
14109    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14110        PyErr_SetFromWindowsErr(0);
14111        return -1;
14112    }
14113#endif
14114    return 0;
14115}
14116
14117/* Finalize the Unicode implementation */
14118
/* This implementation releases nothing and always returns 0.
   _PyUnicode_Fini() still calls it and discards the result. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14124
14125void
14126_PyUnicode_Fini(void)
14127{
14128    int i;
14129
14130    Py_XDECREF(unicode_empty);
14131    unicode_empty = NULL;
14132
14133    for (i = 0; i < 256; i++) {
14134        if (unicode_latin1[i]) {
14135            Py_DECREF(unicode_latin1[i]);
14136            unicode_latin1[i] = NULL;
14137        }
14138    }
14139    _PyUnicode_ClearStaticStrings();
14140    (void)PyUnicode_ClearFreeList();
14141}
14142
14143void
14144PyUnicode_InternInPlace(PyObject **p)
14145{
14146    register PyObject *s = *p;
14147    PyObject *t;
14148#ifdef Py_DEBUG
14149    assert(s != NULL);
14150    assert(_PyUnicode_CHECK(s));
14151#else
14152    if (s == NULL || !PyUnicode_Check(s))
14153        return;
14154#endif
14155    /* If it's a subclass, we don't really know what putting
14156       it in the interned dict might do. */
14157    if (!PyUnicode_CheckExact(s))
14158        return;
14159    if (PyUnicode_CHECK_INTERNED(s))
14160        return;
14161    if (interned == NULL) {
14162        interned = PyDict_New();
14163        if (interned == NULL) {
14164            PyErr_Clear(); /* Don't leave an exception */
14165            return;
14166        }
14167    }
14168    /* It might be that the GetItem call fails even
14169       though the key is present in the dictionary,
14170       namely when this happens during a stack overflow. */
14171    Py_ALLOW_RECURSION
14172    t = PyDict_GetItem(interned, s);
14173    Py_END_ALLOW_RECURSION
14174
14175        if (t) {
14176            Py_INCREF(t);
14177            Py_DECREF(*p);
14178            *p = t;
14179            return;
14180        }
14181
14182    PyThreadState_GET()->recursion_critical = 1;
14183    if (PyDict_SetItem(interned, s, s) < 0) {
14184        PyErr_Clear();
14185        PyThreadState_GET()->recursion_critical = 0;
14186        return;
14187    }
14188    PyThreadState_GET()->recursion_critical = 0;
14189    /* The two references in interned are not counted by refcnt.
14190       The deallocator will take care of this */
14191    Py_REFCNT(s) -= 2;
14192    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
14193}
14194
14195void
14196PyUnicode_InternImmortal(PyObject **p)
14197{
14198    PyUnicode_InternInPlace(p);
14199    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
14200        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
14201        Py_INCREF(*p);
14202    }
14203}
14204
14205PyObject *
14206PyUnicode_InternFromString(const char *cp)
14207{
14208    PyObject *s = PyUnicode_FromString(cp);
14209    if (s == NULL)
14210        return NULL;
14211    PyUnicode_InternInPlace(&s);
14212    return s;
14213}
14214
14215void
14216_Py_ReleaseInternedUnicodeStrings(void)
14217{
14218    PyObject *keys;
14219    PyObject *s;
14220    Py_ssize_t i, n;
14221    Py_ssize_t immortal_size = 0, mortal_size = 0;
14222
14223    if (interned == NULL || !PyDict_Check(interned))
14224        return;
14225    keys = PyDict_Keys(interned);
14226    if (keys == NULL || !PyList_Check(keys)) {
14227        PyErr_Clear();
14228        return;
14229    }
14230
14231    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14232       detector, interned unicode strings are not forcibly deallocated;
14233       rather, we give them their stolen references back, and then clear
14234       and DECREF the interned dict. */
14235
14236    n = PyList_GET_SIZE(keys);
14237    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
14238            n);
14239    for (i = 0; i < n; i++) {
14240        s = PyList_GET_ITEM(keys, i);
14241        if (PyUnicode_READY(s) == -1) {
14242            assert(0 && "could not ready string");
14243            fprintf(stderr, "could not ready string\n");
14244        }
14245        switch (PyUnicode_CHECK_INTERNED(s)) {
14246        case SSTATE_NOT_INTERNED:
14247            /* XXX Shouldn't happen */
14248            break;
14249        case SSTATE_INTERNED_IMMORTAL:
14250            Py_REFCNT(s) += 1;
14251            immortal_size += PyUnicode_GET_LENGTH(s);
14252            break;
14253        case SSTATE_INTERNED_MORTAL:
14254            Py_REFCNT(s) += 2;
14255            mortal_size += PyUnicode_GET_LENGTH(s);
14256            break;
14257        default:
14258            Py_FatalError("Inconsistent interned string state.");
14259        }
14260        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
14261    }
14262    fprintf(stderr, "total size of all interned strings: "
14263            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14264            "mortal/immortal\n", mortal_size, immortal_size);
14265    Py_DECREF(keys);
14266    PyDict_Clear(interned);
14267    Py_DECREF(interned);
14268    interned = NULL;
14269}
14270
14271
14272/********************* Unicode Iterator **************************/
14273
/* State of a str iterator: the string being walked and the index of the
   character that unicodeiter_next() reads on its next call. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14279
/* Deallocator for str iterators: untrack the object from the garbage
   collector, release the reference to the string (if one is still held)
   and free the iterator's memory. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);     /* it_seq is NULL once exhausted */
    PyObject_GC_Del(it);
}
14287
/* GC traversal function: the string being iterated is the only object
   reference held by the iterator. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
14294
14295static PyObject *
14296unicodeiter_next(unicodeiterobject *it)
14297{
14298    PyObject *seq, *item;
14299
14300    assert(it != NULL);
14301    seq = it->it_seq;
14302    if (seq == NULL)
14303        return NULL;
14304    assert(_PyUnicode_CHECK(seq));
14305
14306    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14307        int kind = PyUnicode_KIND(seq);
14308        void *data = PyUnicode_DATA(seq);
14309        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14310        item = PyUnicode_FromOrdinal(chr);
14311        if (item != NULL)
14312            ++it->it_index;
14313        return item;
14314    }
14315
14316    Py_DECREF(seq);
14317    it->it_seq = NULL;
14318    return NULL;
14319}
14320
14321static PyObject *
14322unicodeiter_len(unicodeiterobject *it)
14323{
14324    Py_ssize_t len = 0;
14325    if (it->it_seq)
14326        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14327    return PyLong_FromSsize_t(len);
14328}
14329
14330PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14331
14332static PyObject *
14333unicodeiter_reduce(unicodeiterobject *it)
14334{
14335    if (it->it_seq != NULL) {
14336        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
14337                             it->it_seq, it->it_index);
14338    } else {
14339        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14340        if (u == NULL)
14341            return NULL;
14342        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
14343    }
14344}
14345
14346PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14347
14348static PyObject *
14349unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14350{
14351    Py_ssize_t index = PyLong_AsSsize_t(state);
14352    if (index == -1 && PyErr_Occurred())
14353        return NULL;
14354    if (index < 0)
14355        index = 0;
14356    it->it_index = index;
14357    Py_RETURN_NONE;
14358}
14359
14360PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14361
/* Method table of the str iterator: length hint plus pickling support.
   __length_hint__ and __reduce__ take no arguments; __setstate__ takes
   exactly one object. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
14371
/* Type object for str iterators.  Instances are created by unicode_iter()
   with PyObject_GC_New and take part in garbage collection
   (Py_TPFLAGS_HAVE_GC with a tp_traverse function). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
14404
14405static PyObject *
14406unicode_iter(PyObject *seq)
14407{
14408    unicodeiterobject *it;
14409
14410    if (!PyUnicode_Check(seq)) {
14411        PyErr_BadInternalCall();
14412        return NULL;
14413    }
14414    if (PyUnicode_READY(seq) == -1)
14415        return NULL;
14416    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14417    if (it == NULL)
14418        return NULL;
14419    it->it_index = 0;
14420    Py_INCREF(seq);
14421    it->it_seq = seq;
14422    _PyObject_GC_TRACK(it);
14423    return (PyObject *)it;
14424}
14425
14426
14427size_t
14428Py_UNICODE_strlen(const Py_UNICODE *u)
14429{
14430    int res = 0;
14431    while(*u++)
14432        res++;
14433    return res;
14434}
14435
14436Py_UNICODE*
14437Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14438{
14439    Py_UNICODE *u = s1;
14440    while ((*u++ = *s2++));
14441    return s1;
14442}
14443
14444Py_UNICODE*
14445Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14446{
14447    Py_UNICODE *u = s1;
14448    while ((*u++ = *s2++))
14449        if (n-- == 0)
14450            break;
14451    return s1;
14452}
14453
14454Py_UNICODE*
14455Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14456{
14457    Py_UNICODE *u1 = s1;
14458    u1 += Py_UNICODE_strlen(u1);
14459    Py_UNICODE_strcpy(u1, s2);
14460    return s1;
14461}
14462
14463int
14464Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14465{
14466    while (*s1 && *s2 && *s1 == *s2)
14467        s1++, s2++;
14468    if (*s1 && *s2)
14469        return (*s1 < *s2) ? -1 : +1;
14470    if (*s1)
14471        return 1;
14472    if (*s2)
14473        return -1;
14474    return 0;
14475}
14476
14477int
14478Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14479{
14480    register Py_UNICODE u1, u2;
14481    for (; n != 0; n--) {
14482        u1 = *s1;
14483        u2 = *s2;
14484        if (u1 != u2)
14485            return (u1 < u2) ? -1 : +1;
14486        if (u1 == '\0')
14487            return 0;
14488        s1++;
14489        s2++;
14490    }
14491    return 0;
14492}
14493
14494Py_UNICODE*
14495Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14496{
14497    const Py_UNICODE *p;
14498    for (p = s; *p; p++)
14499        if (*p == c)
14500            return (Py_UNICODE*)p;
14501    return NULL;
14502}
14503
14504Py_UNICODE*
14505Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14506{
14507    const Py_UNICODE *p;
14508    p = s + Py_UNICODE_strlen(s);
14509    while (p != s) {
14510        p--;
14511        if (*p == c)
14512            return (Py_UNICODE*)p;
14513    }
14514    return NULL;
14515}
14516
14517Py_UNICODE*
14518PyUnicode_AsUnicodeCopy(PyObject *unicode)
14519{
14520    Py_UNICODE *u, *copy;
14521    Py_ssize_t len, size;
14522
14523    if (!PyUnicode_Check(unicode)) {
14524        PyErr_BadArgument();
14525        return NULL;
14526    }
14527    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14528    if (u == NULL)
14529        return NULL;
14530    /* Ensure we won't overflow the size. */
14531    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14532        PyErr_NoMemory();
14533        return NULL;
14534    }
14535    size = len + 1; /* copy the null character */
14536    size *= sizeof(Py_UNICODE);
14537    copy = PyMem_Malloc(size);
14538    if (copy == NULL) {
14539        PyErr_NoMemory();
14540        return NULL;
14541    }
14542    memcpy(copy, u, size);
14543    return copy;
14544}
14545
14546/* A _string module, to export formatter_parser and formatter_field_name_split
14547   to the string.Formatter class implemented in Python. */
14548
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
14556
/* Module definition for _string: a name, a docstring and the method table
   above; the remaining entries are left at 0/NULL. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,
    _string_methods,                    /* module-level functions */
    NULL,
    NULL,
    NULL,
    NULL
};
14568
/* Module initialization function for _string: creates the module object
   from _string_module and returns the result of PyModule_Create()
   unchanged. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
14574
14575
14576#ifdef __cplusplus
14577}
14578#endif
14579