unicodeobject.c revision f05e17ece9ee4cf4d04e0657e6c7c9283a233968
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
58/* --- Globals ------------------------------------------------------------
59
60   The globals are initialized by the _PyUnicode_Init() API and should
61   not be used before calling that API.
62
63*/
64
65
66#ifdef __cplusplus
67extern "C" {
68#endif
69
70/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
73#ifdef Py_DEBUG
74#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
75#else
76#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
77#endif
78
/* Cached UTF-8 buffer pointer, read straight from the compact header
   (no checks). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory located right after the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* utf8_length field, read straight from the compact header (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the length field for a compact ASCII
   string, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked accessors for individual header fields; all of them expand to
   lvalues. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Like the raw state.kind / length fields, but assert _PyUnicode_CHECK()
   first. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
113
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New() */
116#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
117    ((maxchar1) | (maxchar2))
118
/* File-local redefinition of PyUnicode_READY(): any earlier definition is
   dropped.  After asserting _PyUnicode_CHECK(), the macro yields 0 when the
   string is already ready and otherwise the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
125
/* True if the utf8 pointer is the character data pointer itself, i.e. the
   UTF-8 form aliases the canonical storage.  Must not be used on compact
   ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the character data pointer itself.  Both
   sides of the comparison are taken from the macro argument, so the macro
   works regardless of how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* Non-zero when the object holds a UTF-8 buffer of its own: the string is
   not compact ASCII, the utf8 pointer is set, and that pointer differs from
   the character data pointer (so it is not shared with other data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object holds a wstr buffer of its own: the wstr pointer
   is set and either the string is not ready yet or the pointer differs from
   the character data pointer (so it is not shared with other data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.

   from_type, to_type -- valid type names of the source and destination
                         character types.
   begin, end         -- pointers of type "from_type *" delimiting the
                         source characters (end is exclusive).
   to                 -- pointer to the buffer receiving the converted
                         characters; it is converted to "to_type *".

   Each argument is evaluated exactly once.  The "to" argument is
   parenthesized before the cast so that an expression such as "buf + off"
   is converted as a whole instead of having only "buf" cast.  The main
   loop copies four characters per iteration; the trailing loop handles
   the remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *) (to);                \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
173
174/* This dictionary holds all interned unicode strings.  Note that references
175   to strings in this dictionary are *not* counted in the string's ob_refcnt.
176   When the interned string reaches a refcnt of 0 the string deallocation
177   function will delete the reference from this dictionary.
178
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
181*/
182static PyObject *interned;
183
184/* The empty Unicode object is shared to improve performance. */
185static PyObject *unicode_empty;
186
187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
190/* Single character Unicode strings in the Latin-1 range are being
191   shared as well. */
192static PyObject *unicode_latin1[256];
193
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a 7-bit character value; an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN        0x1C FILE SEPARATOR
     0x1D GROUP SEPARATOR        0x1E RECORD SEPARATOR
     0x1F UNIT SEPARATOR         0x20 SPACE
   and 0 for every other value.  Each row below covers 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
224
225/* forward */
226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
227static PyObject* get_latin1_char(unsigned char ch);
228static int unicode_modifiable(PyObject *unicode);
229
230
231static PyObject *
232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
239unicode_encode_call_errorhandler(const char *errors,
240       PyObject **errorHandler,const char *encoding, const char *reason,
241       PyObject *unicode, PyObject **exceptionObject,
242       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
244static void
245raise_encode_exception(PyObject **exceptionObject,
246                       const char *encoding,
247                       PyObject *unicode,
248                       Py_ssize_t startpos, Py_ssize_t endpos,
249                       const char *reason);
250
/* Same for linebreaks.

   Read-only lookup table indexed by a 7-bit character value; an entry is 1
   for:
     0x0A LINE FEED           0x0B LINE TABULATION
     0x0C FORM FEED           0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR      0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for every other value.  Each row below covers 16 code points.
   The table is never written to, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
278
279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280   This function is kept for backward compatibility with the old API. */
281Py_UNICODE
282PyUnicode_GetMax(void)
283{
284#ifdef Py_UNICODE_WIDE
285    return 0x10FFFF;
286#else
287    /* This is actually an illegal character, so it should
288       not be passed to unichr. */
289    return 0xFFFF;
290#endif
291}
292
293#ifdef Py_DEBUG
294int
295_PyUnicode_CheckConsistency(PyObject *op, int check_content)
296{
297    PyASCIIObject *ascii;
298    unsigned int kind;
299
300    assert(PyUnicode_Check(op));
301
302    ascii = (PyASCIIObject *)op;
303    kind = ascii->state.kind;
304
305    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
306        assert(kind == PyUnicode_1BYTE_KIND);
307        assert(ascii->state.ready == 1);
308    }
309    else {
310        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
311        void *data;
312
313        if (ascii->state.compact == 1) {
314            data = compact + 1;
315            assert(kind == PyUnicode_1BYTE_KIND
316                   || kind == PyUnicode_2BYTE_KIND
317                   || kind == PyUnicode_4BYTE_KIND);
318            assert(ascii->state.ascii == 0);
319            assert(ascii->state.ready == 1);
320            assert (compact->utf8 != data);
321        }
322        else {
323            PyUnicodeObject *unicode = (PyUnicodeObject *)op;
324
325            data = unicode->data.any;
326            if (kind == PyUnicode_WCHAR_KIND) {
327                assert(ascii->length == 0);
328                assert(ascii->hash == -1);
329                assert(ascii->state.compact == 0);
330                assert(ascii->state.ascii == 0);
331                assert(ascii->state.ready == 0);
332                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
333                assert(ascii->wstr != NULL);
334                assert(data == NULL);
335                assert(compact->utf8 == NULL);
336            }
337            else {
338                assert(kind == PyUnicode_1BYTE_KIND
339                       || kind == PyUnicode_2BYTE_KIND
340                       || kind == PyUnicode_4BYTE_KIND);
341                assert(ascii->state.compact == 0);
342                assert(ascii->state.ready == 1);
343                assert(data != NULL);
344                if (ascii->state.ascii) {
345                    assert (compact->utf8 == data);
346                    assert (compact->utf8_length == ascii->length);
347                }
348                else
349                    assert (compact->utf8 != data);
350            }
351        }
352        if (kind != PyUnicode_WCHAR_KIND) {
353            if (
354#if SIZEOF_WCHAR_T == 2
355                kind == PyUnicode_2BYTE_KIND
356#else
357                kind == PyUnicode_4BYTE_KIND
358#endif
359               )
360            {
361                assert(ascii->wstr == data);
362                assert(compact->wstr_length == ascii->length);
363            } else
364                assert(ascii->wstr != data);
365        }
366
367        if (compact->utf8 == NULL)
368            assert(compact->utf8_length == 0);
369        if (ascii->wstr == NULL)
370            assert(compact->wstr_length == 0);
371    }
372    /* check that the best kind is used */
373    if (check_content && kind != PyUnicode_WCHAR_KIND)
374    {
375        Py_ssize_t i;
376        Py_UCS4 maxchar = 0;
377        void *data;
378        Py_UCS4 ch;
379
380        data = PyUnicode_DATA(ascii);
381        for (i=0; i < ascii->length; i++)
382        {
383            ch = PyUnicode_READ(kind, data, i);
384            if (ch > maxchar)
385                maxchar = ch;
386        }
387        if (kind == PyUnicode_1BYTE_KIND) {
388            if (ascii->state.ascii == 0) {
389                assert(maxchar >= 128);
390                assert(maxchar <= 255);
391            }
392            else
393                assert(maxchar < 128);
394        }
395        else if (kind == PyUnicode_2BYTE_KIND) {
396            assert(maxchar >= 0x100);
397            assert(maxchar <= 0xFFFF);
398        }
399        else {
400            assert(maxchar >= 0x10000);
401            assert(maxchar <= MAX_UNICODE);
402        }
403        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
404    }
405    return 1;
406}
407#endif
408
409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413    Py_ssize_t len;
414
415    assert(Py_REFCNT(unicode) == 1);
416
417    len = _PyUnicode_WSTR_LENGTH(unicode);
418    if (len == 0) {
419        Py_INCREF(unicode_empty);
420        Py_DECREF(unicode);
421        return unicode_empty;
422    }
423
424    if (len == 1) {
425        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
426        if (ch < 256) {
427            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
428            Py_DECREF(unicode);
429            return latin1_char;
430        }
431    }
432
433    if (_PyUnicode_Ready(unicode) < 0) {
434        Py_XDECREF(unicode);
435        return NULL;
436    }
437#else
438    /* don't make the result ready in debug mode to ensure that the caller
439       makes the string ready before using it */
440    assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442    return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448    Py_ssize_t length;
449
450    length = PyUnicode_GET_LENGTH(unicode);
451    if (length == 0) {
452        if (unicode != unicode_empty) {
453            Py_INCREF(unicode_empty);
454            Py_DECREF(unicode);
455        }
456        return unicode_empty;
457    }
458
459    if (length == 1) {
460        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461        if (ch < 256) {
462            PyObject *latin1_char = unicode_latin1[ch];
463            if (latin1_char != NULL) {
464                if (unicode != latin1_char) {
465                    Py_INCREF(latin1_char);
466                    Py_DECREF(unicode);
467                }
468                return latin1_char;
469            }
470            else {
471                assert(_PyUnicode_CheckConsistency(unicode, 1));
472                Py_INCREF(unicode);
473                unicode_latin1[ch] = unicode;
474                return unicode;
475            }
476        }
477    }
478
479    assert(_PyUnicode_CheckConsistency(unicode, 1));
480    return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486    assert(_PyUnicode_CHECK(unicode));
487    if (PyUnicode_IS_READY(unicode))
488        return unicode_result_ready(unicode);
489    else
490        return unicode_result_wchar(unicode);
491}
492
493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496    if (PyUnicode_CheckExact(unicode)) {
497        if (PyUnicode_READY(unicode) == -1)
498            return NULL;
499        Py_INCREF(unicode);
500        return unicode;
501    }
502    else
503        /* Subtype -- return genuine unicode string with the same value. */
504        return _PyUnicode_Copy(unicode);
505}
506
507#ifdef HAVE_MBCS
508static OSVERSIONINFOEX winver;
509#endif
510
511/* --- Bloom Filters ----------------------------------------------------- */
512
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on the width of a long)
   of each Unicode character as the bit index. */
516
517/* the linebreak mask is set up by Unicode_Init below */
518
519#if LONG_BIT >= 128
520#define BLOOM_WIDTH 128
521#elif LONG_BIT >= 64
522#define BLOOM_WIDTH 64
523#elif LONG_BIT >= 32
524#define BLOOM_WIDTH 32
525#else
526#error "LONG_BIT is smaller than 32"
527#endif
528
529#define BLOOM_MASK unsigned long
530
531static BLOOM_MASK bloom_linebreak;
532
/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by the low
   bits of `ch` (ch modulo BLOOM_WIDTH).  BLOOM_ADD requires `mask` to be a
   modifiable lvalue.  Both arguments are parenthesized so that compound
   expressions such as "a | b" are treated as a single operand. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))
535
/* Linebreak test for a character: values below 128 are looked up directly
   in ascii_linebreak[]; for larger values the bloom_linebreak filter is
   consulted first and Py_UNICODE_ISLINEBREAK() is only evaluated when the
   filter bit is set. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
539
540Py_LOCAL_INLINE(BLOOM_MASK)
541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
542{
543    /* calculate simple bloom-style bitmask for a given unicode string */
544
545    BLOOM_MASK mask;
546    Py_ssize_t i;
547
548    mask = 0;
549    for (i = 0; i < len; i++)
550        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
551
552    return mask;
553}
554
/* Membership test of chr in str: the cheap BLOOM() check on mask comes
   first, and only when its bit is set is the string actually searched with
   PyUnicode_FindChar() over its whole length. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
558
559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
605#include "stringlib/undef.h"
606
607/* --- Unicode Object ----------------------------------------------------- */
608
609static PyObject *
610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
611
612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613                                     Py_ssize_t size, Py_UCS4 ch,
614                                     int direction)
615{
616    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618    switch (kind) {
619    case PyUnicode_1BYTE_KIND:
620        {
621            Py_UCS1 ch1 = (Py_UCS1) ch;
622            if (ch1 == ch)
623                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624            else
625                return -1;
626        }
627    case PyUnicode_2BYTE_KIND:
628        {
629            Py_UCS2 ch2 = (Py_UCS2) ch;
630            if (ch2 == ch)
631                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632            else
633                return -1;
634        }
635    case PyUnicode_4BYTE_KIND:
636        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637    default:
638        assert(0);
639        return -1;
640    }
641}
642
643static PyObject*
644resize_compact(PyObject *unicode, Py_ssize_t length)
645{
646    Py_ssize_t char_size;
647    Py_ssize_t struct_size;
648    Py_ssize_t new_size;
649    int share_wstr;
650    PyObject *new_unicode;
651    assert(unicode_modifiable(unicode));
652    assert(PyUnicode_IS_READY(unicode));
653    assert(PyUnicode_IS_COMPACT(unicode));
654
655    char_size = PyUnicode_KIND(unicode);
656    if (PyUnicode_IS_ASCII(unicode))
657        struct_size = sizeof(PyASCIIObject);
658    else
659        struct_size = sizeof(PyCompactUnicodeObject);
660    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
661
662    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
663        PyErr_NoMemory();
664        return NULL;
665    }
666    new_size = (struct_size + (length + 1) * char_size);
667
668    _Py_DEC_REFTOTAL;
669    _Py_ForgetReference(unicode);
670
671    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
672    if (new_unicode == NULL) {
673        _Py_NewReference(unicode);
674        PyErr_NoMemory();
675        return NULL;
676    }
677    unicode = new_unicode;
678    _Py_NewReference(unicode);
679
680    _PyUnicode_LENGTH(unicode) = length;
681    if (share_wstr) {
682        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
683        if (!PyUnicode_IS_ASCII(unicode))
684            _PyUnicode_WSTR_LENGTH(unicode) = length;
685    }
686    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687                    length, 0);
688    assert(_PyUnicode_CheckConsistency(unicode, 0));
689    return unicode;
690}
691
692static int
693resize_inplace(PyObject *unicode, Py_ssize_t length)
694{
695    wchar_t *wstr;
696    Py_ssize_t new_size;
697    assert(!PyUnicode_IS_COMPACT(unicode));
698    assert(Py_REFCNT(unicode) == 1);
699
700    if (PyUnicode_IS_READY(unicode)) {
701        Py_ssize_t char_size;
702        int share_wstr, share_utf8;
703        void *data;
704
705        data = _PyUnicode_DATA_ANY(unicode);
706        char_size = PyUnicode_KIND(unicode);
707        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
708        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
709
710        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
711            PyErr_NoMemory();
712            return -1;
713        }
714        new_size = (length + 1) * char_size;
715
716        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
717        {
718            PyObject_DEL(_PyUnicode_UTF8(unicode));
719            _PyUnicode_UTF8(unicode) = NULL;
720            _PyUnicode_UTF8_LENGTH(unicode) = 0;
721        }
722
723        data = (PyObject *)PyObject_REALLOC(data, new_size);
724        if (data == NULL) {
725            PyErr_NoMemory();
726            return -1;
727        }
728        _PyUnicode_DATA_ANY(unicode) = data;
729        if (share_wstr) {
730            _PyUnicode_WSTR(unicode) = data;
731            _PyUnicode_WSTR_LENGTH(unicode) = length;
732        }
733        if (share_utf8) {
734            _PyUnicode_UTF8(unicode) = data;
735            _PyUnicode_UTF8_LENGTH(unicode) = length;
736        }
737        _PyUnicode_LENGTH(unicode) = length;
738        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
739        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
740            assert(_PyUnicode_CheckConsistency(unicode, 0));
741            return 0;
742        }
743    }
744    assert(_PyUnicode_WSTR(unicode) != NULL);
745
746    /* check for integer overflow */
747    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
748        PyErr_NoMemory();
749        return -1;
750    }
751    new_size = sizeof(wchar_t) * (length + 1);
752    wstr =  _PyUnicode_WSTR(unicode);
753    wstr = PyObject_REALLOC(wstr, new_size);
754    if (!wstr) {
755        PyErr_NoMemory();
756        return -1;
757    }
758    _PyUnicode_WSTR(unicode) = wstr;
759    _PyUnicode_WSTR(unicode)[length] = 0;
760    _PyUnicode_WSTR_LENGTH(unicode) = length;
761    assert(_PyUnicode_CheckConsistency(unicode, 0));
762    return 0;
763}
764
765static PyObject*
766resize_copy(PyObject *unicode, Py_ssize_t length)
767{
768    Py_ssize_t copy_length;
769    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
770        PyObject *copy;
771
772        if (PyUnicode_READY(unicode) == -1)
773            return NULL;
774
775        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
776        if (copy == NULL)
777            return NULL;
778
779        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
780        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
781        return copy;
782    }
783    else {
784        PyObject *w;
785
786        w = (PyObject*)_PyUnicode_New(length);
787        if (w == NULL)
788            return NULL;
789        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
790        copy_length = Py_MIN(copy_length, length);
791        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
792                        copy_length);
793        return w;
794    }
795}
796
797/* We allocate one more byte to make sure the string is
798   Ux0000 terminated; some code (e.g. new_identifier)
799   relies on that.
800
801   XXX This allocator could further be enhanced by assuring that the
802   free list never reduces its size below 1.
803
804*/
805
806static PyUnicodeObject *
807_PyUnicode_New(Py_ssize_t length)
808{
809    register PyUnicodeObject *unicode;
810    size_t new_size;
811
812    /* Optimization for empty strings */
813    if (length == 0 && unicode_empty != NULL) {
814        Py_INCREF(unicode_empty);
815        return (PyUnicodeObject*)unicode_empty;
816    }
817
818    /* Ensure we won't overflow the size. */
819    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
820        return (PyUnicodeObject *)PyErr_NoMemory();
821    }
822    if (length < 0) {
823        PyErr_SetString(PyExc_SystemError,
824                        "Negative size passed to _PyUnicode_New");
825        return NULL;
826    }
827
828    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
829    if (unicode == NULL)
830        return NULL;
831    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
832    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
833    if (!_PyUnicode_WSTR(unicode)) {
834        Py_DECREF(unicode);
835        PyErr_NoMemory();
836        return NULL;
837    }
838
839    /* Initialize the first element to guard against cases where
840     * the caller fails before initializing str -- unicode_resize()
841     * reads str[0], and the Keep-Alive optimization can keep memory
842     * allocated for str alive across a call to unicode_dealloc(unicode).
843     * We don't want unicode_resize to read uninitialized memory in
844     * that case.
845     */
846    _PyUnicode_WSTR(unicode)[0] = 0;
847    _PyUnicode_WSTR(unicode)[length] = 0;
848    _PyUnicode_WSTR_LENGTH(unicode) = length;
849    _PyUnicode_HASH(unicode) = -1;
850    _PyUnicode_STATE(unicode).interned = 0;
851    _PyUnicode_STATE(unicode).kind = 0;
852    _PyUnicode_STATE(unicode).compact = 0;
853    _PyUnicode_STATE(unicode).ready = 0;
854    _PyUnicode_STATE(unicode).ascii = 0;
855    _PyUnicode_DATA_ANY(unicode) = NULL;
856    _PyUnicode_LENGTH(unicode) = 0;
857    _PyUnicode_UTF8(unicode) = NULL;
858    _PyUnicode_UTF8_LENGTH(unicode) = 0;
859    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
860    return unicode;
861}
862
863static const char*
864unicode_kind_name(PyObject *unicode)
865{
866    /* don't check consistency: unicode_kind_name() is called from
867       _PyUnicode_Dump() */
868    if (!PyUnicode_IS_COMPACT(unicode))
869    {
870        if (!PyUnicode_IS_READY(unicode))
871            return "wstr";
872        switch (PyUnicode_KIND(unicode))
873        {
874        case PyUnicode_1BYTE_KIND:
875            if (PyUnicode_IS_ASCII(unicode))
876                return "legacy ascii";
877            else
878                return "legacy latin1";
879        case PyUnicode_2BYTE_KIND:
880            return "legacy UCS2";
881        case PyUnicode_4BYTE_KIND:
882            return "legacy UCS4";
883        default:
884            return "<legacy invalid kind>";
885        }
886    }
887    assert(PyUnicode_IS_READY(unicode));
888    switch (PyUnicode_KIND(unicode)) {
889    case PyUnicode_1BYTE_KIND:
890        if (PyUnicode_IS_ASCII(unicode))
891            return "ascii";
892        else
893            return "latin1";
894    case PyUnicode_2BYTE_KIND:
895        return "UCS2";
896    case PyUnicode_4BYTE_KIND:
897        return "UCS4";
898    default:
899        return "<invalid compact kind>";
900    }
901}
902
903#ifdef Py_DEBUG
904/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode){
    char *utf8 = PyUnicode_UTF8(unicode);

    return utf8;
}
908
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode) {
    void *chars = _PyUnicode_COMPACT_DATA(unicode);

    return chars;
}
912void *_PyUnicode_data(void *unicode){
913    printf("obj %p\n", unicode);
914    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
915    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
916    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
917    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
918    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
919    return PyUnicode_DATA(unicode);
920}
921
922void
923_PyUnicode_Dump(PyObject *op)
924{
925    PyASCIIObject *ascii = (PyASCIIObject *)op;
926    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
927    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
928    void *data;
929
930    if (ascii->state.compact)
931    {
932        if (ascii->state.ascii)
933            data = (ascii + 1);
934        else
935            data = (compact + 1);
936    }
937    else
938        data = unicode->data.any;
939    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
940
941    if (ascii->wstr == data)
942        printf("shared ");
943    printf("wstr=%p", ascii->wstr);
944
945    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
946        printf(" (%zu), ", compact->wstr_length);
947        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
948            printf("shared ");
949        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
950    }
951    printf(", data=%p\n", data);
952}
953#endif
954
955PyObject *
956PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
957{
958    PyObject *obj;
959    PyCompactUnicodeObject *unicode;
960    void *data;
961    enum PyUnicode_Kind kind;
962    int is_sharing, is_ascii;
963    Py_ssize_t char_size;
964    Py_ssize_t struct_size;
965
966    /* Optimization for empty strings */
967    if (size == 0 && unicode_empty != NULL) {
968        Py_INCREF(unicode_empty);
969        return unicode_empty;
970    }
971
972    is_ascii = 0;
973    is_sharing = 0;
974    struct_size = sizeof(PyCompactUnicodeObject);
975    if (maxchar < 128) {
976        kind = PyUnicode_1BYTE_KIND;
977        char_size = 1;
978        is_ascii = 1;
979        struct_size = sizeof(PyASCIIObject);
980    }
981    else if (maxchar < 256) {
982        kind = PyUnicode_1BYTE_KIND;
983        char_size = 1;
984    }
985    else if (maxchar < 65536) {
986        kind = PyUnicode_2BYTE_KIND;
987        char_size = 2;
988        if (sizeof(wchar_t) == 2)
989            is_sharing = 1;
990    }
991    else {
992        if (maxchar > MAX_UNICODE) {
993            PyErr_SetString(PyExc_SystemError,
994                            "invalid maximum character passed to PyUnicode_New");
995            return NULL;
996        }
997        kind = PyUnicode_4BYTE_KIND;
998        char_size = 4;
999        if (sizeof(wchar_t) == 4)
1000            is_sharing = 1;
1001    }
1002
1003    /* Ensure we won't overflow the size. */
1004    if (size < 0) {
1005        PyErr_SetString(PyExc_SystemError,
1006                        "Negative size passed to PyUnicode_New");
1007        return NULL;
1008    }
1009    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1010        return PyErr_NoMemory();
1011
1012    /* Duplicated allocation code from _PyObject_New() instead of a call to
1013     * PyObject_New() so we are able to allocate space for the object and
1014     * it's data buffer.
1015     */
1016    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1017    if (obj == NULL)
1018        return PyErr_NoMemory();
1019    obj = PyObject_INIT(obj, &PyUnicode_Type);
1020    if (obj == NULL)
1021        return NULL;
1022
1023    unicode = (PyCompactUnicodeObject *)obj;
1024    if (is_ascii)
1025        data = ((PyASCIIObject*)obj) + 1;
1026    else
1027        data = unicode + 1;
1028    _PyUnicode_LENGTH(unicode) = size;
1029    _PyUnicode_HASH(unicode) = -1;
1030    _PyUnicode_STATE(unicode).interned = 0;
1031    _PyUnicode_STATE(unicode).kind = kind;
1032    _PyUnicode_STATE(unicode).compact = 1;
1033    _PyUnicode_STATE(unicode).ready = 1;
1034    _PyUnicode_STATE(unicode).ascii = is_ascii;
1035    if (is_ascii) {
1036        ((char*)data)[size] = 0;
1037        _PyUnicode_WSTR(unicode) = NULL;
1038    }
1039    else if (kind == PyUnicode_1BYTE_KIND) {
1040        ((char*)data)[size] = 0;
1041        _PyUnicode_WSTR(unicode) = NULL;
1042        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1043        unicode->utf8 = NULL;
1044        unicode->utf8_length = 0;
1045    }
1046    else {
1047        unicode->utf8 = NULL;
1048        unicode->utf8_length = 0;
1049        if (kind == PyUnicode_2BYTE_KIND)
1050            ((Py_UCS2*)data)[size] = 0;
1051        else /* kind == PyUnicode_4BYTE_KIND */
1052            ((Py_UCS4*)data)[size] = 0;
1053        if (is_sharing) {
1054            _PyUnicode_WSTR_LENGTH(unicode) = size;
1055            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1056        }
1057        else {
1058            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1059            _PyUnicode_WSTR(unicode) = NULL;
1060        }
1061    }
1062#ifdef Py_DEBUG
1063    /* Fill the data with invalid characters to detect bugs earlier.
1064       _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1065       at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1066       and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1067    memset(data, 0xff, size * kind);
1068#endif
1069    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1070    return obj;
1071}
1072
1073#if SIZEOF_WCHAR_T == 2
1074/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1075   will decode surrogate pairs, the other conversions are implemented as macros
1076   for efficiency.
1077
1078   This function assumes that unicode can hold one more code point than wstr
1079   characters for a terminating null character. */
1080static void
1081unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1082                              PyObject *unicode)
1083{
1084    const wchar_t *iter;
1085    Py_UCS4 *ucs4_out;
1086
1087    assert(unicode != NULL);
1088    assert(_PyUnicode_CHECK(unicode));
1089    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1090    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1091
1092    for (iter = begin; iter < end; ) {
1093        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1094                           _PyUnicode_GET_LENGTH(unicode)));
1095        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1096            && (iter+1) < end
1097            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1098        {
1099            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1100            iter += 2;
1101        }
1102        else {
1103            *ucs4_out++ = *iter;
1104            iter++;
1105        }
1106    }
1107    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1108                        _PyUnicode_GET_LENGTH(unicode)));
1109
1110}
1111#endif
1112
1113static int
1114unicode_check_modifiable(PyObject *unicode)
1115{
1116    if (!unicode_modifiable(unicode)) {
1117        PyErr_SetString(PyExc_SystemError,
1118                        "Cannot modify a string currently used");
1119        return -1;
1120    }
1121    return 0;
1122}
1123
1124static int
1125_copy_characters(PyObject *to, Py_ssize_t to_start,
1126                 PyObject *from, Py_ssize_t from_start,
1127                 Py_ssize_t how_many, int check_maxchar)
1128{
1129    unsigned int from_kind, to_kind;
1130    void *from_data, *to_data;
1131    int fast;
1132
1133    assert(0 <= how_many);
1134    assert(0 <= from_start);
1135    assert(0 <= to_start);
1136    assert(PyUnicode_Check(from));
1137    assert(PyUnicode_IS_READY(from));
1138    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1139
1140    if (how_many == 0)
1141        return 0;
1142
1143    assert(PyUnicode_Check(to));
1144    assert(PyUnicode_IS_READY(to));
1145    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1146
1147    from_kind = PyUnicode_KIND(from);
1148    from_data = PyUnicode_DATA(from);
1149    to_kind = PyUnicode_KIND(to);
1150    to_data = PyUnicode_DATA(to);
1151
1152#ifdef Py_DEBUG
1153    if (!check_maxchar
1154        && (from_kind > to_kind
1155            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
1156    {
1157        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1158        Py_UCS4 ch;
1159        Py_ssize_t i;
1160        for (i=0; i < how_many; i++) {
1161            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1162            assert(ch <= to_maxchar);
1163        }
1164    }
1165#endif
1166    fast = (from_kind == to_kind);
1167    if (check_maxchar
1168        && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1169    {
1170        /* deny latin1 => ascii */
1171        fast = 0;
1172    }
1173
1174    if (fast) {
1175        Py_MEMCPY((char*)to_data + to_kind * to_start,
1176                  (char*)from_data + from_kind * from_start,
1177                  to_kind * how_many);
1178    }
1179    else if (from_kind == PyUnicode_1BYTE_KIND
1180             && to_kind == PyUnicode_2BYTE_KIND)
1181    {
1182        _PyUnicode_CONVERT_BYTES(
1183            Py_UCS1, Py_UCS2,
1184            PyUnicode_1BYTE_DATA(from) + from_start,
1185            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1186            PyUnicode_2BYTE_DATA(to) + to_start
1187            );
1188    }
1189    else if (from_kind == PyUnicode_1BYTE_KIND
1190             && to_kind == PyUnicode_4BYTE_KIND)
1191    {
1192        _PyUnicode_CONVERT_BYTES(
1193            Py_UCS1, Py_UCS4,
1194            PyUnicode_1BYTE_DATA(from) + from_start,
1195            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1196            PyUnicode_4BYTE_DATA(to) + to_start
1197            );
1198    }
1199    else if (from_kind == PyUnicode_2BYTE_KIND
1200             && to_kind == PyUnicode_4BYTE_KIND)
1201    {
1202        _PyUnicode_CONVERT_BYTES(
1203            Py_UCS2, Py_UCS4,
1204            PyUnicode_2BYTE_DATA(from) + from_start,
1205            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1206            PyUnicode_4BYTE_DATA(to) + to_start
1207            );
1208    }
1209    else {
1210        /* check if max_char(from substring) <= max_char(to) */
1211        if (from_kind > to_kind
1212                /* latin1 => ascii */
1213            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1214        {
1215            /* slow path to check for character overflow */
1216            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1217            Py_UCS4 ch;
1218            Py_ssize_t i;
1219
1220#ifdef Py_DEBUG
1221            for (i=0; i < how_many; i++) {
1222                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1223                assert(ch <= to_maxchar);
1224                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1225            }
1226#else
1227            if (!check_maxchar) {
1228                for (i=0; i < how_many; i++) {
1229                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1230                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1231                }
1232            }
1233            else {
1234                for (i=0; i < how_many; i++) {
1235                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1236                    if (ch > to_maxchar)
1237                        return 1;
1238                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1239                }
1240            }
1241#endif
1242        }
1243        else {
1244            assert(0 && "inconsistent state");
1245            return 1;
1246        }
1247    }
1248    return 0;
1249}
1250
1251void
1252_PyUnicode_FastCopyCharacters(
1253    PyObject *to, Py_ssize_t to_start,
1254    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1255{
1256    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1257}
1258
1259Py_ssize_t
1260PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1261                         PyObject *from, Py_ssize_t from_start,
1262                         Py_ssize_t how_many)
1263{
1264    int err;
1265
1266    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1267        PyErr_BadInternalCall();
1268        return -1;
1269    }
1270
1271    if (PyUnicode_READY(from) == -1)
1272        return -1;
1273    if (PyUnicode_READY(to) == -1)
1274        return -1;
1275
1276    if (from_start < 0) {
1277        PyErr_SetString(PyExc_IndexError, "string index out of range");
1278        return -1;
1279    }
1280    if (to_start < 0) {
1281        PyErr_SetString(PyExc_IndexError, "string index out of range");
1282        return -1;
1283    }
1284    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1285    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1286        PyErr_Format(PyExc_SystemError,
1287                     "Cannot write %zi characters at %zi "
1288                     "in a string of %zi characters",
1289                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1290        return -1;
1291    }
1292
1293    if (how_many == 0)
1294        return 0;
1295
1296    if (unicode_check_modifiable(to))
1297        return -1;
1298
1299    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1300    if (err) {
1301        PyErr_Format(PyExc_SystemError,
1302                     "Cannot copy %s characters "
1303                     "into a string of %s characters",
1304                     unicode_kind_name(from),
1305                     unicode_kind_name(to));
1306        return -1;
1307    }
1308    return how_many;
1309}
1310
1311/* Find the maximum code point and count the number of surrogate pairs so a
1312   correct string length can be computed before converting a string to UCS4.
1313   This function counts single surrogates as a character and not as a pair.
1314
1315   Return 0 on success, or -1 on error. */
1316static int
1317find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1318                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1319{
1320    const wchar_t *iter;
1321    Py_UCS4 ch;
1322
1323    assert(num_surrogates != NULL && maxchar != NULL);
1324    *num_surrogates = 0;
1325    *maxchar = 0;
1326
1327    for (iter = begin; iter < end; ) {
1328#if SIZEOF_WCHAR_T == 2
1329        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1330            && (iter+1) < end
1331            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1332        {
1333            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1334            ++(*num_surrogates);
1335            iter += 2;
1336        }
1337        else
1338#endif
1339        {
1340            ch = *iter;
1341            iter++;
1342        }
1343        if (ch > *maxchar) {
1344            *maxchar = ch;
1345            if (*maxchar > MAX_UNICODE) {
1346                PyErr_Format(PyExc_ValueError,
1347                             "character U+%x is not in range [U+0000; U+10ffff]",
1348                             ch);
1349                return -1;
1350            }
1351        }
1352    }
1353    return 0;
1354}
1355
#ifdef Py_DEBUG
/* Debug-build counter incremented by _PyUnicode_Ready() on every call. */
static int unicode_ready_calls = 0;
#endif
1359
/* Convert a legacy string that only has a wchar_t representation (wstr)
   into its canonical form: scan wstr for the maximum character, choose
   the 1, 2 or 4 byte kind accordingly, fill in the data pointer (either
   a newly allocated buffer or wstr itself when the widths match), and
   mark the string as ready.

   Return 0 on success. Return -1 with an exception set if wstr contains
   a character outside of the valid range or if a memory allocation
   fails. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Determine the widest character and the number of surrogate pairs
       to pick the storage kind. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1-byte kind: narrow wstr into a new buffer, then release wstr. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* pure ASCII: the utf8 pointer is set to the data buffer */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow the 4-byte wstr into a new 2-byte buffer, then release
           wstr. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* each surrogate pair collapses into a single code point */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share wstr as the data */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* terminate the 4-byte data in both configurations */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1488
1489static void
1490unicode_dealloc(register PyObject *unicode)
1491{
1492    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1493    case SSTATE_NOT_INTERNED:
1494        break;
1495
1496    case SSTATE_INTERNED_MORTAL:
1497        /* revive dead object temporarily for DelItem */
1498        Py_REFCNT(unicode) = 3;
1499        if (PyDict_DelItem(interned, unicode) != 0)
1500            Py_FatalError(
1501                "deletion of interned string failed");
1502        break;
1503
1504    case SSTATE_INTERNED_IMMORTAL:
1505        Py_FatalError("Immortal interned string died.");
1506
1507    default:
1508        Py_FatalError("Inconsistent interned string state.");
1509    }
1510
1511    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1512        PyObject_DEL(_PyUnicode_WSTR(unicode));
1513    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1514        PyObject_DEL(_PyUnicode_UTF8(unicode));
1515    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1516        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1517
1518    Py_TYPE(unicode)->tp_free(unicode);
1519}
1520
1521#ifdef Py_DEBUG
1522static int
1523unicode_is_singleton(PyObject *unicode)
1524{
1525    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1526    if (unicode == unicode_empty)
1527        return 1;
1528    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1529    {
1530        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1531        if (ch < 256 && unicode_latin1[ch] == unicode)
1532            return 1;
1533    }
1534    return 0;
1535}
1536#endif
1537
1538static int
1539unicode_modifiable(PyObject *unicode)
1540{
1541    assert(_PyUnicode_CHECK(unicode));
1542    if (Py_REFCNT(unicode) != 1)
1543        return 0;
1544    if (_PyUnicode_HASH(unicode) != -1)
1545        return 0;
1546    if (PyUnicode_CHECK_INTERNED(unicode))
1547        return 0;
1548    if (!PyUnicode_CheckExact(unicode))
1549        return 0;
1550#ifdef Py_DEBUG
1551    /* singleton refcount is greater than 1 */
1552    assert(!unicode_is_singleton(unicode));
1553#endif
1554    return 1;
1555}
1556
1557static int
1558unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1559{
1560    PyObject *unicode;
1561    Py_ssize_t old_length;
1562
1563    assert(p_unicode != NULL);
1564    unicode = *p_unicode;
1565
1566    assert(unicode != NULL);
1567    assert(PyUnicode_Check(unicode));
1568    assert(0 <= length);
1569
1570    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1571        old_length = PyUnicode_WSTR_LENGTH(unicode);
1572    else
1573        old_length = PyUnicode_GET_LENGTH(unicode);
1574    if (old_length == length)
1575        return 0;
1576
1577    if (length == 0) {
1578        Py_DECREF(*p_unicode);
1579        *p_unicode = unicode_empty;
1580        Py_INCREF(*p_unicode);
1581        return 0;
1582    }
1583
1584    if (!unicode_modifiable(unicode)) {
1585        PyObject *copy = resize_copy(unicode, length);
1586        if (copy == NULL)
1587            return -1;
1588        Py_DECREF(*p_unicode);
1589        *p_unicode = copy;
1590        return 0;
1591    }
1592
1593    if (PyUnicode_IS_COMPACT(unicode)) {
1594        PyObject *new_unicode = resize_compact(unicode, length);
1595        if (new_unicode == NULL)
1596            return -1;
1597        *p_unicode = new_unicode;
1598        return 0;
1599    }
1600    return resize_inplace(unicode, length);
1601}
1602
1603int
1604PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1605{
1606    PyObject *unicode;
1607    if (p_unicode == NULL) {
1608        PyErr_BadInternalCall();
1609        return -1;
1610    }
1611    unicode = *p_unicode;
1612    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1613    {
1614        PyErr_BadInternalCall();
1615        return -1;
1616    }
1617    return unicode_resize(p_unicode, length);
1618}
1619
1620static int
1621unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1622              unsigned int maxchar)
1623{
1624    PyObject *result;
1625    assert(PyUnicode_IS_READY(*p_unicode));
1626    assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
1627    if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1628        return 0;
1629    result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1630                           maxchar);
1631    if (result == NULL)
1632        return -1;
1633    _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
1634    Py_DECREF(*p_unicode);
1635    *p_unicode = result;
1636    return 0;
1637}
1638
1639static int
1640unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1641                Py_UCS4 ch)
1642{
1643    assert(ch <= MAX_UNICODE);
1644    if (unicode_widen(p_unicode, *pos, ch) < 0)
1645        return -1;
1646    PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1647                    PyUnicode_DATA(*p_unicode),
1648                    (*pos)++, ch);
1649    return 0;
1650}
1651
1652/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1653   Return the length of the input string.
1654
1655   WARNING: The function doesn't copy the terminating null character and
1656   doesn't check the maximum character (may write a latin1 character in an
1657   ASCII string). */
1658static Py_ssize_t
1659unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1660{
1661    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1662    void *data = PyUnicode_DATA(unicode);
1663
1664    switch (kind) {
1665    case PyUnicode_1BYTE_KIND: {
1666        Py_ssize_t len = strlen(str);
1667        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1668        memcpy((char *) data + index, str, len);
1669        return len;
1670    }
1671    case PyUnicode_2BYTE_KIND: {
1672        Py_UCS2 *start = (Py_UCS2 *)data + index;
1673        Py_UCS2 *ucs2 = start;
1674        assert(index <= PyUnicode_GET_LENGTH(unicode));
1675
1676        for (; *str; ++ucs2, ++str)
1677            *ucs2 = (Py_UCS2)*str;
1678
1679        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1680        return ucs2 - start;
1681    }
1682    default: {
1683        Py_UCS4 *start = (Py_UCS4 *)data + index;
1684        Py_UCS4 *ucs4 = start;
1685        assert(kind == PyUnicode_4BYTE_KIND);
1686        assert(index <= PyUnicode_GET_LENGTH(unicode));
1687
1688        for (; *str; ++ucs4, ++str)
1689            *ucs4 = (Py_UCS4)*str;
1690
1691        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1692        return ucs4 - start;
1693    }
1694    }
1695}
1696
1697
1698static PyObject*
1699get_latin1_char(unsigned char ch)
1700{
1701    PyObject *unicode = unicode_latin1[ch];
1702    if (!unicode) {
1703        unicode = PyUnicode_New(1, ch);
1704        if (!unicode)
1705            return NULL;
1706        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1707        assert(_PyUnicode_CheckConsistency(unicode, 1));
1708        unicode_latin1[ch] = unicode;
1709    }
1710    Py_INCREF(unicode);
1711    return unicode;
1712}
1713
1714PyObject *
1715PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1716{
1717    PyObject *unicode;
1718    Py_UCS4 maxchar = 0;
1719    Py_ssize_t num_surrogates;
1720
1721    if (u == NULL)
1722        return (PyObject*)_PyUnicode_New(size);
1723
1724    /* If the Unicode data is known at construction time, we can apply
1725       some optimizations which share commonly used objects. */
1726
1727    /* Optimization for empty strings */
1728    if (size == 0 && unicode_empty != NULL) {
1729        Py_INCREF(unicode_empty);
1730        return unicode_empty;
1731    }
1732
1733    /* Single character Unicode objects in the Latin-1 range are
1734       shared when using this constructor */
1735    if (size == 1 && *u < 256)
1736        return get_latin1_char((unsigned char)*u);
1737
1738    /* If not empty and not single character, copy the Unicode data
1739       into the new object */
1740    if (find_maxchar_surrogates(u, u + size,
1741                                &maxchar, &num_surrogates) == -1)
1742        return NULL;
1743
1744    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1745    if (!unicode)
1746        return NULL;
1747
1748    switch (PyUnicode_KIND(unicode)) {
1749    case PyUnicode_1BYTE_KIND:
1750        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1751                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1752        break;
1753    case PyUnicode_2BYTE_KIND:
1754#if Py_UNICODE_SIZE == 2
1755        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1756#else
1757        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1758                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1759#endif
1760        break;
1761    case PyUnicode_4BYTE_KIND:
1762#if SIZEOF_WCHAR_T == 2
1763        /* This is the only case which has to process surrogates, thus
1764           a simple copy loop is not enough and we need a function. */
1765        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1766#else
1767        assert(num_surrogates == 0);
1768        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1769#endif
1770        break;
1771    default:
1772        assert(0 && "Impossible state");
1773    }
1774
1775    return unicode_result(unicode);
1776}
1777
1778PyObject *
1779PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1780{
1781    if (size < 0) {
1782        PyErr_SetString(PyExc_SystemError,
1783                        "Negative size passed to PyUnicode_FromStringAndSize");
1784        return NULL;
1785    }
1786    if (u != NULL)
1787        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1788    else
1789        return (PyObject *)_PyUnicode_New(size);
1790}
1791
1792PyObject *
1793PyUnicode_FromString(const char *u)
1794{
1795    size_t size = strlen(u);
1796    if (size > PY_SSIZE_T_MAX) {
1797        PyErr_SetString(PyExc_OverflowError, "input too long");
1798        return NULL;
1799    }
1800    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1801}
1802
1803PyObject *
1804_PyUnicode_FromId(_Py_Identifier *id)
1805{
1806    if (!id->object) {
1807        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1808                                                  strlen(id->string),
1809                                                  NULL, NULL);
1810        if (!id->object)
1811            return NULL;
1812        PyUnicode_InternInPlace(&id->object);
1813        assert(!id->next);
1814        id->next = static_strings;
1815        static_strings = id;
1816    }
1817    return id->object;
1818}
1819
1820void
1821_PyUnicode_ClearStaticStrings()
1822{
1823    _Py_Identifier *i;
1824    for (i = static_strings; i; i = i->next) {
1825        Py_DECREF(i->object);
1826        i->object = NULL;
1827        i->next = NULL;
1828    }
1829}
1830
1831/* Internal function, doesn't check maximum character */
1832
1833PyObject*
1834_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1835{
1836    const unsigned char *s = (const unsigned char *)buffer;
1837    PyObject *unicode;
1838    if (size == 1) {
1839#ifdef Py_DEBUG
1840        assert(s[0] < 128);
1841#endif
1842        return get_latin1_char(s[0]);
1843    }
1844    unicode = PyUnicode_New(size, 127);
1845    if (!unicode)
1846        return NULL;
1847    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1848    assert(_PyUnicode_CheckConsistency(unicode, 1));
1849    return unicode;
1850}
1851
1852static Py_UCS4
1853kind_maxchar_limit(unsigned int kind)
1854{
1855    switch (kind) {
1856    case PyUnicode_1BYTE_KIND:
1857        return 0x80;
1858    case PyUnicode_2BYTE_KIND:
1859        return 0x100;
1860    case PyUnicode_4BYTE_KIND:
1861        return 0x10000;
1862    default:
1863        assert(0 && "invalid kind");
1864        return MAX_UNICODE;
1865    }
1866}
1867
1868Py_LOCAL_INLINE(Py_UCS4)
1869align_maxchar(Py_UCS4 maxchar)
1870{
1871    if (maxchar <= 127)
1872        return 127;
1873    else if (maxchar <= 255)
1874        return 255;
1875    else if (maxchar <= 65535)
1876        return 65535;
1877    else
1878        return MAX_UNICODE;
1879}
1880
1881static PyObject*
1882_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1883{
1884    PyObject *res;
1885    unsigned char max_char;
1886
1887    if (size == 0) {
1888        Py_INCREF(unicode_empty);
1889        return unicode_empty;
1890    }
1891    assert(size > 0);
1892    if (size == 1)
1893        return get_latin1_char(u[0]);
1894
1895    max_char = ucs1lib_find_max_char(u, u + size);
1896    res = PyUnicode_New(size, max_char);
1897    if (!res)
1898        return NULL;
1899    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1900    assert(_PyUnicode_CheckConsistency(res, 1));
1901    return res;
1902}
1903
1904static PyObject*
1905_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1906{
1907    PyObject *res;
1908    Py_UCS2 max_char;
1909
1910    if (size == 0) {
1911        Py_INCREF(unicode_empty);
1912        return unicode_empty;
1913    }
1914    assert(size > 0);
1915    if (size == 1) {
1916        Py_UCS4 ch = u[0];
1917        if (ch < 256)
1918            return get_latin1_char((unsigned char)ch);
1919
1920        res = PyUnicode_New(1, ch);
1921        if (res == NULL)
1922            return NULL;
1923        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1924        assert(_PyUnicode_CheckConsistency(res, 1));
1925        return res;
1926    }
1927
1928    max_char = ucs2lib_find_max_char(u, u + size);
1929    res = PyUnicode_New(size, max_char);
1930    if (!res)
1931        return NULL;
1932    if (max_char >= 256)
1933        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1934    else {
1935        _PyUnicode_CONVERT_BYTES(
1936            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1937    }
1938    assert(_PyUnicode_CheckConsistency(res, 1));
1939    return res;
1940}
1941
1942static PyObject*
1943_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1944{
1945    PyObject *res;
1946    Py_UCS4 max_char;
1947
1948    if (size == 0) {
1949        Py_INCREF(unicode_empty);
1950        return unicode_empty;
1951    }
1952    assert(size > 0);
1953    if (size == 1) {
1954        Py_UCS4 ch = u[0];
1955        if (ch < 256)
1956            return get_latin1_char((unsigned char)ch);
1957
1958        res = PyUnicode_New(1, ch);
1959        if (res == NULL)
1960            return NULL;
1961        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1962        assert(_PyUnicode_CheckConsistency(res, 1));
1963        return res;
1964    }
1965
1966    max_char = ucs4lib_find_max_char(u, u + size);
1967    res = PyUnicode_New(size, max_char);
1968    if (!res)
1969        return NULL;
1970    if (max_char < 256)
1971        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1972                                 PyUnicode_1BYTE_DATA(res));
1973    else if (max_char < 0x10000)
1974        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1975                                 PyUnicode_2BYTE_DATA(res));
1976    else
1977        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1978    assert(_PyUnicode_CheckConsistency(res, 1));
1979    return res;
1980}
1981
1982PyObject*
1983PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1984{
1985    if (size < 0) {
1986        PyErr_SetString(PyExc_ValueError, "size must be positive");
1987        return NULL;
1988    }
1989    switch (kind) {
1990    case PyUnicode_1BYTE_KIND:
1991        return _PyUnicode_FromUCS1(buffer, size);
1992    case PyUnicode_2BYTE_KIND:
1993        return _PyUnicode_FromUCS2(buffer, size);
1994    case PyUnicode_4BYTE_KIND:
1995        return _PyUnicode_FromUCS4(buffer, size);
1996    default:
1997        PyErr_SetString(PyExc_SystemError, "invalid kind");
1998        return NULL;
1999    }
2000}
2001
2002Py_UCS4
2003_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2004{
2005    enum PyUnicode_Kind kind;
2006    void *startptr, *endptr;
2007
2008    assert(PyUnicode_IS_READY(unicode));
2009    assert(0 <= start);
2010    assert(end <= PyUnicode_GET_LENGTH(unicode));
2011    assert(start <= end);
2012
2013    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2014        return PyUnicode_MAX_CHAR_VALUE(unicode);
2015
2016    if (start == end)
2017        return 127;
2018
2019    if (PyUnicode_IS_ASCII(unicode))
2020        return 127;
2021
2022    kind = PyUnicode_KIND(unicode);
2023    startptr = PyUnicode_DATA(unicode);
2024    endptr = (char *)startptr + end * kind;
2025    startptr = (char *)startptr + start * kind;
2026    switch(kind) {
2027    case PyUnicode_1BYTE_KIND:
2028        return ucs1lib_find_max_char(startptr, endptr);
2029    case PyUnicode_2BYTE_KIND:
2030        return ucs2lib_find_max_char(startptr, endptr);
2031    case PyUnicode_4BYTE_KIND:
2032        return ucs4lib_find_max_char(startptr, endptr);
2033    default:
2034        assert(0);
2035        return 0;
2036    }
2037}
2038
2039/* Ensure that a string uses the most efficient storage, if it is not the
2040   case: create a new string with of the right kind. Write NULL into *p_unicode
2041   on error. */
2042static void
2043unicode_adjust_maxchar(PyObject **p_unicode)
2044{
2045    PyObject *unicode, *copy;
2046    Py_UCS4 max_char;
2047    Py_ssize_t len;
2048    unsigned int kind;
2049
2050    assert(p_unicode != NULL);
2051    unicode = *p_unicode;
2052    assert(PyUnicode_IS_READY(unicode));
2053    if (PyUnicode_IS_ASCII(unicode))
2054        return;
2055
2056    len = PyUnicode_GET_LENGTH(unicode);
2057    kind = PyUnicode_KIND(unicode);
2058    if (kind == PyUnicode_1BYTE_KIND) {
2059        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2060        max_char = ucs1lib_find_max_char(u, u + len);
2061        if (max_char >= 128)
2062            return;
2063    }
2064    else if (kind == PyUnicode_2BYTE_KIND) {
2065        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2066        max_char = ucs2lib_find_max_char(u, u + len);
2067        if (max_char >= 256)
2068            return;
2069    }
2070    else {
2071        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2072        assert(kind == PyUnicode_4BYTE_KIND);
2073        max_char = ucs4lib_find_max_char(u, u + len);
2074        if (max_char >= 0x10000)
2075            return;
2076    }
2077    copy = PyUnicode_New(len, max_char);
2078    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2079    Py_DECREF(unicode);
2080    *p_unicode = copy;
2081}
2082
2083PyObject*
2084_PyUnicode_Copy(PyObject *unicode)
2085{
2086    Py_ssize_t length;
2087    PyObject *copy;
2088
2089    if (!PyUnicode_Check(unicode)) {
2090        PyErr_BadInternalCall();
2091        return NULL;
2092    }
2093    if (PyUnicode_READY(unicode) == -1)
2094        return NULL;
2095
2096    length = PyUnicode_GET_LENGTH(unicode);
2097    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2098    if (!copy)
2099        return NULL;
2100    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2101
2102    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2103              length * PyUnicode_KIND(unicode));
2104    assert(_PyUnicode_CheckConsistency(copy, 1));
2105    return copy;
2106}
2107
2108
2109/* Widen Unicode objects to larger buffers. Don't write terminating null
2110   character. Return NULL on error. */
2111
2112void*
2113_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2114{
2115    Py_ssize_t len;
2116    void *result;
2117    unsigned int skind;
2118
2119    if (PyUnicode_READY(s) == -1)
2120        return NULL;
2121
2122    len = PyUnicode_GET_LENGTH(s);
2123    skind = PyUnicode_KIND(s);
2124    if (skind >= kind) {
2125        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2126        return NULL;
2127    }
2128    switch (kind) {
2129    case PyUnicode_2BYTE_KIND:
2130        result = PyMem_Malloc(len * sizeof(Py_UCS2));
2131        if (!result)
2132            return PyErr_NoMemory();
2133        assert(skind == PyUnicode_1BYTE_KIND);
2134        _PyUnicode_CONVERT_BYTES(
2135            Py_UCS1, Py_UCS2,
2136            PyUnicode_1BYTE_DATA(s),
2137            PyUnicode_1BYTE_DATA(s) + len,
2138            result);
2139        return result;
2140    case PyUnicode_4BYTE_KIND:
2141        result = PyMem_Malloc(len * sizeof(Py_UCS4));
2142        if (!result)
2143            return PyErr_NoMemory();
2144        if (skind == PyUnicode_2BYTE_KIND) {
2145            _PyUnicode_CONVERT_BYTES(
2146                Py_UCS2, Py_UCS4,
2147                PyUnicode_2BYTE_DATA(s),
2148                PyUnicode_2BYTE_DATA(s) + len,
2149                result);
2150        }
2151        else {
2152            assert(skind == PyUnicode_1BYTE_KIND);
2153            _PyUnicode_CONVERT_BYTES(
2154                Py_UCS1, Py_UCS4,
2155                PyUnicode_1BYTE_DATA(s),
2156                PyUnicode_1BYTE_DATA(s) + len,
2157                result);
2158        }
2159        return result;
2160    default:
2161        break;
2162    }
2163    PyErr_SetString(PyExc_SystemError, "invalid kind");
2164    return NULL;
2165}
2166
2167static Py_UCS4*
2168as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2169        int copy_null)
2170{
2171    int kind;
2172    void *data;
2173    Py_ssize_t len, targetlen;
2174    if (PyUnicode_READY(string) == -1)
2175        return NULL;
2176    kind = PyUnicode_KIND(string);
2177    data = PyUnicode_DATA(string);
2178    len = PyUnicode_GET_LENGTH(string);
2179    targetlen = len;
2180    if (copy_null)
2181        targetlen++;
2182    if (!target) {
2183        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2184            PyErr_NoMemory();
2185            return NULL;
2186        }
2187        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2188        if (!target) {
2189            PyErr_NoMemory();
2190            return NULL;
2191        }
2192    }
2193    else {
2194        if (targetsize < targetlen) {
2195            PyErr_Format(PyExc_SystemError,
2196                         "string is longer than the buffer");
2197            if (copy_null && 0 < targetsize)
2198                target[0] = 0;
2199            return NULL;
2200        }
2201    }
2202    if (kind == PyUnicode_1BYTE_KIND) {
2203        Py_UCS1 *start = (Py_UCS1 *) data;
2204        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2205    }
2206    else if (kind == PyUnicode_2BYTE_KIND) {
2207        Py_UCS2 *start = (Py_UCS2 *) data;
2208        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2209    }
2210    else {
2211        assert(kind == PyUnicode_4BYTE_KIND);
2212        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2213    }
2214    if (copy_null)
2215        target[len] = 0;
2216    return target;
2217}
2218
2219Py_UCS4*
2220PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2221                 int copy_null)
2222{
2223    if (target == NULL || targetsize < 0) {
2224        PyErr_BadInternalCall();
2225        return NULL;
2226    }
2227    return as_ucs4(string, target, targetsize, copy_null);
2228}
2229
2230Py_UCS4*
2231PyUnicode_AsUCS4Copy(PyObject *string)
2232{
2233    return as_ucs4(string, NULL, 0, 1);
2234}
2235
2236#ifdef HAVE_WCHAR_H
2237
2238PyObject *
2239PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2240{
2241    if (w == NULL) {
2242        if (size == 0) {
2243            Py_INCREF(unicode_empty);
2244            return unicode_empty;
2245        }
2246        PyErr_BadInternalCall();
2247        return NULL;
2248    }
2249
2250    if (size == -1) {
2251        size = wcslen(w);
2252    }
2253
2254    return PyUnicode_FromUnicode(w, size);
2255}
2256
2257#endif /* HAVE_WCHAR_H */
2258
2259static void
2260makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2261        int zeropad, int width, int precision, char c)
2262{
2263    *fmt++ = '%';
2264    if (width) {
2265        if (zeropad)
2266            *fmt++ = '0';
2267        fmt += sprintf(fmt, "%d", width);
2268    }
2269    if (precision)
2270        fmt += sprintf(fmt, ".%d", precision);
2271    if (longflag)
2272        *fmt++ = 'l';
2273    else if (longlongflag) {
2274        /* longlongflag should only ever be nonzero on machines with
2275           HAVE_LONG_LONG defined */
2276#ifdef HAVE_LONG_LONG
2277        char *f = PY_FORMAT_LONG_LONG;
2278        while (*f)
2279            *fmt++ = *f++;
2280#else
2281        /* we shouldn't ever get here */
2282        assert(0);
2283        *fmt++ = 'l';
2284#endif
2285    }
2286    else if (size_tflag) {
2287        char *f = PY_FORMAT_SIZE_T;
2288        while (*f)
2289            *fmt++ = *f++;
2290    }
2291    *fmt++ = c;
2292    *fmt = '\0';
2293}
2294
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' that starts a conversion.  Parse the optional
   width, ".precision" and length modifier ("l", "ll" when HAVE_LONG_LONG is
   defined, or "z") and return a pointer to the conversion character.  A
   length modifier is only recognised when followed by 'd', 'u' or 'i'.
   Each output pointer may be NULL, in which case that value is not
   reported. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int w = 0, prec = 0;
    int is_long = 0, is_longlong = 0, is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (f++; Py_ISDIGIT((unsigned)*f); f++)
        w = (w*10) + *f - '0';

    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            prec = (prec*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu). */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Report only the values the caller asked for. */
    if (p_width != NULL)
        *p_width = w;
    if (p_precision != NULL)
        *p_precision = prec;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2359
/* Maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In the expression below, 1 + (SIZEOF_LONG_LONG*53-1) / 22 is the integer
   form of ceil(SIZEOF_LONG_LONG*53/22); the remaining 1 is the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2367
2368PyObject *
2369PyUnicode_FromFormatV(const char *format, va_list vargs)
2370{
2371    va_list count;
2372    Py_ssize_t callcount = 0;
2373    PyObject **callresults = NULL;
2374    PyObject **callresult = NULL;
2375    Py_ssize_t n = 0;
2376    int width = 0;
2377    int precision = 0;
2378    int zeropad;
2379    const char* f;
2380    PyObject *string;
2381    /* used by sprintf */
2382    char fmt[61]; /* should be enough for %0width.precisionlld */
2383    Py_UCS4 maxchar = 127; /* result is ASCII by default */
2384    Py_UCS4 argmaxchar;
2385    Py_ssize_t numbersize = 0;
2386    char *numberresults = NULL;
2387    char *numberresult = NULL;
2388    Py_ssize_t i;
2389    int kind;
2390    void *data;
2391
2392    Py_VA_COPY(count, vargs);
2393    /* step 1: count the number of %S/%R/%A/%s format specifications
2394     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2395     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2396     * result in an array)
2397     * also estimate a upper bound for all the number formats in the string,
2398     * numbers will be formatted in step 3 and be kept in a '\0'-separated
2399     * buffer before putting everything together. */
2400    for (f = format; *f; f++) {
2401        if (*f == '%') {
2402            int longlongflag;
2403            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2404            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2405            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2406                ++callcount;
2407
2408            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
2409#ifdef HAVE_LONG_LONG
2410                if (longlongflag) {
2411                    if (width < MAX_LONG_LONG_CHARS)
2412                        width = MAX_LONG_LONG_CHARS;
2413                }
2414                else
2415#endif
2416                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2417                       including sign.  Decimal takes the most space.  This
2418                       isn't enough for octal.  If a width is specified we
2419                       need more (which we allocate later). */
2420                    if (width < MAX_LONG_CHARS)
2421                        width = MAX_LONG_CHARS;
2422
2423                /* account for the size + '\0' to separate numbers
2424                   inside of the numberresults buffer */
2425                numbersize += (width + 1);
2426            }
2427        }
2428        else if ((unsigned char)*f > 127) {
2429            PyErr_Format(PyExc_ValueError,
2430                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2431                "string, got a non-ASCII byte: 0x%02x",
2432                (unsigned char)*f);
2433            return NULL;
2434        }
2435    }
2436    /* step 2: allocate memory for the results of
2437     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2438    if (callcount) {
2439        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2440        if (!callresults) {
2441            PyErr_NoMemory();
2442            return NULL;
2443        }
2444        callresult = callresults;
2445    }
2446    /* step 2.5: allocate memory for the results of formating numbers */
2447    if (numbersize) {
2448        numberresults = PyObject_Malloc(numbersize);
2449        if (!numberresults) {
2450            PyErr_NoMemory();
2451            goto fail;
2452        }
2453        numberresult = numberresults;
2454    }
2455
2456    /* step 3: format numbers and figure out how large a buffer we need */
2457    for (f = format; *f; f++) {
2458        if (*f == '%') {
2459            const char* p;
2460            int longflag;
2461            int longlongflag;
2462            int size_tflag;
2463            int numprinted;
2464
2465            p = f;
2466            zeropad = (f[1] == '0');
2467            f = parse_format_flags(f, &width, &precision,
2468                                   &longflag, &longlongflag, &size_tflag);
2469            switch (*f) {
2470            case 'c':
2471            {
2472                Py_UCS4 ordinal = va_arg(count, int);
2473                maxchar = MAX_MAXCHAR(maxchar, ordinal);
2474                n++;
2475                break;
2476            }
2477            case '%':
2478                n++;
2479                break;
2480            case 'i':
2481            case 'd':
2482                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2483                        width, precision, *f);
2484                if (longflag)
2485                    numprinted = sprintf(numberresult, fmt,
2486                                         va_arg(count, long));
2487#ifdef HAVE_LONG_LONG
2488                else if (longlongflag)
2489                    numprinted = sprintf(numberresult, fmt,
2490                                         va_arg(count, PY_LONG_LONG));
2491#endif
2492                else if (size_tflag)
2493                    numprinted = sprintf(numberresult, fmt,
2494                                         va_arg(count, Py_ssize_t));
2495                else
2496                    numprinted = sprintf(numberresult, fmt,
2497                                         va_arg(count, int));
2498                n += numprinted;
2499                /* advance by +1 to skip over the '\0' */
2500                numberresult += (numprinted + 1);
2501                assert(*(numberresult - 1) == '\0');
2502                assert(*(numberresult - 2) != '\0');
2503                assert(numprinted >= 0);
2504                assert(numberresult <= numberresults + numbersize);
2505                break;
2506            case 'u':
2507                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2508                        width, precision, 'u');
2509                if (longflag)
2510                    numprinted = sprintf(numberresult, fmt,
2511                                         va_arg(count, unsigned long));
2512#ifdef HAVE_LONG_LONG
2513                else if (longlongflag)
2514                    numprinted = sprintf(numberresult, fmt,
2515                                         va_arg(count, unsigned PY_LONG_LONG));
2516#endif
2517                else if (size_tflag)
2518                    numprinted = sprintf(numberresult, fmt,
2519                                         va_arg(count, size_t));
2520                else
2521                    numprinted = sprintf(numberresult, fmt,
2522                                         va_arg(count, unsigned int));
2523                n += numprinted;
2524                numberresult += (numprinted + 1);
2525                assert(*(numberresult - 1) == '\0');
2526                assert(*(numberresult - 2) != '\0');
2527                assert(numprinted >= 0);
2528                assert(numberresult <= numberresults + numbersize);
2529                break;
2530            case 'x':
2531                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2532                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2533                n += numprinted;
2534                numberresult += (numprinted + 1);
2535                assert(*(numberresult - 1) == '\0');
2536                assert(*(numberresult - 2) != '\0');
2537                assert(numprinted >= 0);
2538                assert(numberresult <= numberresults + numbersize);
2539                break;
2540            case 'p':
2541                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2542                /* %p is ill-defined:  ensure leading 0x. */
2543                if (numberresult[1] == 'X')
2544                    numberresult[1] = 'x';
2545                else if (numberresult[1] != 'x') {
2546                    memmove(numberresult + 2, numberresult,
2547                            strlen(numberresult) + 1);
2548                    numberresult[0] = '0';
2549                    numberresult[1] = 'x';
2550                    numprinted += 2;
2551                }
2552                n += numprinted;
2553                numberresult += (numprinted + 1);
2554                assert(*(numberresult - 1) == '\0');
2555                assert(*(numberresult - 2) != '\0');
2556                assert(numprinted >= 0);
2557                assert(numberresult <= numberresults + numbersize);
2558                break;
2559            case 's':
2560            {
2561                /* UTF-8 */
2562                const char *s = va_arg(count, const char*);
2563                PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2564                if (!str)
2565                    goto fail;
2566                /* since PyUnicode_DecodeUTF8 returns already flexible
2567                   unicode objects, there is no need to call ready on them */
2568                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2569                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2570                n += PyUnicode_GET_LENGTH(str);
2571                /* Remember the str and switch to the next slot */
2572                *callresult++ = str;
2573                break;
2574            }
2575            case 'U':
2576            {
2577                PyObject *obj = va_arg(count, PyObject *);
2578                assert(obj && _PyUnicode_CHECK(obj));
2579                if (PyUnicode_READY(obj) == -1)
2580                    goto fail;
2581                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2582                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2583                n += PyUnicode_GET_LENGTH(obj);
2584                break;
2585            }
2586            case 'V':
2587            {
2588                PyObject *obj = va_arg(count, PyObject *);
2589                const char *str = va_arg(count, const char *);
2590                PyObject *str_obj;
2591                assert(obj || str);
2592                assert(!obj || _PyUnicode_CHECK(obj));
2593                if (obj) {
2594                    if (PyUnicode_READY(obj) == -1)
2595                        goto fail;
2596                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2597                    maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2598                    n += PyUnicode_GET_LENGTH(obj);
2599                    *callresult++ = NULL;
2600                }
2601                else {
2602                    str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2603                    if (!str_obj)
2604                        goto fail;
2605                    if (PyUnicode_READY(str_obj) == -1) {
2606                        Py_DECREF(str_obj);
2607                        goto fail;
2608                    }
2609                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2610                    maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2611                    n += PyUnicode_GET_LENGTH(str_obj);
2612                    *callresult++ = str_obj;
2613                }
2614                break;
2615            }
2616            case 'S':
2617            {
2618                PyObject *obj = va_arg(count, PyObject *);
2619                PyObject *str;
2620                assert(obj);
2621                str = PyObject_Str(obj);
2622                if (!str)
2623                    goto fail;
2624                if (PyUnicode_READY(str) == -1) {
2625                    Py_DECREF(str);
2626                    goto fail;
2627                }
2628                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2629                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2630                n += PyUnicode_GET_LENGTH(str);
2631                /* Remember the str and switch to the next slot */
2632                *callresult++ = str;
2633                break;
2634            }
2635            case 'R':
2636            {
2637                PyObject *obj = va_arg(count, PyObject *);
2638                PyObject *repr;
2639                assert(obj);
2640                repr = PyObject_Repr(obj);
2641                if (!repr)
2642                    goto fail;
2643                if (PyUnicode_READY(repr) == -1) {
2644                    Py_DECREF(repr);
2645                    goto fail;
2646                }
2647                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2648                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2649                n += PyUnicode_GET_LENGTH(repr);
2650                /* Remember the repr and switch to the next slot */
2651                *callresult++ = repr;
2652                break;
2653            }
2654            case 'A':
2655            {
2656                PyObject *obj = va_arg(count, PyObject *);
2657                PyObject *ascii;
2658                assert(obj);
2659                ascii = PyObject_ASCII(obj);
2660                if (!ascii)
2661                    goto fail;
2662                if (PyUnicode_READY(ascii) == -1) {
2663                    Py_DECREF(ascii);
2664                    goto fail;
2665                }
2666                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2667                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2668                n += PyUnicode_GET_LENGTH(ascii);
2669                /* Remember the repr and switch to the next slot */
2670                *callresult++ = ascii;
2671                break;
2672            }
2673            default:
2674                /* if we stumble upon an unknown
2675                   formatting code, copy the rest of
2676                   the format string to the output
2677                   string. (we cannot just skip the
2678                   code, since there's no way to know
2679                   what's in the argument list) */
2680                n += strlen(p);
2681                goto expand;
2682            }
2683        } else
2684            n++;
2685    }
2686  expand:
2687    /* step 4: fill the buffer */
2688    /* Since we've analyzed how much space we need,
2689       we don't have to resize the string.
2690       There can be no errors beyond this point. */
2691    string = PyUnicode_New(n, maxchar);
2692    if (!string)
2693        goto fail;
2694    kind = PyUnicode_KIND(string);
2695    data = PyUnicode_DATA(string);
2696    callresult = callresults;
2697    numberresult = numberresults;
2698
2699    for (i = 0, f = format; *f; f++) {
2700        if (*f == '%') {
2701            const char* p;
2702
2703            p = f;
2704            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2705            /* checking for == because the last argument could be a empty
2706               string, which causes i to point to end, the assert at the end of
2707               the loop */
2708            assert(i <= PyUnicode_GET_LENGTH(string));
2709
2710            switch (*f) {
2711            case 'c':
2712            {
2713                const int ordinal = va_arg(vargs, int);
2714                PyUnicode_WRITE(kind, data, i++, ordinal);
2715                break;
2716            }
2717            case 'i':
2718            case 'd':
2719            case 'u':
2720            case 'x':
2721            case 'p':
2722            {
2723                Py_ssize_t written;
2724                /* unused, since we already have the result */
2725                if (*f == 'p')
2726                    (void) va_arg(vargs, void *);
2727                else
2728                    (void) va_arg(vargs, int);
2729                /* extract the result from numberresults and append. */
2730                written = unicode_write_cstr(string, i, numberresult);
2731                /* skip over the separating '\0' */
2732                i += written;
2733                numberresult += written;
2734                assert(*numberresult == '\0');
2735                numberresult++;
2736                assert(numberresult <= numberresults + numbersize);
2737                break;
2738            }
2739            case 's':
2740            {
2741                /* unused, since we already have the result */
2742                Py_ssize_t size;
2743                (void) va_arg(vargs, char *);
2744                size = PyUnicode_GET_LENGTH(*callresult);
2745                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2746                _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2747                i += size;
2748                /* We're done with the unicode()/repr() => forget it */
2749                Py_DECREF(*callresult);
2750                /* switch to next unicode()/repr() result */
2751                ++callresult;
2752                break;
2753            }
2754            case 'U':
2755            {
2756                PyObject *obj = va_arg(vargs, PyObject *);
2757                Py_ssize_t size;
2758                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2759                size = PyUnicode_GET_LENGTH(obj);
2760                _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
2761                i += size;
2762                break;
2763            }
2764            case 'V':
2765            {
2766                Py_ssize_t size;
2767                PyObject *obj = va_arg(vargs, PyObject *);
2768                va_arg(vargs, const char *);
2769                if (obj) {
2770                    size = PyUnicode_GET_LENGTH(obj);
2771                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2772                    _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
2773                    i += size;
2774                } else {
2775                    size = PyUnicode_GET_LENGTH(*callresult);
2776                    assert(PyUnicode_KIND(*callresult) <=
2777                           PyUnicode_KIND(string));
2778                    _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2779                    i += size;
2780                    Py_DECREF(*callresult);
2781                }
2782                ++callresult;
2783                break;
2784            }
2785            case 'S':
2786            case 'R':
2787            case 'A':
2788            {
2789                Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2790                /* unused, since we already have the result */
2791                (void) va_arg(vargs, PyObject *);
2792                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2793                _PyUnicode_FastCopyCharacters(string, i, *callresult, 0,  size);
2794                i += size;
2795                /* We're done with the unicode()/repr() => forget it */
2796                Py_DECREF(*callresult);
2797                /* switch to next unicode()/repr() result */
2798                ++callresult;
2799                break;
2800            }
2801            case '%':
2802                PyUnicode_WRITE(kind, data, i++, '%');
2803                break;
2804            default:
2805                i += unicode_write_cstr(string, i, p);
2806                assert(i == PyUnicode_GET_LENGTH(string));
2807                goto end;
2808            }
2809        }
2810        else {
2811            assert(i < PyUnicode_GET_LENGTH(string));
2812            PyUnicode_WRITE(kind, data, i++, *f);
2813        }
2814    }
2815    assert(i == PyUnicode_GET_LENGTH(string));
2816
2817  end:
2818    if (callresults)
2819        PyObject_Free(callresults);
2820    if (numberresults)
2821        PyObject_Free(numberresults);
2822    return unicode_result(string);
2823  fail:
2824    if (callresults) {
2825        PyObject **callresult2 = callresults;
2826        while (callresult2 < callresult) {
2827            Py_XDECREF(*callresult2);
2828            ++callresult2;
2829        }
2830        PyObject_Free(callresults);
2831    }
2832    if (numberresults)
2833        PyObject_Free(numberresults);
2834    return NULL;
2835}
2836
2837PyObject *
2838PyUnicode_FromFormat(const char *format, ...)
2839{
2840    PyObject* ret;
2841    va_list vargs;
2842
2843#ifdef HAVE_STDARG_PROTOTYPES
2844    va_start(vargs, format);
2845#else
2846    va_start(vargs);
2847#endif
2848    ret = PyUnicode_FromFormatV(format, vargs);
2849    va_end(vargs);
2850    return ret;
2851}
2852
2853#ifdef HAVE_WCHAR_H
2854
2855/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2856   convert a Unicode object to a wide character string.
2857
2858   - If w is NULL: return the number of wide characters (including the null
2859     character) required to convert the unicode object. Ignore size argument.
2860
2861   - Otherwise: return the number of wide characters (excluding the null
2862     character) written into w. Write at most size wide characters (including
2863     the null character). */
2864static Py_ssize_t
2865unicode_aswidechar(PyObject *unicode,
2866                   wchar_t *w,
2867                   Py_ssize_t size)
2868{
2869    Py_ssize_t res;
2870    const wchar_t *wstr;
2871
2872    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2873    if (wstr == NULL)
2874        return -1;
2875
2876    if (w != NULL) {
2877        if (size > res)
2878            size = res + 1;
2879        else
2880            res = size;
2881        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2882        return res;
2883    }
2884    else
2885        return res + 1;
2886}
2887
2888Py_ssize_t
2889PyUnicode_AsWideChar(PyObject *unicode,
2890                     wchar_t *w,
2891                     Py_ssize_t size)
2892{
2893    if (unicode == NULL) {
2894        PyErr_BadInternalCall();
2895        return -1;
2896    }
2897    return unicode_aswidechar(unicode, w, size);
2898}
2899
2900wchar_t*
2901PyUnicode_AsWideCharString(PyObject *unicode,
2902                           Py_ssize_t *size)
2903{
2904    wchar_t* buffer;
2905    Py_ssize_t buflen;
2906
2907    if (unicode == NULL) {
2908        PyErr_BadInternalCall();
2909        return NULL;
2910    }
2911
2912    buflen = unicode_aswidechar(unicode, NULL, 0);
2913    if (buflen == -1)
2914        return NULL;
2915    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2916        PyErr_NoMemory();
2917        return NULL;
2918    }
2919
2920    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2921    if (buffer == NULL) {
2922        PyErr_NoMemory();
2923        return NULL;
2924    }
2925    buflen = unicode_aswidechar(unicode, buffer, buflen);
2926    if (buflen == -1)
2927        return NULL;
2928    if (size != NULL)
2929        *size = buflen;
2930    return buffer;
2931}
2932
2933#endif /* HAVE_WCHAR_H */
2934
2935PyObject *
2936PyUnicode_FromOrdinal(int ordinal)
2937{
2938    PyObject *v;
2939    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2940        PyErr_SetString(PyExc_ValueError,
2941                        "chr() arg not in range(0x110000)");
2942        return NULL;
2943    }
2944
2945    if (ordinal < 256)
2946        return get_latin1_char(ordinal);
2947
2948    v = PyUnicode_New(1, ordinal);
2949    if (v == NULL)
2950        return NULL;
2951    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2952    assert(_PyUnicode_CheckConsistency(v, 1));
2953    return v;
2954}
2955
2956PyObject *
2957PyUnicode_FromObject(register PyObject *obj)
2958{
2959    /* XXX Perhaps we should make this API an alias of
2960       PyObject_Str() instead ?! */
2961    if (PyUnicode_CheckExact(obj)) {
2962        if (PyUnicode_READY(obj) == -1)
2963            return NULL;
2964        Py_INCREF(obj);
2965        return obj;
2966    }
2967    if (PyUnicode_Check(obj)) {
2968        /* For a Unicode subtype that's not a Unicode object,
2969           return a true Unicode object with the same data. */
2970        return _PyUnicode_Copy(obj);
2971    }
2972    PyErr_Format(PyExc_TypeError,
2973                 "Can't convert '%.100s' object to str implicitly",
2974                 Py_TYPE(obj)->tp_name);
2975    return NULL;
2976}
2977
2978PyObject *
2979PyUnicode_FromEncodedObject(register PyObject *obj,
2980                            const char *encoding,
2981                            const char *errors)
2982{
2983    Py_buffer buffer;
2984    PyObject *v;
2985
2986    if (obj == NULL) {
2987        PyErr_BadInternalCall();
2988        return NULL;
2989    }
2990
2991    /* Decoding bytes objects is the most common case and should be fast */
2992    if (PyBytes_Check(obj)) {
2993        if (PyBytes_GET_SIZE(obj) == 0) {
2994            Py_INCREF(unicode_empty);
2995            v = unicode_empty;
2996        }
2997        else {
2998            v = PyUnicode_Decode(
2999                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3000                    encoding, errors);
3001        }
3002        return v;
3003    }
3004
3005    if (PyUnicode_Check(obj)) {
3006        PyErr_SetString(PyExc_TypeError,
3007                        "decoding str is not supported");
3008        return NULL;
3009    }
3010
3011    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3012    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3013        PyErr_Format(PyExc_TypeError,
3014                     "coercing to str: need bytes, bytearray "
3015                     "or buffer-like object, %.80s found",
3016                     Py_TYPE(obj)->tp_name);
3017        return NULL;
3018    }
3019
3020    if (buffer.len == 0) {
3021        Py_INCREF(unicode_empty);
3022        v = unicode_empty;
3023    }
3024    else
3025        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3026
3027    PyBuffer_Release(&buffer);
3028    return v;
3029}
3030
/* Write a normalized copy of `encoding` into `lower`: upper-case letters
   become lower-case and '_' becomes '-', so that e.g. "UTF_8" matches
   "utf-8".  A NULL `encoding` is treated as "utf-8".

   `lower_len` is the capacity of `lower`, terminating null included.
   Return 1 on success, 0 if `encoding` is longer than lower_len-1
   characters. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *limit;

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    /* Last usable slot: reserved for the terminating null. */
    limit = &lower[lower_len - 1];
    for (src = encoding; *src != '\0'; src++) {
        char c;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            c = Py_TOLOWER(*src);
        else if (*src == '_')
            c = '-';
        else
            c = *src;
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
3067
3068PyObject *
3069PyUnicode_Decode(const char *s,
3070                 Py_ssize_t size,
3071                 const char *encoding,
3072                 const char *errors)
3073{
3074    PyObject *buffer = NULL, *unicode;
3075    Py_buffer info;
3076    char lower[11];  /* Enough for any encoding shortcut */
3077
3078    /* Shortcuts for common default encodings */
3079    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3080        if ((strcmp(lower, "utf-8") == 0) ||
3081            (strcmp(lower, "utf8") == 0))
3082            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3083        else if ((strcmp(lower, "latin-1") == 0) ||
3084                 (strcmp(lower, "latin1") == 0) ||
3085                 (strcmp(lower, "iso-8859-1") == 0))
3086            return PyUnicode_DecodeLatin1(s, size, errors);
3087#ifdef HAVE_MBCS
3088        else if (strcmp(lower, "mbcs") == 0)
3089            return PyUnicode_DecodeMBCS(s, size, errors);
3090#endif
3091        else if (strcmp(lower, "ascii") == 0)
3092            return PyUnicode_DecodeASCII(s, size, errors);
3093        else if (strcmp(lower, "utf-16") == 0)
3094            return PyUnicode_DecodeUTF16(s, size, errors, 0);
3095        else if (strcmp(lower, "utf-32") == 0)
3096            return PyUnicode_DecodeUTF32(s, size, errors, 0);
3097    }
3098
3099    /* Decode via the codec registry */
3100    buffer = NULL;
3101    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3102        goto onError;
3103    buffer = PyMemoryView_FromBuffer(&info);
3104    if (buffer == NULL)
3105        goto onError;
3106    unicode = PyCodec_Decode(buffer, encoding, errors);
3107    if (unicode == NULL)
3108        goto onError;
3109    if (!PyUnicode_Check(unicode)) {
3110        PyErr_Format(PyExc_TypeError,
3111                     "decoder did not return a str object (type=%.400s)",
3112                     Py_TYPE(unicode)->tp_name);
3113        Py_DECREF(unicode);
3114        goto onError;
3115    }
3116    Py_DECREF(buffer);
3117    return unicode_result(unicode);
3118
3119  onError:
3120    Py_XDECREF(buffer);
3121    return NULL;
3122}
3123
3124PyObject *
3125PyUnicode_AsDecodedObject(PyObject *unicode,
3126                          const char *encoding,
3127                          const char *errors)
3128{
3129    PyObject *v;
3130
3131    if (!PyUnicode_Check(unicode)) {
3132        PyErr_BadArgument();
3133        goto onError;
3134    }
3135
3136    if (encoding == NULL)
3137        encoding = PyUnicode_GetDefaultEncoding();
3138
3139    /* Decode via the codec registry */
3140    v = PyCodec_Decode(unicode, encoding, errors);
3141    if (v == NULL)
3142        goto onError;
3143    return unicode_result(v);
3144
3145  onError:
3146    return NULL;
3147}
3148
3149PyObject *
3150PyUnicode_AsDecodedUnicode(PyObject *unicode,
3151                           const char *encoding,
3152                           const char *errors)
3153{
3154    PyObject *v;
3155
3156    if (!PyUnicode_Check(unicode)) {
3157        PyErr_BadArgument();
3158        goto onError;
3159    }
3160
3161    if (encoding == NULL)
3162        encoding = PyUnicode_GetDefaultEncoding();
3163
3164    /* Decode via the codec registry */
3165    v = PyCodec_Decode(unicode, encoding, errors);
3166    if (v == NULL)
3167        goto onError;
3168    if (!PyUnicode_Check(v)) {
3169        PyErr_Format(PyExc_TypeError,
3170                     "decoder did not return a str object (type=%.400s)",
3171                     Py_TYPE(v)->tp_name);
3172        Py_DECREF(v);
3173        goto onError;
3174    }
3175    return unicode_result(v);
3176
3177  onError:
3178    return NULL;
3179}
3180
3181PyObject *
3182PyUnicode_Encode(const Py_UNICODE *s,
3183                 Py_ssize_t size,
3184                 const char *encoding,
3185                 const char *errors)
3186{
3187    PyObject *v, *unicode;
3188
3189    unicode = PyUnicode_FromUnicode(s, size);
3190    if (unicode == NULL)
3191        return NULL;
3192    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3193    Py_DECREF(unicode);
3194    return v;
3195}
3196
3197PyObject *
3198PyUnicode_AsEncodedObject(PyObject *unicode,
3199                          const char *encoding,
3200                          const char *errors)
3201{
3202    PyObject *v;
3203
3204    if (!PyUnicode_Check(unicode)) {
3205        PyErr_BadArgument();
3206        goto onError;
3207    }
3208
3209    if (encoding == NULL)
3210        encoding = PyUnicode_GetDefaultEncoding();
3211
3212    /* Encode via the codec registry */
3213    v = PyCodec_Encode(unicode, encoding, errors);
3214    if (v == NULL)
3215        goto onError;
3216    return v;
3217
3218  onError:
3219    return NULL;
3220}
3221
/* Find the first character of the null-terminated wide string `wstr`
   that wcstombs() cannot convert, by converting one character at a time.

   When wchar_t is 16 bits wide, a high surrogate followed by a low
   surrogate is converted as a single unit.

   Returns the index (in wchar_t units) of the offending character, or 0
   when every character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3268
3269static int
3270locale_error_handler(const char *errors, int *surrogateescape)
3271{
3272    if (errors == NULL) {
3273        *surrogateescape = 0;
3274        return 0;
3275    }
3276
3277    if (strcmp(errors, "strict") == 0) {
3278        *surrogateescape = 0;
3279        return 0;
3280    }
3281    if (strcmp(errors, "surrogateescape") == 0) {
3282        *surrogateescape = 1;
3283        return 0;
3284    }
3285    PyErr_Format(PyExc_ValueError,
3286                 "only 'strict' and 'surrogateescape' error handlers "
3287                 "are supported, not '%s'",
3288                 errors);
3289    return -1;
3290}
3291
3292PyObject *
3293PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3294{
3295    Py_ssize_t wlen, wlen2;
3296    wchar_t *wstr;
3297    PyObject *bytes = NULL;
3298    char *errmsg;
3299    PyObject *reason;
3300    PyObject *exc;
3301    size_t error_pos;
3302    int surrogateescape;
3303
3304    if (locale_error_handler(errors, &surrogateescape) < 0)
3305        return NULL;
3306
3307    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3308    if (wstr == NULL)
3309        return NULL;
3310
3311    wlen2 = wcslen(wstr);
3312    if (wlen2 != wlen) {
3313        PyMem_Free(wstr);
3314        PyErr_SetString(PyExc_TypeError, "embedded null character");
3315        return NULL;
3316    }
3317
3318    if (surrogateescape) {
3319        /* locale encoding with surrogateescape */
3320        char *str;
3321
3322        str = _Py_wchar2char(wstr, &error_pos);
3323        if (str == NULL) {
3324            if (error_pos == (size_t)-1) {
3325                PyErr_NoMemory();
3326                PyMem_Free(wstr);
3327                return NULL;
3328            }
3329            else {
3330                goto encode_error;
3331            }
3332        }
3333        PyMem_Free(wstr);
3334
3335        bytes = PyBytes_FromString(str);
3336        PyMem_Free(str);
3337    }
3338    else {
3339        size_t len, len2;
3340
3341        len = wcstombs(NULL, wstr, 0);
3342        if (len == (size_t)-1) {
3343            error_pos = (size_t)-1;
3344            goto encode_error;
3345        }
3346
3347        bytes = PyBytes_FromStringAndSize(NULL, len);
3348        if (bytes == NULL) {
3349            PyMem_Free(wstr);
3350            return NULL;
3351        }
3352
3353        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3354        if (len2 == (size_t)-1 || len2 > len) {
3355            error_pos = (size_t)-1;
3356            goto encode_error;
3357        }
3358        PyMem_Free(wstr);
3359    }
3360    return bytes;
3361
3362encode_error:
3363    errmsg = strerror(errno);
3364    assert(errmsg != NULL);
3365
3366    if (error_pos == (size_t)-1)
3367        error_pos = wcstombs_errorpos(wstr);
3368
3369    PyMem_Free(wstr);
3370    Py_XDECREF(bytes);
3371
3372    if (errmsg != NULL) {
3373        size_t errlen;
3374        wstr = _Py_char2wchar(errmsg, &errlen);
3375        if (wstr != NULL) {
3376            reason = PyUnicode_FromWideChar(wstr, errlen);
3377            PyMem_Free(wstr);
3378        } else
3379            errmsg = NULL;
3380    }
3381    if (errmsg == NULL)
3382        reason = PyUnicode_FromString(
3383            "wcstombs() encountered an unencodable "
3384            "wide character");
3385    if (reason == NULL)
3386        return NULL;
3387
3388    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3389                                "locale", unicode,
3390                                (Py_ssize_t)error_pos,
3391                                (Py_ssize_t)(error_pos+1),
3392                                reason);
3393    Py_DECREF(reason);
3394    if (exc != NULL) {
3395        PyCodec_StrictErrors(exc);
3396        Py_XDECREF(exc);
3397    }
3398    return NULL;
3399}
3400
3401PyObject *
3402PyUnicode_EncodeFSDefault(PyObject *unicode)
3403{
3404#ifdef HAVE_MBCS
3405    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3406#elif defined(__APPLE__)
3407    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3408#else
3409    PyInterpreterState *interp = PyThreadState_GET()->interp;
3410    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3411       cannot use it to encode and decode filenames before it is loaded. Load
3412       the Python codec requires to encode at least its own filename. Use the C
3413       version of the locale codec until the codec registry is initialized and
3414       the Python codec is loaded.
3415
3416       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3417       cannot only rely on it: check also interp->fscodec_initialized for
3418       subinterpreters. */
3419    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3420        return PyUnicode_AsEncodedString(unicode,
3421                                         Py_FileSystemDefaultEncoding,
3422                                         "surrogateescape");
3423    }
3424    else {
3425        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3426    }
3427#endif
3428}
3429
3430PyObject *
3431PyUnicode_AsEncodedString(PyObject *unicode,
3432                          const char *encoding,
3433                          const char *errors)
3434{
3435    PyObject *v;
3436    char lower[11];  /* Enough for any encoding shortcut */
3437
3438    if (!PyUnicode_Check(unicode)) {
3439        PyErr_BadArgument();
3440        return NULL;
3441    }
3442
3443    /* Shortcuts for common default encodings */
3444    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3445        if ((strcmp(lower, "utf-8") == 0) ||
3446            (strcmp(lower, "utf8") == 0))
3447        {
3448            if (errors == NULL || strcmp(errors, "strict") == 0)
3449                return _PyUnicode_AsUTF8String(unicode, NULL);
3450            else
3451                return _PyUnicode_AsUTF8String(unicode, errors);
3452        }
3453        else if ((strcmp(lower, "latin-1") == 0) ||
3454                 (strcmp(lower, "latin1") == 0) ||
3455                 (strcmp(lower, "iso-8859-1") == 0))
3456            return _PyUnicode_AsLatin1String(unicode, errors);
3457#ifdef HAVE_MBCS
3458        else if (strcmp(lower, "mbcs") == 0)
3459            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3460#endif
3461        else if (strcmp(lower, "ascii") == 0)
3462            return _PyUnicode_AsASCIIString(unicode, errors);
3463    }
3464
3465    /* Encode via the codec registry */
3466    v = PyCodec_Encode(unicode, encoding, errors);
3467    if (v == NULL)
3468        return NULL;
3469
3470    /* The normal path */
3471    if (PyBytes_Check(v))
3472        return v;
3473
3474    /* If the codec returns a buffer, raise a warning and convert to bytes */
3475    if (PyByteArray_Check(v)) {
3476        int error;
3477        PyObject *b;
3478
3479        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3480            "encoder %s returned bytearray instead of bytes",
3481            encoding);
3482        if (error) {
3483            Py_DECREF(v);
3484            return NULL;
3485        }
3486
3487        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3488        Py_DECREF(v);
3489        return b;
3490    }
3491
3492    PyErr_Format(PyExc_TypeError,
3493                 "encoder did not return a bytes object (type=%.400s)",
3494                 Py_TYPE(v)->tp_name);
3495    Py_DECREF(v);
3496    return NULL;
3497}
3498
3499PyObject *
3500PyUnicode_AsEncodedUnicode(PyObject *unicode,
3501                           const char *encoding,
3502                           const char *errors)
3503{
3504    PyObject *v;
3505
3506    if (!PyUnicode_Check(unicode)) {
3507        PyErr_BadArgument();
3508        goto onError;
3509    }
3510
3511    if (encoding == NULL)
3512        encoding = PyUnicode_GetDefaultEncoding();
3513
3514    /* Encode via the codec registry */
3515    v = PyCodec_Encode(unicode, encoding, errors);
3516    if (v == NULL)
3517        goto onError;
3518    if (!PyUnicode_Check(v)) {
3519        PyErr_Format(PyExc_TypeError,
3520                     "encoder did not return an str object (type=%.400s)",
3521                     Py_TYPE(v)->tp_name);
3522        Py_DECREF(v);
3523        goto onError;
3524    }
3525    return v;
3526
3527  onError:
3528    return NULL;
3529}
3530
/* Find the byte offset of the first multibyte sequence in the `len`
   bytes at `str` that mbrtowc() reports as invalid or incomplete.

   Returns that offset, or 0 when no such sequence is found.  Always
   returns 0 when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3561
3562PyObject*
3563PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3564                              const char *errors)
3565{
3566    wchar_t smallbuf[256];
3567    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3568    wchar_t *wstr;
3569    size_t wlen, wlen2;
3570    PyObject *unicode;
3571    int surrogateescape;
3572    size_t error_pos;
3573    char *errmsg;
3574    PyObject *reason, *exc;
3575
3576    if (locale_error_handler(errors, &surrogateescape) < 0)
3577        return NULL;
3578
3579    if (str[len] != '\0' || len != strlen(str)) {
3580        PyErr_SetString(PyExc_TypeError, "embedded null character");
3581        return NULL;
3582    }
3583
3584    if (surrogateescape)
3585    {
3586        wstr = _Py_char2wchar(str, &wlen);
3587        if (wstr == NULL) {
3588            if (wlen == (size_t)-1)
3589                PyErr_NoMemory();
3590            else
3591                PyErr_SetFromErrno(PyExc_OSError);
3592            return NULL;
3593        }
3594
3595        unicode = PyUnicode_FromWideChar(wstr, wlen);
3596        PyMem_Free(wstr);
3597    }
3598    else {
3599#ifndef HAVE_BROKEN_MBSTOWCS
3600        wlen = mbstowcs(NULL, str, 0);
3601#else
3602        wlen = len;
3603#endif
3604        if (wlen == (size_t)-1)
3605            goto decode_error;
3606        if (wlen+1 <= smallbuf_len) {
3607            wstr = smallbuf;
3608        }
3609        else {
3610            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3611                return PyErr_NoMemory();
3612
3613            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3614            if (!wstr)
3615                return PyErr_NoMemory();
3616        }
3617
3618        /* This shouldn't fail now */
3619        wlen2 = mbstowcs(wstr, str, wlen+1);
3620        if (wlen2 == (size_t)-1) {
3621            if (wstr != smallbuf)
3622                PyMem_Free(wstr);
3623            goto decode_error;
3624        }
3625#ifdef HAVE_BROKEN_MBSTOWCS
3626        assert(wlen2 == wlen);
3627#endif
3628        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3629        if (wstr != smallbuf)
3630            PyMem_Free(wstr);
3631    }
3632    return unicode;
3633
3634decode_error:
3635    errmsg = strerror(errno);
3636    assert(errmsg != NULL);
3637
3638    error_pos = mbstowcs_errorpos(str, len);
3639    if (errmsg != NULL) {
3640        size_t errlen;
3641        wstr = _Py_char2wchar(errmsg, &errlen);
3642        if (wstr != NULL) {
3643            reason = PyUnicode_FromWideChar(wstr, errlen);
3644            PyMem_Free(wstr);
3645        } else
3646            errmsg = NULL;
3647    }
3648    if (errmsg == NULL)
3649        reason = PyUnicode_FromString(
3650            "mbstowcs() encountered an invalid multibyte sequence");
3651    if (reason == NULL)
3652        return NULL;
3653
3654    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3655                                "locale", str, len,
3656                                (Py_ssize_t)error_pos,
3657                                (Py_ssize_t)(error_pos+1),
3658                                reason);
3659    Py_DECREF(reason);
3660    if (exc != NULL) {
3661        PyCodec_StrictErrors(exc);
3662        Py_XDECREF(exc);
3663    }
3664    return NULL;
3665}
3666
3667PyObject*
3668PyUnicode_DecodeLocale(const char *str, const char *errors)
3669{
3670    Py_ssize_t size = (Py_ssize_t)strlen(str);
3671    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3672}
3673
3674
3675PyObject*
3676PyUnicode_DecodeFSDefault(const char *s) {
3677    Py_ssize_t size = (Py_ssize_t)strlen(s);
3678    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3679}
3680
3681PyObject*
3682PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3683{
3684#ifdef HAVE_MBCS
3685    return PyUnicode_DecodeMBCS(s, size, NULL);
3686#elif defined(__APPLE__)
3687    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3688#else
3689    PyInterpreterState *interp = PyThreadState_GET()->interp;
3690    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3691       cannot use it to encode and decode filenames before it is loaded. Load
3692       the Python codec requires to encode at least its own filename. Use the C
3693       version of the locale codec until the codec registry is initialized and
3694       the Python codec is loaded.
3695
3696       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3697       cannot only rely on it: check also interp->fscodec_initialized for
3698       subinterpreters. */
3699    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3700        return PyUnicode_Decode(s, size,
3701                                Py_FileSystemDefaultEncoding,
3702                                "surrogateescape");
3703    }
3704    else {
3705        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3706    }
3707#endif
3708}
3709
3710
3711int
3712_PyUnicode_HasNULChars(PyObject* s)
3713{
3714    static PyObject *nul = NULL;
3715
3716    if (nul == NULL)
3717        nul = PyUnicode_FromStringAndSize("\0", 1);
3718    if (nul == NULL)
3719        return -1;
3720    return PyUnicode_Contains(s, nul);
3721}
3722
3723
3724int
3725PyUnicode_FSConverter(PyObject* arg, void* addr)
3726{
3727    PyObject *output = NULL;
3728    Py_ssize_t size;
3729    void *data;
3730    if (arg == NULL) {
3731        Py_DECREF(*(PyObject**)addr);
3732        return 1;
3733    }
3734    if (PyBytes_Check(arg)) {
3735        output = arg;
3736        Py_INCREF(output);
3737    }
3738    else {
3739        arg = PyUnicode_FromObject(arg);
3740        if (!arg)
3741            return 0;
3742        output = PyUnicode_EncodeFSDefault(arg);
3743        Py_DECREF(arg);
3744        if (!output)
3745            return 0;
3746        if (!PyBytes_Check(output)) {
3747            Py_DECREF(output);
3748            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3749            return 0;
3750        }
3751    }
3752    size = PyBytes_GET_SIZE(output);
3753    data = PyBytes_AS_STRING(output);
3754    if (size != strlen(data)) {
3755        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3756        Py_DECREF(output);
3757        return 0;
3758    }
3759    *(PyObject**)addr = output;
3760    return Py_CLEANUP_SUPPORTED;
3761}
3762
3763
3764int
3765PyUnicode_FSDecoder(PyObject* arg, void* addr)
3766{
3767    PyObject *output = NULL;
3768    if (arg == NULL) {
3769        Py_DECREF(*(PyObject**)addr);
3770        return 1;
3771    }
3772    if (PyUnicode_Check(arg)) {
3773        if (PyUnicode_READY(arg) == -1)
3774            return 0;
3775        output = arg;
3776        Py_INCREF(output);
3777    }
3778    else {
3779        arg = PyBytes_FromObject(arg);
3780        if (!arg)
3781            return 0;
3782        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3783                                                  PyBytes_GET_SIZE(arg));
3784        Py_DECREF(arg);
3785        if (!output)
3786            return 0;
3787        if (!PyUnicode_Check(output)) {
3788            Py_DECREF(output);
3789            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3790            return 0;
3791        }
3792    }
3793    if (PyUnicode_READY(output) == -1) {
3794        Py_DECREF(output);
3795        return 0;
3796    }
3797    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3798                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3799        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3800        Py_DECREF(output);
3801        return 0;
3802    }
3803    *(PyObject**)addr = output;
3804    return Py_CLEANUP_SUPPORTED;
3805}
3806
3807
3808char*
3809PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3810{
3811    PyObject *bytes;
3812
3813    if (!PyUnicode_Check(unicode)) {
3814        PyErr_BadArgument();
3815        return NULL;
3816    }
3817    if (PyUnicode_READY(unicode) == -1)
3818        return NULL;
3819
3820    if (PyUnicode_UTF8(unicode) == NULL) {
3821        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3822        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3823        if (bytes == NULL)
3824            return NULL;
3825        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3826        if (_PyUnicode_UTF8(unicode) == NULL) {
3827            Py_DECREF(bytes);
3828            return NULL;
3829        }
3830        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3831        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3832                  PyBytes_AS_STRING(bytes),
3833                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3834        Py_DECREF(bytes);
3835    }
3836
3837    if (psize)
3838        *psize = PyUnicode_UTF8_LENGTH(unicode);
3839    return PyUnicode_UTF8(unicode);
3840}
3841
3842char*
3843PyUnicode_AsUTF8(PyObject *unicode)
3844{
3845    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3846}
3847
/* Debug builds only: counts how many times PyUnicode_AsUnicodeAndSize()
   had to build the wchar_t representation of a string. */
3848#ifdef Py_DEBUG
3849static int unicode_as_unicode_calls = 0;
3850#endif
3851
3852
/* Return the wchar_t (Py_UNICODE) representation of `unicode`.

   If the object has no wstr buffer yet, one is allocated with
   PyObject_MALLOC, filled from the object's 1-, 2- or 4-byte data,
   NUL-terminated and stored in the object's wstr slot.  When wchar_t is
   2 bytes wide, code points above 0xFFFF are written as surrogate pairs.

   If `size` is non-NULL it receives PyUnicode_WSTR_LENGTH(unicode).

   Returns NULL with an exception set if `unicode` is not a str object
   (PyErr_BadArgument) or if the buffer cannot be allocated
   (PyErr_NoMemory). */
3853Py_UNICODE *
3854PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3855{
3856    const unsigned char *one_byte;
3857#if SIZEOF_WCHAR_T == 4
3858    const Py_UCS2 *two_bytes;
3859#else
3860    const Py_UCS4 *four_bytes;
3861    const Py_UCS4 *ucs4_end;
3862    Py_ssize_t num_surrogates;
3863#endif
3864    wchar_t *w;
3865    wchar_t *wchar_end;
3866
3867    if (!PyUnicode_Check(unicode)) {
3868        PyErr_BadArgument();
3869        return NULL;
3870    }
    /* No wchar_t buffer yet: build one from the object's data. */
3871    if (_PyUnicode_WSTR(unicode) == NULL) {
3872        /* Non-ASCII compact unicode object */
3873        assert(_PyUnicode_KIND(unicode) != 0);
3874        assert(PyUnicode_IS_READY(unicode));
3875
3876#ifdef Py_DEBUG
3877        ++unicode_as_unicode_calls;
3878#endif
3879
3880        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3881#if SIZEOF_WCHAR_T == 2
3882            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3883            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3884            num_surrogates = 0;
3885
            /* First pass: count the code points above 0xFFFF; each of them
               takes one extra wchar_t (a surrogate pair). */
3886            for (; four_bytes < ucs4_end; ++four_bytes) {
3887                if (*four_bytes > 0xFFFF)
3888                    ++num_surrogates;
3889            }
3890
            /* Room for every code point, the extra surrogates and the
               terminating NUL. */
3891            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3892                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3893            if (!_PyUnicode_WSTR(unicode)) {
3894                PyErr_NoMemory();
3895                return NULL;
3896            }
3897            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3898
            /* Second pass: copy, splitting code points above 0xFFFF into a
               high and a low surrogate. */
3899            w = _PyUnicode_WSTR(unicode);
3900            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3901            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3902            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3903                if (*four_bytes > 0xFFFF) {
3904                    assert(*four_bytes <= MAX_UNICODE);
3905                    /* encode surrogate pair in this case */
3906                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3907                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3908                }
3909                else
3910                    *w = *four_bytes;
3911
3912                if (w > wchar_end) {
3913                    assert(0 && "Miscalculated string end");
3914                }
3915            }
3916            *w = 0;
3917#else
3918            /* sizeof(wchar_t) == 4 */
3919            Py_FatalError("Impossible unicode object state, wstr and str "
3920                          "should share memory already.");
3921            return NULL;
3922#endif
3923        }
3924        else {
            /* 1- or 2-byte data: one wchar_t per character plus the
               terminating NUL. */
3925            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3926                                                  (_PyUnicode_LENGTH(unicode) + 1));
3927            if (!_PyUnicode_WSTR(unicode)) {
3928                PyErr_NoMemory();
3929                return NULL;
3930            }
            /* The wstr length is only stored here for objects that are not
               compact ASCII. */
3931            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3932                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3933            w = _PyUnicode_WSTR(unicode);
3934            wchar_end = w + _PyUnicode_LENGTH(unicode);
3935
3936            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3937                one_byte = PyUnicode_1BYTE_DATA(unicode);
3938                for (; w < wchar_end; ++one_byte, ++w)
3939                    *w = *one_byte;
3940                /* null-terminate the wstr */
3941                *w = 0;
3942            }
3943            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3944#if SIZEOF_WCHAR_T == 4
3945                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3946                for (; w < wchar_end; ++two_bytes, ++w)
3947                    *w = *two_bytes;
3948                /* null-terminate the wstr */
3949                *w = 0;
3950#else
3951                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before reporting the
                   fatal error. */
3952                PyObject_FREE(_PyUnicode_WSTR(unicode));
3953                _PyUnicode_WSTR(unicode) = NULL;
3954                Py_FatalError("Impossible unicode object state, wstr "
3955                              "and str should share memory already.");
3956                return NULL;
3957#endif
3958            }
3959            else {
3960                assert(0 && "This should never happen.");
3961            }
3962        }
3963    }
    /* Report the wstr length if the caller asked for it. */
3964    if (size != NULL)
3965        *size = PyUnicode_WSTR_LENGTH(unicode);
3966    return _PyUnicode_WSTR(unicode);
3967}
3968
3969Py_UNICODE *
3970PyUnicode_AsUnicode(PyObject *unicode)
3971{
3972    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3973}
3974
3975
3976Py_ssize_t
3977PyUnicode_GetSize(PyObject *unicode)
3978{
3979    if (!PyUnicode_Check(unicode)) {
3980        PyErr_BadArgument();
3981        goto onError;
3982    }
3983    return PyUnicode_GET_SIZE(unicode);
3984
3985  onError:
3986    return -1;
3987}
3988
3989Py_ssize_t
3990PyUnicode_GetLength(PyObject *unicode)
3991{
3992    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3993        PyErr_BadArgument();
3994        return -1;
3995    }
3996
3997    return PyUnicode_GET_LENGTH(unicode);
3998}
3999
4000Py_UCS4
4001PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4002{
4003    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4004        PyErr_BadArgument();
4005        return (Py_UCS4)-1;
4006    }
4007    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4008        PyErr_SetString(PyExc_IndexError, "string index out of range");
4009        return (Py_UCS4)-1;
4010    }
4011    return PyUnicode_READ_CHAR(unicode, index);
4012}
4013
4014int
4015PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4016{
4017    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4018        PyErr_BadArgument();
4019        return -1;
4020    }
4021    assert(PyUnicode_IS_READY(unicode));
4022    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4023        PyErr_SetString(PyExc_IndexError, "string index out of range");
4024        return -1;
4025    }
4026    if (unicode_check_modifiable(unicode))
4027        return -1;
4028    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4029        PyErr_SetString(PyExc_ValueError, "character out of range");
4030        return -1;
4031    }
4032    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4033                    index, ch);
4034    return 0;
4035}
4036
/* Return the name of the default encoding, which is always the
   NUL-terminated string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4042
4043/* create or adjust a UnicodeDecodeError */
4044static void
4045make_decode_exception(PyObject **exceptionObject,
4046                      const char *encoding,
4047                      const char *input, Py_ssize_t length,
4048                      Py_ssize_t startpos, Py_ssize_t endpos,
4049                      const char *reason)
4050{
4051    if (*exceptionObject == NULL) {
4052        *exceptionObject = PyUnicodeDecodeError_Create(
4053            encoding, input, length, startpos, endpos, reason);
4054    }
4055    else {
4056        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4057            goto onError;
4058        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4059            goto onError;
4060        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4061            goto onError;
4062    }
4063    return;
4064
4065onError:
4066    Py_DECREF(*exceptionObject);
4067    *exceptionObject = NULL;
4068}
4069
4070/* error handling callback helper:
4071   build arguments, call the callback and check the arguments,
4072   if no exception occurred, copy the replacement to the output
4073   and adjust various state variables.
4074   return 0 on success, -1 on error
4075*/
4076
4077static int
4078unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
4079                                 const char *encoding, const char *reason,
4080                                 const char **input, const char **inend, Py_ssize_t *startinpos,
4081                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4082                                 PyObject **output, Py_ssize_t *outpos)
4083{
4084    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4085
4086    PyObject *restuple = NULL;
4087    PyObject *repunicode = NULL;
4088    Py_ssize_t outsize;
4089    Py_ssize_t insize;
4090    Py_ssize_t requiredsize;
4091    Py_ssize_t newpos;
4092    PyObject *inputobj = NULL;
4093    int res = -1;
4094
4095    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4096        outsize = PyUnicode_GET_LENGTH(*output);
4097    else
4098        outsize = _PyUnicode_WSTR_LENGTH(*output);
4099
4100    if (*errorHandler == NULL) {
4101        *errorHandler = PyCodec_LookupError(errors);
4102        if (*errorHandler == NULL)
4103            goto onError;
4104    }
4105
4106    make_decode_exception(exceptionObject,
4107        encoding,
4108        *input, *inend - *input,
4109        *startinpos, *endinpos,
4110        reason);
4111    if (*exceptionObject == NULL)
4112        goto onError;
4113
4114    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4115    if (restuple == NULL)
4116        goto onError;
4117    if (!PyTuple_Check(restuple)) {
4118        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4119        goto onError;
4120    }
4121    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4122        goto onError;
4123    if (PyUnicode_READY(repunicode) == -1)
4124        goto onError;
4125
4126    /* Copy back the bytes variables, which might have been modified by the
4127       callback */
4128    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4129    if (!inputobj)
4130        goto onError;
4131    if (!PyBytes_Check(inputobj)) {
4132        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4133    }
4134    *input = PyBytes_AS_STRING(inputobj);
4135    insize = PyBytes_GET_SIZE(inputobj);
4136    *inend = *input + insize;
4137    /* we can DECREF safely, as the exception has another reference,
4138       so the object won't go away. */
4139    Py_DECREF(inputobj);
4140
4141    if (newpos<0)
4142        newpos = insize+newpos;
4143    if (newpos<0 || newpos>insize) {
4144        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4145        goto onError;
4146    }
4147
4148    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4149        /* need more space? (at least enough for what we
4150           have+the replacement+the rest of the string (starting
4151           at the new input position), so we won't have to check space
4152           when there are no errors in the rest of the string) */
4153        Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4154        requiredsize = *outpos + replen + insize-newpos;
4155        if (requiredsize > outsize) {
4156            if (requiredsize<2*outsize)
4157                requiredsize = 2*outsize;
4158            if (unicode_resize(output, requiredsize) < 0)
4159                goto onError;
4160        }
4161        if (unicode_widen(output, *outpos,
4162                          PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
4163            goto onError;
4164        _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
4165        *outpos += replen;
4166    }
4167    else {
4168        wchar_t *repwstr;
4169        Py_ssize_t repwlen;
4170        repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4171        if (repwstr == NULL)
4172            goto onError;
4173        /* need more space? (at least enough for what we
4174           have+the replacement+the rest of the string (starting
4175           at the new input position), so we won't have to check space
4176           when there are no errors in the rest of the string) */
4177        requiredsize = *outpos + repwlen + insize-newpos;
4178        if (requiredsize > outsize) {
4179            if (requiredsize < 2*outsize)
4180                requiredsize = 2*outsize;
4181            if (unicode_resize(output, requiredsize) < 0)
4182                goto onError;
4183        }
4184        wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4185        *outpos += repwlen;
4186    }
4187    *endinpos = newpos;
4188    *inptr = *input + newpos;
4189
4190    /* we made it! */
4191    res = 0;
4192
4193  onError:
4194    Py_XDECREF(restuple);
4195    return res;
4196}
4197
4198/* --- UTF-7 Codec -------------------------------------------------------- */
4199
4200/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4201
4202/* Three simple macros defining base-64. */
4203
4204/* Is c a base-64 character? */
4205
/* NOTE: IS_BASE64, FROM_BASE64 and DECODE_DIRECT expand their argument
   more than once, so only pass side-effect-free expressions.
   FROM_BASE64 maps every character other than A-Z, a-z, 0-9 and '+' to
   63; its result is only meaningful for characters IS_BASE64 accepts. */
4206#define IS_BASE64(c) \
4207    (((c) >= 'A' && (c) <= 'Z') ||     \
4208     ((c) >= 'a' && (c) <= 'z') ||     \
4209     ((c) >= '0' && (c) <= '9') ||     \
4210     (c) == '+' || (c) == '/')
4211
4212/* given that c is a base-64 character, what is its base-64 value? */
4213
4214#define FROM_BASE64(c)                                                  \
4215    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
4216     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
4217     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
4218     (c) == '+' ? 62 : 63)
4219
4220/* What is the base-64 character of the bottom 6 bits of n? */
4221
4222#define TO_BASE64(n)  \
4223    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4224
4225/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4226 * decoded as itself.  We are permissive on decoding; the only ASCII
4227 * byte not decoding to itself is the + which begins a base64
4228 * string. */
4229
4230#define DECODE_DIRECT(c)                                \
4231    ((c) <= 127 && (c) != '+')
4232
4233/* The UTF-7 encoder treats ASCII characters differently according to
4234 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4235 * the above).  See RFC2152.  This array identifies these different
4236 * sets:
4237 * 0 : "Set D"
4238 *     alphanumeric and '(),-./:?
4239 * 1 : "Set O"
4240 *     !"#$%&*;<=>@[]^_`{|}
4241 * 2 : "whitespace"
4242 *     ht nl cr sp
4243 * 3 : special (must be base64 encoded)
4244 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4245 */
4246
/* Lookup table indexed by 7-bit ASCII code giving the UTF-7 encoder
   category of each character: 0 = Set D, 1 = Set O, 2 = whitespace,
   3 = special (must be base64 encoded).  The table is only read, never
   written, so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4266
4267/* ENCODE_DIRECT: this character should be encoded as itself.  The
4268 * answer depends on whether we are encoding set O as itself, and also
4269 * on whether we are encoding whitespace as itself.  RFC2152 makes it
4270 * clear that the answers to these questions vary between
4271 * applications, so this code needs to be flexible.  */
4272
/* c is only looked up in utf7_category when 0 < c < 128, so NUL and
   non-ASCII characters are never direct.  Category 0 is always direct;
   category 2 only when directWS is true and category 1 only when directO
   is true.  The arguments are expanded more than once, so pass
   side-effect-free expressions. */
4273#define ENCODE_DIRECT(c, directO, directWS)             \
4274    ((c) < 128 && (c) > 0 &&                            \
4275     ((utf7_category[(c)] == 0) ||                      \
4276      (directWS && (utf7_category[(c)] == 2)) ||        \
4277      (directO && (utf7_category[(c)] == 1))))
4278
4279PyObject *
4280PyUnicode_DecodeUTF7(const char *s,
4281                     Py_ssize_t size,
4282                     const char *errors)
4283{
4284    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4285}
4286
4287/* The decoder.  The only state we preserve is our read position,
4288 * i.e. how many characters we have consumed.  So if we end in the
4289 * middle of a shift sequence we have to back off the read position
4290 * and the output to the beginning of the sequence, otherwise we lose
4291 * all the shift state (seen bits, number of bits seen, high
4292 * surrogate). */
4293
/* Decode `size` bytes of UTF-7 data starting at `s`.

   errors:   name of the error handler, passed to
             unicode_decode_call_errorhandler() when malformed input is
             found.
   consumed: if non-NULL, receives the number of input bytes consumed.
             When the input ends inside a shift (base-64) sequence, the
             output is cut back to where that sequence started and
             *consumed is set to the position of its '+'.  If NULL, ending
             inside a shift sequence with leftover state is reported as
             "unterminated shift sequence".

   The result starts as a `size`-character string with maximum character
   127, grows through unicode_putchar() and the error handler, and is
   resized to the number of characters written.  Returns NULL on error. */
4294PyObject *
4295PyUnicode_DecodeUTF7Stateful(const char *s,
4296                             Py_ssize_t size,
4297                             const char *errors,
4298                             Py_ssize_t *consumed)
4299{
4300    const char *starts = s;
4301    Py_ssize_t startinpos;
4302    Py_ssize_t endinpos;
4303    Py_ssize_t outpos;
4304    const char *e;
4305    PyObject *unicode;
4306    const char *errmsg = "";
4307    int inShift = 0;
4308    Py_ssize_t shiftOutStart;
4309    unsigned int base64bits = 0;
4310    unsigned long base64buffer = 0;
4311    Py_UCS4 surrogate = 0;
4312    PyObject *errorHandler = NULL;
4313    PyObject *exc = NULL;
4314
4315    /* Start off assuming it's all ASCII. Widen later as necessary. */
4316    unicode = PyUnicode_New(size, 127);
4317    if (!unicode)
4318        return NULL;
4319    if (size == 0) {
4320        if (consumed)
4321            *consumed = 0;
4322        return unicode;
4323    }
4324
4325    shiftOutStart = outpos = 0;
4326    e = s + size;
4327
4328    while (s < e) {
4329        Py_UCS4 ch;
        /* Re-entry point used after the end-of-input error handler moved
           the read position back inside the input. */
4330      restart:
4331        ch = (unsigned char) *s;
4332
4333        if (inShift) { /* in a base-64 section */
4334            if (IS_BASE64(ch)) { /* consume a base-64 character */
4335                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4336                base64bits += 6;
4337                s++;
4338                if (base64bits >= 16) {
4339                    /* we have enough bits for a UTF-16 value */
4340                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4341                    base64bits -= 16;
4342                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4343                    if (surrogate) {
4344                        /* expecting a second surrogate */
4345                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4346                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4347                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4348                                goto onError;
4349                            surrogate = 0;
4350                            continue;
4351                        }
4352                        else {
                            /* Not a low surrogate: emit the pending high
                               surrogate on its own, then handle outCh
                               below. */
4353                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4354                                goto onError;
4355                            surrogate = 0;
4356                        }
4357                    }
4358                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4359                        /* first surrogate */
4360                        surrogate = outCh;
4361                    }
4362                    else {
4363                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4364                            goto onError;
4365                    }
4366                }
4367            }
4368            else { /* now leaving a base-64 section */
4369                inShift = 0;
4370                s++;
4371                if (surrogate) {
4372                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4373                        goto onError;
4374                    surrogate = 0;
4375                }
4376                if (base64bits > 0) { /* left-over bits */
4377                    if (base64bits >= 6) {
4378                        /* We've seen at least one base-64 character */
4379                        errmsg = "partial character in shift sequence";
4380                        goto utf7Error;
4381                    }
4382                    else {
4383                        /* Some bits remain; they should be zero */
4384                        if (base64buffer != 0) {
4385                            errmsg = "non-zero padding bits in shift sequence";
4386                            goto utf7Error;
4387                        }
4388                    }
4389                }
4390                if (ch != '-') {
4391                    /* '-' is absorbed; other terminating
4392                       characters are preserved */
4393                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
4394                        goto onError;
4395                }
4396            }
4397        }
4398        else if ( ch == '+' ) {
            /* Remember where the '+' is: it is the start position used
               for errors and for *consumed. */
4399            startinpos = s-starts;
4400            s++; /* consume '+' */
4401            if (s < e && *s == '-') { /* '+-' encodes '+' */
4402                s++;
4403                if (unicode_putchar(&unicode, &outpos, '+') < 0)
4404                    goto onError;
4405            }
4406            else { /* begin base64-encoded section */
4407                inShift = 1;
4408                shiftOutStart = outpos;
4409                base64bits = 0;
4410            }
4411        }
4412        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4413            if (unicode_putchar(&unicode, &outpos, ch) < 0)
4414                goto onError;
4415            s++;
4416        }
4417        else {
4418            startinpos = s-starts;
4419            s++;
4420            errmsg = "unexpected special character";
4421            goto utf7Error;
4422        }
4423        continue;
        /* Common error path: report the bytes from startinpos up to the
           current read position to the error handler. */
4424utf7Error:
4425        endinpos = s-starts;
4426        if (unicode_decode_call_errorhandler(
4427                errors, &errorHandler,
4428                "utf7", errmsg,
4429                &starts, &e, &startinpos, &endinpos, &exc, &s,
4430                &unicode, &outpos))
4431            goto onError;
4432    }
4433
4434    /* end of string */
4435
4436    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4437        /* if we're in an inconsistent state, that's an error */
4438        if (surrogate ||
4439                (base64bits >= 6) ||
4440                (base64bits > 0 && base64buffer != 0)) {
4441            endinpos = size;
4442            if (unicode_decode_call_errorhandler(
4443                    errors, &errorHandler,
4444                    "utf7", "unterminated shift sequence",
4445                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4446                    &unicode, &outpos))
4447                goto onError;
            /* The handler may have set the read position before the end
               of the input; if so, resume decoding from there. */
4448            if (s < e)
4449                goto restart;
4450        }
4451    }
4452
4453    /* return state */
4454    if (consumed) {
4455        if (inShift) {
4456            outpos = shiftOutStart; /* back off output */
4457            *consumed = startinpos;
4458        }
4459        else {
4460            *consumed = s-starts;
4461        }
4462    }
4463
    /* Set the result to the number of characters actually written. */
4464    if (unicode_resize(&unicode, outpos) < 0)
4465        goto onError;
4466
4467    Py_XDECREF(errorHandler);
4468    Py_XDECREF(exc);
4469    return unicode_result(unicode);
4470
4471  onError:
4472    Py_XDECREF(errorHandler);
4473    Py_XDECREF(exc);
4474    Py_DECREF(unicode);
4475    return NULL;
4476}
4477
4478
4479PyObject *
4480_PyUnicode_EncodeUTF7(PyObject *str,
4481                      int base64SetO,
4482                      int base64WhiteSpace,
4483                      const char *errors)
4484{
4485    int kind;
4486    void *data;
4487    Py_ssize_t len;
4488    PyObject *v;
4489    Py_ssize_t allocated;
4490    int inShift = 0;
4491    Py_ssize_t i;
4492    unsigned int base64bits = 0;
4493    unsigned long base64buffer = 0;
4494    char * out;
4495    char * start;
4496
4497    if (PyUnicode_READY(str) == -1)
4498        return NULL;
4499    kind = PyUnicode_KIND(str);
4500    data = PyUnicode_DATA(str);
4501    len = PyUnicode_GET_LENGTH(str);
4502
4503    if (len == 0)
4504        return PyBytes_FromStringAndSize(NULL, 0);
4505
4506    /* It might be possible to tighten this worst case */
4507    allocated = 8 * len;
4508    if (allocated / 8 != len)
4509        return PyErr_NoMemory();
4510
4511    v = PyBytes_FromStringAndSize(NULL, allocated);
4512    if (v == NULL)
4513        return NULL;
4514
4515    start = out = PyBytes_AS_STRING(v);
4516    for (i = 0; i < len; ++i) {
4517        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4518
4519        if (inShift) {
4520            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4521                /* shifting out */
4522                if (base64bits) { /* output remaining bits */
4523                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4524                    base64buffer = 0;
4525                    base64bits = 0;
4526                }
4527                inShift = 0;
4528                /* Characters not in the BASE64 set implicitly unshift the sequence
4529                   so no '-' is required, except if the character is itself a '-' */
4530                if (IS_BASE64(ch) || ch == '-') {
4531                    *out++ = '-';
4532                }
4533                *out++ = (char) ch;
4534            }
4535            else {
4536                goto encode_char;
4537            }
4538        }
4539        else { /* not in a shift sequence */
4540            if (ch == '+') {
4541                *out++ = '+';
4542                        *out++ = '-';
4543            }
4544            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4545                *out++ = (char) ch;
4546            }
4547            else {
4548                *out++ = '+';
4549                inShift = 1;
4550                goto encode_char;
4551            }
4552        }
4553        continue;
4554encode_char:
4555        if (ch >= 0x10000) {
4556            assert(ch <= MAX_UNICODE);
4557
4558            /* code first surrogate */
4559            base64bits += 16;
4560            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4561            while (base64bits >= 6) {
4562                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4563                base64bits -= 6;
4564            }
4565            /* prepare second surrogate */
4566            ch = Py_UNICODE_LOW_SURROGATE(ch);
4567        }
4568        base64bits += 16;
4569        base64buffer = (base64buffer << 16) | ch;
4570        while (base64bits >= 6) {
4571            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4572            base64bits -= 6;
4573        }
4574    }
4575    if (base64bits)
4576        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4577    if (inShift)
4578        *out++ = '-';
4579    if (_PyBytes_Resize(&v, out - start) < 0)
4580        return NULL;
4581    return v;
4582}
4583PyObject *
4584PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4585                     Py_ssize_t size,
4586                     int base64SetO,
4587                     int base64WhiteSpace,
4588                     const char *errors)
4589{
4590    PyObject *result;
4591    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4592    if (tmp == NULL)
4593        return NULL;
4594    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4595                                   base64WhiteSpace, errors);
4596    Py_DECREF(tmp);
4597    return result;
4598}
4599
4600#undef IS_BASE64
4601#undef FROM_BASE64
4602#undef TO_BASE64
4603#undef DECODE_DIRECT
4604#undef ENCODE_DIRECT
4605
4606/* --- UTF-8 Codec -------------------------------------------------------- */
4607
4608PyObject *
4609PyUnicode_DecodeUTF8(const char *s,
4610                     Py_ssize_t size,
4611                     const char *errors)
4612{
4613    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4614}
4615
4616#include "stringlib/asciilib.h"
4617#include "stringlib/codecs.h"
4618#include "stringlib/undef.h"
4619
4620#include "stringlib/ucs1lib.h"
4621#include "stringlib/codecs.h"
4622#include "stringlib/undef.h"
4623
4624#include "stringlib/ucs2lib.h"
4625#include "stringlib/codecs.h"
4626#include "stringlib/undef.h"
4627
4628#include "stringlib/ucs4lib.h"
4629#include "stringlib/codecs.h"
4630#include "stringlib/undef.h"
4631
4632/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4633#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4634
4635/* Mask to quickly check whether a C 'long' contains a
4636   non-ASCII, UTF8-encoded char. */
4637#if (SIZEOF_LONG == 8)
4638# define ASCII_CHAR_MASK 0x8080808080808080L
4639#elif (SIZEOF_LONG == 4)
4640# define ASCII_CHAR_MASK 0x80808080L
4641#else
4642# error C 'long' size should be either 4 or 8!
4643#endif
4644
4645static Py_ssize_t
4646ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4647{
4648    const char *p = start;
4649    const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
4650
4651#if SIZEOF_LONG <= SIZEOF_VOID_P
4652    assert(!((size_t) dest & LONG_PTR_MASK));
4653    if (!((size_t) p & LONG_PTR_MASK)) {
4654        /* Fast path, see in STRINGLIB(utf8_decode) for
4655           an explanation. */
4656        /* Help register allocation */
4657        register const char *_p = p;
4658        register Py_UCS1 * q = dest;
4659        while (_p < aligned_end) {
4660            unsigned long value = *(const unsigned long *) _p;
4661            if (value & ASCII_CHAR_MASK)
4662                break;
4663            *((unsigned long *)q) = value;
4664            _p += SIZEOF_LONG;
4665            q += SIZEOF_LONG;
4666        }
4667        p = _p;
4668        while (p < end) {
4669            if ((unsigned char)*p & 0x80)
4670                break;
4671            *q++ = *p++;
4672        }
4673        return p - start;
4674    }
4675#endif
4676    while (p < end) {
4677        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4678           for an explanation. */
4679        if (!((size_t) p & LONG_PTR_MASK)) {
4680            /* Help register allocation */
4681            register const char *_p = p;
4682            while (_p < aligned_end) {
4683                unsigned long value = *(unsigned long *) _p;
4684                if (value & ASCII_CHAR_MASK)
4685                    break;
4686                _p += SIZEOF_LONG;
4687            }
4688            p = _p;
4689            if (_p == end)
4690                break;
4691        }
4692        if ((unsigned char)*p & 0x80)
4693            break;
4694        ++p;
4695    }
4696    memcpy(dest, start, p - start);
4697    return p - start;
4698}
4699
4700PyObject *
4701PyUnicode_DecodeUTF8Stateful(const char *s,
4702                             Py_ssize_t size,
4703                             const char *errors,
4704                             Py_ssize_t *consumed)
4705{
4706    PyObject *unicode;
4707    const char *starts = s;
4708    const char *end = s + size;
4709    Py_ssize_t outpos;
4710
4711    Py_ssize_t startinpos;
4712    Py_ssize_t endinpos;
4713    const char *errmsg = "";
4714    PyObject *errorHandler = NULL;
4715    PyObject *exc = NULL;
4716
4717    if (size == 0) {
4718        if (consumed)
4719            *consumed = 0;
4720        Py_INCREF(unicode_empty);
4721        return unicode_empty;
4722    }
4723
4724    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4725    if (size == 1 && (unsigned char)s[0] < 128) {
4726        if (consumed)
4727            *consumed = 1;
4728        return get_latin1_char((unsigned char)s[0]);
4729    }
4730
4731    unicode = PyUnicode_New(size, 127);
4732    if (!unicode)
4733        return NULL;
4734
4735    outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4736    s += outpos;
4737    while (s < end) {
4738        Py_UCS4 ch;
4739        int kind = PyUnicode_KIND(unicode);
4740        if (kind == PyUnicode_1BYTE_KIND) {
4741            if (PyUnicode_IS_ASCII(unicode))
4742                ch = asciilib_utf8_decode(&s, end,
4743                        PyUnicode_1BYTE_DATA(unicode), &outpos);
4744            else
4745                ch = ucs1lib_utf8_decode(&s, end,
4746                        PyUnicode_1BYTE_DATA(unicode), &outpos);
4747        } else if (kind == PyUnicode_2BYTE_KIND) {
4748            ch = ucs2lib_utf8_decode(&s, end,
4749                    PyUnicode_2BYTE_DATA(unicode), &outpos);
4750        } else {
4751            assert(kind == PyUnicode_4BYTE_KIND);
4752            ch = ucs4lib_utf8_decode(&s, end,
4753                    PyUnicode_4BYTE_DATA(unicode), &outpos);
4754        }
4755
4756        switch (ch) {
4757        case 0:
4758            if (s == end || consumed)
4759                goto End;
4760            errmsg = "unexpected end of data";
4761            startinpos = s - starts;
4762            endinpos = startinpos + 1;
4763            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4764                endinpos++;
4765            break;
4766        case 1:
4767            errmsg = "invalid start byte";
4768            startinpos = s - starts;
4769            endinpos = startinpos + 1;
4770            break;
4771        case 2:
4772            errmsg = "invalid continuation byte";
4773            startinpos = s - starts;
4774            endinpos = startinpos + 1;
4775            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4776                endinpos++;
4777            break;
4778        default:
4779            if (unicode_putchar(&unicode, &outpos, ch) < 0)
4780                goto onError;
4781            continue;
4782        }
4783
4784        if (unicode_decode_call_errorhandler(
4785                errors, &errorHandler,
4786                "utf-8", errmsg,
4787                &starts, &end, &startinpos, &endinpos, &exc, &s,
4788                &unicode, &outpos))
4789            goto onError;
4790    }
4791
4792End:
4793    if (unicode_resize(&unicode, outpos) < 0)
4794        goto onError;
4795
4796    if (consumed)
4797        *consumed = s - starts;
4798
4799    Py_XDECREF(errorHandler);
4800    Py_XDECREF(exc);
4801    assert(_PyUnicode_CheckConsistency(unicode, 1));
4802    return unicode;
4803
4804onError:
4805    Py_XDECREF(errorHandler);
4806    Py_XDECREF(exc);
4807    Py_XDECREF(unicode);
4808    return NULL;
4809}
4810
4811#ifdef __APPLE__
4812
4813/* Simplified UTF-8 decoder using surrogateescape error handler,
4814   used to decode the command line arguments on Mac OS X. */
4815
4816wchar_t*
4817_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4818{
4819    const char *e;
4820    wchar_t *unicode;
4821    Py_ssize_t outpos;
4822
4823    /* Note: size will always be longer than the resulting Unicode
4824       character count */
4825    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4826        PyErr_NoMemory();
4827        return NULL;
4828    }
4829    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4830    if (!unicode)
4831        return NULL;
4832
4833    /* Unpack UTF-8 encoded data */
4834    e = s + size;
4835    outpos = 0;
4836    while (s < e) {
4837        Py_UCS4 ch;
4838#if SIZEOF_WCHAR_T == 4
4839        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4840#else
4841        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4842#endif
4843        if (ch > 0xFF) {
4844#if SIZEOF_WCHAR_T == 4
4845            assert(0);
4846#else
4847            assert(Py_UNICODE_IS_SURROGATE(ch));
4848            /*  compute and append the two surrogates: */
4849            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4850            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4851#endif
4852        }
4853        else {
4854            if (!ch && s == e)
4855                break;
4856            /* surrogateescape */
4857            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4858        }
4859    }
4860    unicode[outpos] = L'\0';
4861    return unicode;
4862}
4863
4864#endif /* __APPLE__ */
4865
4866/* Primary internal function which creates utf8 encoded bytes objects.
4867
4868   Allocation strategy:  if the string is short, convert into a stack buffer
4869   and allocate exactly as much space needed at the end.  Else allocate the
4870   maximum possible needed (4 result bytes per Unicode character), and return
4871   the excess memory at the end.
4872*/
4873PyObject *
4874_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4875{
4876    enum PyUnicode_Kind kind;
4877    void *data;
4878    Py_ssize_t size;
4879
4880    if (!PyUnicode_Check(unicode)) {
4881        PyErr_BadArgument();
4882        return NULL;
4883    }
4884
4885    if (PyUnicode_READY(unicode) == -1)
4886        return NULL;
4887
4888    if (PyUnicode_UTF8(unicode))
4889        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4890                                         PyUnicode_UTF8_LENGTH(unicode));
4891
4892    kind = PyUnicode_KIND(unicode);
4893    data = PyUnicode_DATA(unicode);
4894    size = PyUnicode_GET_LENGTH(unicode);
4895
4896    switch (kind) {
4897    default:
4898        assert(0);
4899    case PyUnicode_1BYTE_KIND:
4900        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4901        assert(!PyUnicode_IS_ASCII(unicode));
4902        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4903    case PyUnicode_2BYTE_KIND:
4904        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4905    case PyUnicode_4BYTE_KIND:
4906        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4907    }
4908}
4909
4910PyObject *
4911PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4912                     Py_ssize_t size,
4913                     const char *errors)
4914{
4915    PyObject *v, *unicode;
4916
4917    unicode = PyUnicode_FromUnicode(s, size);
4918    if (unicode == NULL)
4919        return NULL;
4920    v = _PyUnicode_AsUTF8String(unicode, errors);
4921    Py_DECREF(unicode);
4922    return v;
4923}
4924
4925PyObject *
4926PyUnicode_AsUTF8String(PyObject *unicode)
4927{
4928    return _PyUnicode_AsUTF8String(unicode, NULL);
4929}
4930
4931/* --- UTF-32 Codec ------------------------------------------------------- */
4932
4933PyObject *
4934PyUnicode_DecodeUTF32(const char *s,
4935                      Py_ssize_t size,
4936                      const char *errors,
4937                      int *byteorder)
4938{
4939    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4940}
4941
4942PyObject *
4943PyUnicode_DecodeUTF32Stateful(const char *s,
4944                              Py_ssize_t size,
4945                              const char *errors,
4946                              int *byteorder,
4947                              Py_ssize_t *consumed)
4948{
4949    const char *starts = s;
4950    Py_ssize_t startinpos;
4951    Py_ssize_t endinpos;
4952    Py_ssize_t outpos;
4953    PyObject *unicode;
4954    const unsigned char *q, *e;
4955    int bo = 0;       /* assume native ordering by default */
4956    const char *errmsg = "";
4957    /* Offsets from q for retrieving bytes in the right order. */
4958#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4959    int iorder[] = {0, 1, 2, 3};
4960#else
4961    int iorder[] = {3, 2, 1, 0};
4962#endif
4963    PyObject *errorHandler = NULL;
4964    PyObject *exc = NULL;
4965
4966    q = (unsigned char *)s;
4967    e = q + size;
4968
4969    if (byteorder)
4970        bo = *byteorder;
4971
4972    /* Check for BOM marks (U+FEFF) in the input and adjust current
4973       byte order setting accordingly. In native mode, the leading BOM
4974       mark is skipped, in all other modes, it is copied to the output
4975       stream as-is (giving a ZWNBSP character). */
4976    if (bo == 0) {
4977        if (size >= 4) {
4978            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4979                (q[iorder[1]] << 8) | q[iorder[0]];
4980#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4981            if (bom == 0x0000FEFF) {
4982                q += 4;
4983                bo = -1;
4984            }
4985            else if (bom == 0xFFFE0000) {
4986                q += 4;
4987                bo = 1;
4988            }
4989#else
4990            if (bom == 0x0000FEFF) {
4991                q += 4;
4992                bo = 1;
4993            }
4994            else if (bom == 0xFFFE0000) {
4995                q += 4;
4996                bo = -1;
4997            }
4998#endif
4999        }
5000    }
5001
5002    if (bo == -1) {
5003        /* force LE */
5004        iorder[0] = 0;
5005        iorder[1] = 1;
5006        iorder[2] = 2;
5007        iorder[3] = 3;
5008    }
5009    else if (bo == 1) {
5010        /* force BE */
5011        iorder[0] = 3;
5012        iorder[1] = 2;
5013        iorder[2] = 1;
5014        iorder[3] = 0;
5015    }
5016
5017    /* This might be one to much, because of a BOM */
5018    unicode = PyUnicode_New((size+3)/4, 127);
5019    if (!unicode)
5020        return NULL;
5021    if (size == 0)
5022        return unicode;
5023    outpos = 0;
5024
5025    while (q < e) {
5026        Py_UCS4 ch;
5027        /* remaining bytes at the end? (size should be divisible by 4) */
5028        if (e-q<4) {
5029            if (consumed)
5030                break;
5031            errmsg = "truncated data";
5032            startinpos = ((const char *)q)-starts;
5033            endinpos = ((const char *)e)-starts;
5034            goto utf32Error;
5035            /* The remaining input chars are ignored if the callback
5036               chooses to skip the input */
5037        }
5038        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5039            (q[iorder[1]] << 8) | q[iorder[0]];
5040
5041        if (ch >= 0x110000)
5042        {
5043            errmsg = "codepoint not in range(0x110000)";
5044            startinpos = ((const char *)q)-starts;
5045            endinpos = startinpos+4;
5046            goto utf32Error;
5047        }
5048        if (unicode_putchar(&unicode, &outpos, ch) < 0)
5049            goto onError;
5050        q += 4;
5051        continue;
5052      utf32Error:
5053        if (unicode_decode_call_errorhandler(
5054                errors, &errorHandler,
5055                "utf32", errmsg,
5056                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5057                &unicode, &outpos))
5058            goto onError;
5059    }
5060
5061    if (byteorder)
5062        *byteorder = bo;
5063
5064    if (consumed)
5065        *consumed = (const char *)q-starts;
5066
5067    /* Adjust length */
5068    if (unicode_resize(&unicode, outpos) < 0)
5069        goto onError;
5070
5071    Py_XDECREF(errorHandler);
5072    Py_XDECREF(exc);
5073    return unicode_result(unicode);
5074
5075  onError:
5076    Py_DECREF(unicode);
5077    Py_XDECREF(errorHandler);
5078    Py_XDECREF(exc);
5079    return NULL;
5080}
5081
5082PyObject *
5083_PyUnicode_EncodeUTF32(PyObject *str,
5084                       const char *errors,
5085                       int byteorder)
5086{
5087    int kind;
5088    void *data;
5089    Py_ssize_t len;
5090    PyObject *v;
5091    unsigned char *p;
5092    Py_ssize_t nsize, bytesize, i;
5093    /* Offsets from p for storing byte pairs in the right order. */
5094#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5095    int iorder[] = {0, 1, 2, 3};
5096#else
5097    int iorder[] = {3, 2, 1, 0};
5098#endif
5099
5100#define STORECHAR(CH)                           \
5101    do {                                        \
5102        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5103        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5104        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5105        p[iorder[0]] = (CH) & 0xff;             \
5106        p += 4;                                 \
5107    } while(0)
5108
5109    if (!PyUnicode_Check(str)) {
5110        PyErr_BadArgument();
5111        return NULL;
5112    }
5113    if (PyUnicode_READY(str) == -1)
5114        return NULL;
5115    kind = PyUnicode_KIND(str);
5116    data = PyUnicode_DATA(str);
5117    len = PyUnicode_GET_LENGTH(str);
5118
5119    nsize = len + (byteorder == 0);
5120    bytesize = nsize * 4;
5121    if (bytesize / 4 != nsize)
5122        return PyErr_NoMemory();
5123    v = PyBytes_FromStringAndSize(NULL, bytesize);
5124    if (v == NULL)
5125        return NULL;
5126
5127    p = (unsigned char *)PyBytes_AS_STRING(v);
5128    if (byteorder == 0)
5129        STORECHAR(0xFEFF);
5130    if (len == 0)
5131        goto done;
5132
5133    if (byteorder == -1) {
5134        /* force LE */
5135        iorder[0] = 0;
5136        iorder[1] = 1;
5137        iorder[2] = 2;
5138        iorder[3] = 3;
5139    }
5140    else if (byteorder == 1) {
5141        /* force BE */
5142        iorder[0] = 3;
5143        iorder[1] = 2;
5144        iorder[2] = 1;
5145        iorder[3] = 0;
5146    }
5147
5148    for (i = 0; i < len; i++)
5149        STORECHAR(PyUnicode_READ(kind, data, i));
5150
5151  done:
5152    return v;
5153#undef STORECHAR
5154}
5155
5156PyObject *
5157PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5158                      Py_ssize_t size,
5159                      const char *errors,
5160                      int byteorder)
5161{
5162    PyObject *result;
5163    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5164    if (tmp == NULL)
5165        return NULL;
5166    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5167    Py_DECREF(tmp);
5168    return result;
5169}
5170
5171PyObject *
5172PyUnicode_AsUTF32String(PyObject *unicode)
5173{
5174    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5175}
5176
5177/* --- UTF-16 Codec ------------------------------------------------------- */
5178
5179PyObject *
5180PyUnicode_DecodeUTF16(const char *s,
5181                      Py_ssize_t size,
5182                      const char *errors,
5183                      int *byteorder)
5184{
5185    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5186}
5187
5188PyObject *
5189PyUnicode_DecodeUTF16Stateful(const char *s,
5190                              Py_ssize_t size,
5191                              const char *errors,
5192                              int *byteorder,
5193                              Py_ssize_t *consumed)
5194{
5195    const char *starts = s;
5196    Py_ssize_t startinpos;
5197    Py_ssize_t endinpos;
5198    Py_ssize_t outpos;
5199    PyObject *unicode;
5200    const unsigned char *q, *e;
5201    int bo = 0;       /* assume native ordering by default */
5202    int native_ordering;
5203    const char *errmsg = "";
5204    PyObject *errorHandler = NULL;
5205    PyObject *exc = NULL;
5206
5207    q = (unsigned char *)s;
5208    e = q + size;
5209
5210    if (byteorder)
5211        bo = *byteorder;
5212
5213    /* Check for BOM marks (U+FEFF) in the input and adjust current
5214       byte order setting accordingly. In native mode, the leading BOM
5215       mark is skipped, in all other modes, it is copied to the output
5216       stream as-is (giving a ZWNBSP character). */
5217    if (bo == 0 && size >= 2) {
5218        const Py_UCS4 bom = (q[1] << 8) | q[0];
5219        if (bom == 0xFEFF) {
5220            q += 2;
5221            bo = -1;
5222        }
5223        else if (bom == 0xFFFE) {
5224            q += 2;
5225            bo = 1;
5226        }
5227        if (byteorder)
5228            *byteorder = bo;
5229    }
5230
5231    if (q == e) {
5232        if (consumed)
5233            *consumed = size;
5234        Py_INCREF(unicode_empty);
5235        return unicode_empty;
5236    }
5237
5238#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5239    native_ordering = bo <= 0;
5240#else
5241    native_ordering = bo >= 0;
5242#endif
5243
5244    /* Note: size will always be longer than the resulting Unicode
5245       character count */
5246    unicode = PyUnicode_New((e - q + 1) / 2, 127);
5247    if (!unicode)
5248        return NULL;
5249
5250    outpos = 0;
5251    while (1) {
5252        Py_UCS4 ch = 0;
5253        if (e - q >= 2) {
5254            int kind = PyUnicode_KIND(unicode);
5255            if (kind == PyUnicode_1BYTE_KIND) {
5256                if (PyUnicode_IS_ASCII(unicode))
5257                    ch = asciilib_utf16_decode(&q, e,
5258                            PyUnicode_1BYTE_DATA(unicode), &outpos,
5259                            native_ordering);
5260                else
5261                    ch = ucs1lib_utf16_decode(&q, e,
5262                            PyUnicode_1BYTE_DATA(unicode), &outpos,
5263                            native_ordering);
5264            } else if (kind == PyUnicode_2BYTE_KIND) {
5265                ch = ucs2lib_utf16_decode(&q, e,
5266                        PyUnicode_2BYTE_DATA(unicode), &outpos,
5267                        native_ordering);
5268            } else {
5269                assert(kind == PyUnicode_4BYTE_KIND);
5270                ch = ucs4lib_utf16_decode(&q, e,
5271                        PyUnicode_4BYTE_DATA(unicode), &outpos,
5272                        native_ordering);
5273            }
5274        }
5275
5276        switch (ch)
5277        {
5278        case 0:
5279            /* remaining byte at the end? (size should be even) */
5280            if (q == e || consumed)
5281                goto End;
5282            errmsg = "truncated data";
5283            startinpos = ((const char *)q) - starts;
5284            endinpos = ((const char *)e) - starts;
5285            break;
5286            /* The remaining input chars are ignored if the callback
5287               chooses to skip the input */
5288        case 1:
5289            errmsg = "unexpected end of data";
5290            startinpos = ((const char *)q) - 2 - starts;
5291            endinpos = ((const char *)e) - starts;
5292            break;
5293        case 2:
5294            errmsg = "illegal encoding";
5295            startinpos = ((const char *)q) - 2 - starts;
5296            endinpos = startinpos + 2;
5297            break;
5298        case 3:
5299            errmsg = "illegal UTF-16 surrogate";
5300            startinpos = ((const char *)q) - 4 - starts;
5301            endinpos = startinpos + 2;
5302            break;
5303        default:
5304            if (unicode_putchar(&unicode, &outpos, ch) < 0)
5305                goto onError;
5306            continue;
5307        }
5308
5309        if (unicode_decode_call_errorhandler(
5310                errors,
5311                &errorHandler,
5312                "utf16", errmsg,
5313                &starts,
5314                (const char **)&e,
5315                &startinpos,
5316                &endinpos,
5317                &exc,
5318                (const char **)&q,
5319                &unicode,
5320                &outpos))
5321            goto onError;
5322    }
5323
5324End:
5325    if (consumed)
5326        *consumed = (const char *)q-starts;
5327
5328    /* Adjust length */
5329    if (unicode_resize(&unicode, outpos) < 0)
5330        goto onError;
5331
5332    Py_XDECREF(errorHandler);
5333    Py_XDECREF(exc);
5334    return unicode_result(unicode);
5335
5336  onError:
5337    Py_DECREF(unicode);
5338    Py_XDECREF(errorHandler);
5339    Py_XDECREF(exc);
5340    return NULL;
5341}
5342
5343PyObject *
5344_PyUnicode_EncodeUTF16(PyObject *str,
5345                       const char *errors,
5346                       int byteorder)
5347{
5348    enum PyUnicode_Kind kind;
5349    const void *data;
5350    Py_ssize_t len;
5351    PyObject *v;
5352    unsigned short *out;
5353    Py_ssize_t bytesize;
5354    Py_ssize_t pairs;
5355#ifdef WORDS_BIGENDIAN
5356    int native_ordering = byteorder >= 0;
5357#else
5358    int native_ordering = byteorder <= 0;
5359#endif
5360
5361    if (!PyUnicode_Check(str)) {
5362        PyErr_BadArgument();
5363        return NULL;
5364    }
5365    if (PyUnicode_READY(str) == -1)
5366        return NULL;
5367    kind = PyUnicode_KIND(str);
5368    data = PyUnicode_DATA(str);
5369    len = PyUnicode_GET_LENGTH(str);
5370
5371    pairs = 0;
5372    if (kind == PyUnicode_4BYTE_KIND) {
5373        const Py_UCS4 *in = (const Py_UCS4 *)data;
5374        const Py_UCS4 *end = in + len;
5375        while (in < end)
5376            if (*in++ >= 0x10000)
5377                pairs++;
5378    }
5379    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5380        return PyErr_NoMemory();
5381    bytesize = (len + pairs + (byteorder == 0)) * 2;
5382    v = PyBytes_FromStringAndSize(NULL, bytesize);
5383    if (v == NULL)
5384        return NULL;
5385
5386    /* output buffer is 2-bytes aligned */
5387    assert(((Py_uintptr_t)PyBytes_AS_STRING(v) & 1) == 0);
5388    out = (unsigned short *)PyBytes_AS_STRING(v);
5389    if (byteorder == 0)
5390        *out++ = 0xFEFF;
5391    if (len == 0)
5392        goto done;
5393
5394    switch (kind) {
5395    case PyUnicode_1BYTE_KIND: {
5396        ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5397        break;
5398    }
5399    case PyUnicode_2BYTE_KIND: {
5400        ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5401        break;
5402    }
5403    case PyUnicode_4BYTE_KIND: {
5404        ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5405        break;
5406    }
5407    default:
5408        assert(0);
5409    }
5410
5411  done:
5412    return v;
5413}
5414
5415PyObject *
5416PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5417                      Py_ssize_t size,
5418                      const char *errors,
5419                      int byteorder)
5420{
5421    PyObject *result;
5422    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5423    if (tmp == NULL)
5424        return NULL;
5425    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5426    Py_DECREF(tmp);
5427    return result;
5428}
5429
5430PyObject *
5431PyUnicode_AsUTF16String(PyObject *unicode)
5432{
5433    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5434}
5435
5436/* --- Unicode Escape Codec ----------------------------------------------- */
5437
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string decodes to a pure ASCII string.
   Returns -1 if the size is negative, if the input contains a non-ASCII
   byte or a backslash at the very end, or if it contains an escape
   (octal, \x, \u, \U, \N) that is not guaranteed to produce an ASCII
   character.  Otherwise returns the length of the buffer required to
   hold the decoded string.
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *cur;
    const unsigned char *stop;
    Py_ssize_t count = 0;

    if (size < 0)
        return -1;

    cur = (const unsigned char *)s;
    stop = cur + size;

    while (cur < stop) {
        unsigned char c = *cur++;

        /* Any non-ASCII byte disqualifies the whole string. */
        if (c > 127)
            return -1;

        /* Ordinary character: one input byte, one output character. */
        if (c != '\\') {
            count++;
            continue;
        }

        /* Backslash: it must be followed by another ASCII byte. */
        if (cur >= stop)
            return -1;
        c = *cur++;
        if (c > 127)
            return -1;

        switch (c) {
        case '\n':
            /* backslash + newline produces no output at all */
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* numeric and named escapes may yield non-ASCII characters */
            return -1;
        case '\\': case '\'': case '\"':
        case 'a': case 'b': case 'f': case 'n':
        case 'r': case 't': case 'v':
            /* simple escape: two input bytes, one output character */
            count += 1;
            break;
        default:
            /* unknown escape: backslash and character are both kept */
            count += 2;
            break;
        }
    }
    return count;
}
5492
5493static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5494
5495PyObject *
5496PyUnicode_DecodeUnicodeEscape(const char *s,
5497                              Py_ssize_t size,
5498                              const char *errors)
5499{
5500    const char *starts = s;
5501    Py_ssize_t startinpos;
5502    Py_ssize_t endinpos;
5503    int j;
5504    PyObject *v;
5505    const char *end;
5506    char* message;
5507    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5508    PyObject *errorHandler = NULL;
5509    PyObject *exc = NULL;
5510    Py_ssize_t len;
5511    Py_ssize_t i;
5512
5513    len = length_of_escaped_ascii_string(s, size);
5514
5515    /* After length_of_escaped_ascii_string() there are two alternatives,
5516       either the string is pure ASCII with named escapes like \n, etc.
5517       and we determined it's exact size (common case)
5518       or it contains \x, \u, ... escape sequences.  then we create a
5519       legacy wchar string and resize it at the end of this function. */
5520    if (len >= 0) {
5521        v = PyUnicode_New(len, 127);
5522        if (!v)
5523            goto onError;
5524        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5525    }
5526    else {
5527        /* Escaped strings will always be longer than the resulting
5528           Unicode string, so we start with size here and then reduce the
5529           length after conversion to the true value.
5530           (but if the error callback returns a long replacement string
5531           we'll have to allocate more space) */
5532        v = PyUnicode_New(size, 127);
5533        if (!v)
5534            goto onError;
5535        len = size;
5536    }
5537
5538    if (size == 0)
5539        return v;
5540    i = 0;
5541    end = s + size;
5542
5543    while (s < end) {
5544        unsigned char c;
5545        Py_UCS4 x;
5546        int digits;
5547
5548        /* The only case in which i == ascii_length is a backslash
5549           followed by a newline. */
5550        assert(i <= len);
5551
5552        /* Non-escape characters are interpreted as Unicode ordinals */
5553        if (*s != '\\') {
5554            if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5555                goto onError;
5556            continue;
5557        }
5558
5559        startinpos = s-starts;
5560        /* \ - Escapes */
5561        s++;
5562        c = *s++;
5563        if (s > end)
5564            c = '\0'; /* Invalid after \ */
5565
5566        /* The only case in which i == ascii_length is a backslash
5567           followed by a newline. */
5568        assert(i < len || (i == len && c == '\n'));
5569
5570        switch (c) {
5571
5572            /* \x escapes */
5573#define WRITECHAR(ch)                                   \
5574            do {                                        \
5575                if (unicode_putchar(&v, &i, ch) < 0)    \
5576                    goto onError;                       \
5577            }while(0)
5578
5579        case '\n': break;
5580        case '\\': WRITECHAR('\\'); break;
5581        case '\'': WRITECHAR('\''); break;
5582        case '\"': WRITECHAR('\"'); break;
5583        case 'b': WRITECHAR('\b'); break;
5584        /* FF */
5585        case 'f': WRITECHAR('\014'); break;
5586        case 't': WRITECHAR('\t'); break;
5587        case 'n': WRITECHAR('\n'); break;
5588        case 'r': WRITECHAR('\r'); break;
5589        /* VT */
5590        case 'v': WRITECHAR('\013'); break;
5591        /* BEL, not classic C */
5592        case 'a': WRITECHAR('\007'); break;
5593
5594            /* \OOO (octal) escapes */
5595        case '0': case '1': case '2': case '3':
5596        case '4': case '5': case '6': case '7':
5597            x = s[-1] - '0';
5598            if (s < end && '0' <= *s && *s <= '7') {
5599                x = (x<<3) + *s++ - '0';
5600                if (s < end && '0' <= *s && *s <= '7')
5601                    x = (x<<3) + *s++ - '0';
5602            }
5603            WRITECHAR(x);
5604            break;
5605
5606            /* hex escapes */
5607            /* \xXX */
5608        case 'x':
5609            digits = 2;
5610            message = "truncated \\xXX escape";
5611            goto hexescape;
5612
5613            /* \uXXXX */
5614        case 'u':
5615            digits = 4;
5616            message = "truncated \\uXXXX escape";
5617            goto hexescape;
5618
5619            /* \UXXXXXXXX */
5620        case 'U':
5621            digits = 8;
5622            message = "truncated \\UXXXXXXXX escape";
5623        hexescape:
5624            chr = 0;
5625            if (s+digits>end) {
5626                endinpos = size;
5627                if (unicode_decode_call_errorhandler(
5628                        errors, &errorHandler,
5629                        "unicodeescape", "end of string in escape sequence",
5630                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5631                        &v, &i))
5632                    goto onError;
5633                goto nextByte;
5634            }
5635            for (j = 0; j < digits; ++j) {
5636                c = (unsigned char) s[j];
5637                if (!Py_ISXDIGIT(c)) {
5638                    endinpos = (s+j+1)-starts;
5639                    if (unicode_decode_call_errorhandler(
5640                            errors, &errorHandler,
5641                            "unicodeescape", message,
5642                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5643                            &v, &i))
5644                        goto onError;
5645                    len = PyUnicode_GET_LENGTH(v);
5646                    goto nextByte;
5647                }
5648                chr = (chr<<4) & ~0xF;
5649                if (c >= '0' && c <= '9')
5650                    chr += c - '0';
5651                else if (c >= 'a' && c <= 'f')
5652                    chr += 10 + c - 'a';
5653                else
5654                    chr += 10 + c - 'A';
5655            }
5656            s += j;
5657            if (chr == 0xffffffff && PyErr_Occurred())
5658                /* _decoding_error will have already written into the
5659                   target buffer. */
5660                break;
5661        store:
5662            /* when we get here, chr is a 32-bit unicode character */
5663            if (chr <= MAX_UNICODE) {
5664                WRITECHAR(chr);
5665            } else {
5666                endinpos = s-starts;
5667                if (unicode_decode_call_errorhandler(
5668                        errors, &errorHandler,
5669                        "unicodeescape", "illegal Unicode character",
5670                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5671                        &v, &i))
5672                    goto onError;
5673            }
5674            break;
5675
5676            /* \N{name} */
5677        case 'N':
5678            message = "malformed \\N character escape";
5679            if (ucnhash_CAPI == NULL) {
5680                /* load the unicode data module */
5681                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5682                                                PyUnicodeData_CAPSULE_NAME, 1);
5683                if (ucnhash_CAPI == NULL)
5684                    goto ucnhashError;
5685            }
5686            if (*s == '{') {
5687                const char *start = s+1;
5688                /* look for the closing brace */
5689                while (*s != '}' && s < end)
5690                    s++;
5691                if (s > start && s < end && *s == '}') {
5692                    /* found a name.  look it up in the unicode database */
5693                    message = "unknown Unicode character name";
5694                    s++;
5695                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5696                                              &chr, 0))
5697                        goto store;
5698                }
5699            }
5700            endinpos = s-starts;
5701            if (unicode_decode_call_errorhandler(
5702                    errors, &errorHandler,
5703                    "unicodeescape", message,
5704                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5705                    &v, &i))
5706                goto onError;
5707            break;
5708
5709        default:
5710            if (s > end) {
5711                message = "\\ at end of string";
5712                s--;
5713                endinpos = s-starts;
5714                if (unicode_decode_call_errorhandler(
5715                        errors, &errorHandler,
5716                        "unicodeescape", message,
5717                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5718                        &v, &i))
5719                    goto onError;
5720            }
5721            else {
5722                WRITECHAR('\\');
5723                WRITECHAR(s[-1]);
5724            }
5725            break;
5726        }
5727      nextByte:
5728        ;
5729    }
5730#undef WRITECHAR
5731
5732    if (unicode_resize(&v, i) < 0)
5733        goto onError;
5734    Py_XDECREF(errorHandler);
5735    Py_XDECREF(exc);
5736    return unicode_result(v);
5737
5738  ucnhashError:
5739    PyErr_SetString(
5740        PyExc_UnicodeError,
5741        "\\N escapes not supported (can't load unicodedata module)"
5742        );
5743    Py_XDECREF(v);
5744    Py_XDECREF(errorHandler);
5745    Py_XDECREF(exc);
5746    return NULL;
5747
5748  onError:
5749    Py_XDECREF(v);
5750    Py_XDECREF(errorHandler);
5751    Py_XDECREF(exc);
5752    return NULL;
5753}
5754
5755/* Return a Unicode-Escape string version of the Unicode object.
5756
5757   If quotes is true, the string is enclosed in u"" or u'' quotes as
5758   appropriate.
5759
5760*/
5761
5762PyObject *
5763PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5764{
5765    Py_ssize_t i, len;
5766    PyObject *repr;
5767    char *p;
5768    int kind;
5769    void *data;
5770    Py_ssize_t expandsize = 0;
5771
5772    /* Initial allocation is based on the longest-possible unichr
5773       escape.
5774
5775       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5776       unichr, so in this case it's the longest unichr escape. In
5777       narrow (UTF-16) builds this is five chars per source unichr
5778       since there are two unichrs in the surrogate pair, so in narrow
5779       (UTF-16) builds it's not the longest unichr escape.
5780
5781       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5782       so in the narrow (UTF-16) build case it's the longest unichr
5783       escape.
5784    */
5785
5786    if (!PyUnicode_Check(unicode)) {
5787        PyErr_BadArgument();
5788        return NULL;
5789    }
5790    if (PyUnicode_READY(unicode) == -1)
5791        return NULL;
5792    len = PyUnicode_GET_LENGTH(unicode);
5793    kind = PyUnicode_KIND(unicode);
5794    data = PyUnicode_DATA(unicode);
5795    switch (kind) {
5796    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5797    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5798    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5799    }
5800
5801    if (len == 0)
5802        return PyBytes_FromStringAndSize(NULL, 0);
5803
5804    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5805        return PyErr_NoMemory();
5806
5807    repr = PyBytes_FromStringAndSize(NULL,
5808                                     2
5809                                     + expandsize*len
5810                                     + 1);
5811    if (repr == NULL)
5812        return NULL;
5813
5814    p = PyBytes_AS_STRING(repr);
5815
5816    for (i = 0; i < len; i++) {
5817        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5818
5819        /* Escape backslashes */
5820        if (ch == '\\') {
5821            *p++ = '\\';
5822            *p++ = (char) ch;
5823            continue;
5824        }
5825
5826        /* Map 21-bit characters to '\U00xxxxxx' */
5827        else if (ch >= 0x10000) {
5828            assert(ch <= MAX_UNICODE);
5829            *p++ = '\\';
5830            *p++ = 'U';
5831            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5832            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5833            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5834            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5835            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5836            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5837            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5838            *p++ = Py_hexdigits[ch & 0x0000000F];
5839            continue;
5840        }
5841
5842        /* Map 16-bit characters to '\uxxxx' */
5843        if (ch >= 256) {
5844            *p++ = '\\';
5845            *p++ = 'u';
5846            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5847            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5848            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5849            *p++ = Py_hexdigits[ch & 0x000F];
5850        }
5851
5852        /* Map special whitespace to '\t', \n', '\r' */
5853        else if (ch == '\t') {
5854            *p++ = '\\';
5855            *p++ = 't';
5856        }
5857        else if (ch == '\n') {
5858            *p++ = '\\';
5859            *p++ = 'n';
5860        }
5861        else if (ch == '\r') {
5862            *p++ = '\\';
5863            *p++ = 'r';
5864        }
5865
5866        /* Map non-printable US ASCII to '\xhh' */
5867        else if (ch < ' ' || ch >= 0x7F) {
5868            *p++ = '\\';
5869            *p++ = 'x';
5870            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5871            *p++ = Py_hexdigits[ch & 0x000F];
5872        }
5873
5874        /* Copy everything else as-is */
5875        else
5876            *p++ = (char) ch;
5877    }
5878
5879    assert(p - PyBytes_AS_STRING(repr) > 0);
5880    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5881        return NULL;
5882    return repr;
5883}
5884
5885PyObject *
5886PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5887                              Py_ssize_t size)
5888{
5889    PyObject *result;
5890    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5891    if (tmp == NULL)
5892        return NULL;
5893    result = PyUnicode_AsUnicodeEscapeString(tmp);
5894    Py_DECREF(tmp);
5895    return result;
5896}
5897
5898/* --- Raw Unicode Escape Codec ------------------------------------------- */
5899
5900PyObject *
5901PyUnicode_DecodeRawUnicodeEscape(const char *s,
5902                                 Py_ssize_t size,
5903                                 const char *errors)
5904{
5905    const char *starts = s;
5906    Py_ssize_t startinpos;
5907    Py_ssize_t endinpos;
5908    Py_ssize_t outpos;
5909    PyObject *v;
5910    const char *end;
5911    const char *bs;
5912    PyObject *errorHandler = NULL;
5913    PyObject *exc = NULL;
5914
5915    /* Escaped strings will always be longer than the resulting
5916       Unicode string, so we start with size here and then reduce the
5917       length after conversion to the true value. (But decoding error
5918       handler might have to resize the string) */
5919    v = PyUnicode_New(size, 127);
5920    if (v == NULL)
5921        goto onError;
5922    if (size == 0)
5923        return v;
5924    outpos = 0;
5925    end = s + size;
5926    while (s < end) {
5927        unsigned char c;
5928        Py_UCS4 x;
5929        int i;
5930        int count;
5931
5932        /* Non-escape characters are interpreted as Unicode ordinals */
5933        if (*s != '\\') {
5934            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5935                goto onError;
5936            continue;
5937        }
5938        startinpos = s-starts;
5939
5940        /* \u-escapes are only interpreted iff the number of leading
5941           backslashes if odd */
5942        bs = s;
5943        for (;s < end;) {
5944            if (*s != '\\')
5945                break;
5946            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5947                goto onError;
5948        }
5949        if (((s - bs) & 1) == 0 ||
5950            s >= end ||
5951            (*s != 'u' && *s != 'U')) {
5952            continue;
5953        }
5954        outpos--;
5955        count = *s=='u' ? 4 : 8;
5956        s++;
5957
5958        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5959        for (x = 0, i = 0; i < count; ++i, ++s) {
5960            c = (unsigned char)*s;
5961            if (!Py_ISXDIGIT(c)) {
5962                endinpos = s-starts;
5963                if (unicode_decode_call_errorhandler(
5964                        errors, &errorHandler,
5965                        "rawunicodeescape", "truncated \\uXXXX",
5966                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5967                        &v, &outpos))
5968                    goto onError;
5969                goto nextByte;
5970            }
5971            x = (x<<4) & ~0xF;
5972            if (c >= '0' && c <= '9')
5973                x += c - '0';
5974            else if (c >= 'a' && c <= 'f')
5975                x += 10 + c - 'a';
5976            else
5977                x += 10 + c - 'A';
5978        }
5979        if (x <= MAX_UNICODE) {
5980            if (unicode_putchar(&v, &outpos, x) < 0)
5981                goto onError;
5982        } else {
5983            endinpos = s-starts;
5984            if (unicode_decode_call_errorhandler(
5985                    errors, &errorHandler,
5986                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5987                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5988                    &v, &outpos))
5989                goto onError;
5990        }
5991      nextByte:
5992        ;
5993    }
5994    if (unicode_resize(&v, outpos) < 0)
5995        goto onError;
5996    Py_XDECREF(errorHandler);
5997    Py_XDECREF(exc);
5998    return unicode_result(v);
5999
6000  onError:
6001    Py_XDECREF(v);
6002    Py_XDECREF(errorHandler);
6003    Py_XDECREF(exc);
6004    return NULL;
6005}
6006
6007
6008PyObject *
6009PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6010{
6011    PyObject *repr;
6012    char *p;
6013    char *q;
6014    Py_ssize_t expandsize, pos;
6015    int kind;
6016    void *data;
6017    Py_ssize_t len;
6018
6019    if (!PyUnicode_Check(unicode)) {
6020        PyErr_BadArgument();
6021        return NULL;
6022    }
6023    if (PyUnicode_READY(unicode) == -1)
6024        return NULL;
6025    kind = PyUnicode_KIND(unicode);
6026    data = PyUnicode_DATA(unicode);
6027    len = PyUnicode_GET_LENGTH(unicode);
6028    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6029       bytes, and 1 byte characters 4. */
6030    expandsize = kind * 2 + 2;
6031
6032    if (len > PY_SSIZE_T_MAX / expandsize)
6033        return PyErr_NoMemory();
6034
6035    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6036    if (repr == NULL)
6037        return NULL;
6038    if (len == 0)
6039        return repr;
6040
6041    p = q = PyBytes_AS_STRING(repr);
6042    for (pos = 0; pos < len; pos++) {
6043        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6044        /* Map 32-bit characters to '\Uxxxxxxxx' */
6045        if (ch >= 0x10000) {
6046            assert(ch <= MAX_UNICODE);
6047            *p++ = '\\';
6048            *p++ = 'U';
6049            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6050            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6051            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6052            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6053            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6054            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6055            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6056            *p++ = Py_hexdigits[ch & 15];
6057        }
6058        /* Map 16-bit characters to '\uxxxx' */
6059        else if (ch >= 256) {
6060            *p++ = '\\';
6061            *p++ = 'u';
6062            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6063            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6064            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6065            *p++ = Py_hexdigits[ch & 15];
6066        }
6067        /* Copy everything else as-is */
6068        else
6069            *p++ = (char) ch;
6070    }
6071
6072    assert(p > q);
6073    if (_PyBytes_Resize(&repr, p - q) < 0)
6074        return NULL;
6075    return repr;
6076}
6077
6078PyObject *
6079PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6080                                 Py_ssize_t size)
6081{
6082    PyObject *result;
6083    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6084    if (tmp == NULL)
6085        return NULL;
6086    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6087    Py_DECREF(tmp);
6088    return result;
6089}
6090
6091/* --- Unicode Internal Codec ------------------------------------------- */
6092
6093PyObject *
6094_PyUnicode_DecodeUnicodeInternal(const char *s,
6095                                 Py_ssize_t size,
6096                                 const char *errors)
6097{
6098    const char *starts = s;
6099    Py_ssize_t startinpos;
6100    Py_ssize_t endinpos;
6101    Py_ssize_t outpos;
6102    PyObject *v;
6103    const char *end;
6104    const char *reason;
6105    PyObject *errorHandler = NULL;
6106    PyObject *exc = NULL;
6107
6108    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6109                     "unicode_internal codec has been deprecated",
6110                     1))
6111        return NULL;
6112
6113    /* XXX overflow detection missing */
6114    v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
6115    if (v == NULL)
6116        goto onError;
6117    if (PyUnicode_GET_LENGTH(v) == 0)
6118        return v;
6119    outpos = 0;
6120    end = s + size;
6121
6122    while (s < end) {
6123        Py_UNICODE uch;
6124        Py_UCS4 ch;
6125        /* We copy the raw representation one byte at a time because the
6126           pointer may be unaligned (see test_codeccallbacks). */
6127        ((char *) &uch)[0] = s[0];
6128        ((char *) &uch)[1] = s[1];
6129#ifdef Py_UNICODE_WIDE
6130        ((char *) &uch)[2] = s[2];
6131        ((char *) &uch)[3] = s[3];
6132#endif
6133        ch = uch;
6134
6135        /* We have to sanity check the raw data, otherwise doom looms for
6136           some malformed UCS-4 data. */
6137        if (
6138#ifdef Py_UNICODE_WIDE
6139            ch > 0x10ffff ||
6140#endif
6141            end-s < Py_UNICODE_SIZE
6142            )
6143        {
6144            startinpos = s - starts;
6145            if (end-s < Py_UNICODE_SIZE) {
6146                endinpos = end-starts;
6147                reason = "truncated input";
6148            }
6149            else {
6150                endinpos = s - starts + Py_UNICODE_SIZE;
6151                reason = "illegal code point (> 0x10FFFF)";
6152            }
6153            if (unicode_decode_call_errorhandler(
6154                    errors, &errorHandler,
6155                    "unicode_internal", reason,
6156                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6157                    &v, &outpos))
6158                goto onError;
6159            continue;
6160        }
6161
6162        s += Py_UNICODE_SIZE;
6163#ifndef Py_UNICODE_WIDE
6164        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
6165        {
6166            Py_UNICODE uch2;
6167            ((char *) &uch2)[0] = s[0];
6168            ((char *) &uch2)[1] = s[1];
6169            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6170            {
6171                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6172                s += Py_UNICODE_SIZE;
6173            }
6174        }
6175#endif
6176
6177        if (unicode_putchar(&v, &outpos, ch) < 0)
6178            goto onError;
6179    }
6180
6181    if (unicode_resize(&v, outpos) < 0)
6182        goto onError;
6183    Py_XDECREF(errorHandler);
6184    Py_XDECREF(exc);
6185    return unicode_result(v);
6186
6187  onError:
6188    Py_XDECREF(v);
6189    Py_XDECREF(errorHandler);
6190    Py_XDECREF(exc);
6191    return NULL;
6192}
6193
6194/* --- Latin-1 Codec ------------------------------------------------------ */
6195
6196PyObject *
6197PyUnicode_DecodeLatin1(const char *s,
6198                       Py_ssize_t size,
6199                       const char *errors)
6200{
6201    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6202    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6203}
6204
6205/* create or adjust a UnicodeEncodeError */
6206static void
6207make_encode_exception(PyObject **exceptionObject,
6208                      const char *encoding,
6209                      PyObject *unicode,
6210                      Py_ssize_t startpos, Py_ssize_t endpos,
6211                      const char *reason)
6212{
6213    if (*exceptionObject == NULL) {
6214        *exceptionObject = PyObject_CallFunction(
6215            PyExc_UnicodeEncodeError, "sOnns",
6216            encoding, unicode, startpos, endpos, reason);
6217    }
6218    else {
6219        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6220            goto onError;
6221        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6222            goto onError;
6223        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6224            goto onError;
6225        return;
6226      onError:
6227        Py_DECREF(*exceptionObject);
6228        *exceptionObject = NULL;
6229    }
6230}
6231
6232/* raises a UnicodeEncodeError */
6233static void
6234raise_encode_exception(PyObject **exceptionObject,
6235                       const char *encoding,
6236                       PyObject *unicode,
6237                       Py_ssize_t startpos, Py_ssize_t endpos,
6238                       const char *reason)
6239{
6240    make_encode_exception(exceptionObject,
6241                          encoding, unicode, startpos, endpos, reason);
6242    if (*exceptionObject != NULL)
6243        PyCodec_StrictErrors(*exceptionObject);
6244}
6245
6246/* error handling callback helper:
6247   build arguments, call the callback and check the arguments,
6248   put the result into newpos and return the replacement string, which
6249   has to be freed by the caller */
6250static PyObject *
6251unicode_encode_call_errorhandler(const char *errors,
6252                                 PyObject **errorHandler,
6253                                 const char *encoding, const char *reason,
6254                                 PyObject *unicode, PyObject **exceptionObject,
6255                                 Py_ssize_t startpos, Py_ssize_t endpos,
6256                                 Py_ssize_t *newpos)
6257{
6258    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6259    Py_ssize_t len;
6260    PyObject *restuple;
6261    PyObject *resunicode;
6262
6263    if (*errorHandler == NULL) {
6264        *errorHandler = PyCodec_LookupError(errors);
6265        if (*errorHandler == NULL)
6266            return NULL;
6267    }
6268
6269    if (PyUnicode_READY(unicode) == -1)
6270        return NULL;
6271    len = PyUnicode_GET_LENGTH(unicode);
6272
6273    make_encode_exception(exceptionObject,
6274                          encoding, unicode, startpos, endpos, reason);
6275    if (*exceptionObject == NULL)
6276        return NULL;
6277
6278    restuple = PyObject_CallFunctionObjArgs(
6279        *errorHandler, *exceptionObject, NULL);
6280    if (restuple == NULL)
6281        return NULL;
6282    if (!PyTuple_Check(restuple)) {
6283        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6284        Py_DECREF(restuple);
6285        return NULL;
6286    }
6287    if (!PyArg_ParseTuple(restuple, argparse,
6288                          &resunicode, newpos)) {
6289        Py_DECREF(restuple);
6290        return NULL;
6291    }
6292    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6293        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6294        Py_DECREF(restuple);
6295        return NULL;
6296    }
6297    if (*newpos<0)
6298        *newpos = len + *newpos;
6299    if (*newpos<0 || *newpos>len) {
6300        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6301        Py_DECREF(restuple);
6302        return NULL;
6303    }
6304    Py_INCREF(resunicode);
6305    Py_DECREF(restuple);
6306    return resunicode;
6307}
6308
6309static PyObject *
6310unicode_encode_ucs1(PyObject *unicode,
6311                    const char *errors,
6312                    unsigned int limit)
6313{
6314    /* input state */
6315    Py_ssize_t pos=0, size;
6316    int kind;
6317    void *data;
6318    /* output object */
6319    PyObject *res;
6320    /* pointer into the output */
6321    char *str;
6322    /* current output position */
6323    Py_ssize_t ressize;
6324    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6325    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6326    PyObject *errorHandler = NULL;
6327    PyObject *exc = NULL;
6328    /* the following variable is used for caching string comparisons
6329     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6330    int known_errorHandler = -1;
6331
6332    if (PyUnicode_READY(unicode) == -1)
6333        return NULL;
6334    size = PyUnicode_GET_LENGTH(unicode);
6335    kind = PyUnicode_KIND(unicode);
6336    data = PyUnicode_DATA(unicode);
6337    /* allocate enough for a simple encoding without
6338       replacements, if we need more, we'll resize */
6339    if (size == 0)
6340        return PyBytes_FromStringAndSize(NULL, 0);
6341    res = PyBytes_FromStringAndSize(NULL, size);
6342    if (res == NULL)
6343        return NULL;
6344    str = PyBytes_AS_STRING(res);
6345    ressize = size;
6346
6347    while (pos < size) {
6348        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6349
6350        /* can we encode this? */
6351        if (c<limit) {
6352            /* no overflow check, because we know that the space is enough */
6353            *str++ = (char)c;
6354            ++pos;
6355        }
6356        else {
6357            Py_ssize_t requiredsize;
6358            PyObject *repunicode;
6359            Py_ssize_t repsize, newpos, respos, i;
6360            /* startpos for collecting unencodable chars */
6361            Py_ssize_t collstart = pos;
6362            Py_ssize_t collend = pos;
6363            /* find all unecodable characters */
6364            while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6365                ++collend;
6366            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6367            if (known_errorHandler==-1) {
6368                if ((errors==NULL) || (!strcmp(errors, "strict")))
6369                    known_errorHandler = 1;
6370                else if (!strcmp(errors, "replace"))
6371                    known_errorHandler = 2;
6372                else if (!strcmp(errors, "ignore"))
6373                    known_errorHandler = 3;
6374                else if (!strcmp(errors, "xmlcharrefreplace"))
6375                    known_errorHandler = 4;
6376                else
6377                    known_errorHandler = 0;
6378            }
6379            switch (known_errorHandler) {
6380            case 1: /* strict */
6381                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6382                goto onError;
6383            case 2: /* replace */
6384                while (collstart++<collend)
6385                    *str++ = '?'; /* fall through */
6386            case 3: /* ignore */
6387                pos = collend;
6388                break;
6389            case 4: /* xmlcharrefreplace */
6390                respos = str - PyBytes_AS_STRING(res);
6391                /* determine replacement size */
6392                for (i = collstart, repsize = 0; i < collend; ++i) {
6393                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6394                    if (ch < 10)
6395                        repsize += 2+1+1;
6396                    else if (ch < 100)
6397                        repsize += 2+2+1;
6398                    else if (ch < 1000)
6399                        repsize += 2+3+1;
6400                    else if (ch < 10000)
6401                        repsize += 2+4+1;
6402                    else if (ch < 100000)
6403                        repsize += 2+5+1;
6404                    else if (ch < 1000000)
6405                        repsize += 2+6+1;
6406                    else {
6407                        assert(ch <= MAX_UNICODE);
6408                        repsize += 2+7+1;
6409                    }
6410                }
6411                requiredsize = respos+repsize+(size-collend);
6412                if (requiredsize > ressize) {
6413                    if (requiredsize<2*ressize)
6414                        requiredsize = 2*ressize;
6415                    if (_PyBytes_Resize(&res, requiredsize))
6416                        goto onError;
6417                    str = PyBytes_AS_STRING(res) + respos;
6418                    ressize = requiredsize;
6419                }
6420                /* generate replacement */
6421                for (i = collstart; i < collend; ++i) {
6422                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6423                }
6424                pos = collend;
6425                break;
6426            default:
6427                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6428                                                              encoding, reason, unicode, &exc,
6429                                                              collstart, collend, &newpos);
6430                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6431                                           PyUnicode_READY(repunicode) == -1))
6432                    goto onError;
6433                if (PyBytes_Check(repunicode)) {
6434                    /* Directly copy bytes result to output. */
6435                    repsize = PyBytes_Size(repunicode);
6436                    if (repsize > 1) {
6437                        /* Make room for all additional bytes. */
6438                        respos = str - PyBytes_AS_STRING(res);
6439                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6440                            Py_DECREF(repunicode);
6441                            goto onError;
6442                        }
6443                        str = PyBytes_AS_STRING(res) + respos;
6444                        ressize += repsize-1;
6445                    }
6446                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6447                    str += repsize;
6448                    pos = newpos;
6449                    Py_DECREF(repunicode);
6450                    break;
6451                }
6452                /* need more space? (at least enough for what we
6453                   have+the replacement+the rest of the string, so
6454                   we won't have to check space for encodable characters) */
6455                respos = str - PyBytes_AS_STRING(res);
6456                repsize = PyUnicode_GET_LENGTH(repunicode);
6457                requiredsize = respos+repsize+(size-collend);
6458                if (requiredsize > ressize) {
6459                    if (requiredsize<2*ressize)
6460                        requiredsize = 2*ressize;
6461                    if (_PyBytes_Resize(&res, requiredsize)) {
6462                        Py_DECREF(repunicode);
6463                        goto onError;
6464                    }
6465                    str = PyBytes_AS_STRING(res) + respos;
6466                    ressize = requiredsize;
6467                }
6468                /* check if there is anything unencodable in the replacement
6469                   and copy it to the output */
6470                for (i = 0; repsize-->0; ++i, ++str) {
6471                    c = PyUnicode_READ_CHAR(repunicode, i);
6472                    if (c >= limit) {
6473                        raise_encode_exception(&exc, encoding, unicode,
6474                                               pos, pos+1, reason);
6475                        Py_DECREF(repunicode);
6476                        goto onError;
6477                    }
6478                    *str = (char)c;
6479                }
6480                pos = newpos;
6481                Py_DECREF(repunicode);
6482            }
6483        }
6484    }
6485    /* Resize if we allocated to much */
6486    size = str - PyBytes_AS_STRING(res);
6487    if (size < ressize) { /* If this falls res will be NULL */
6488        assert(size >= 0);
6489        if (_PyBytes_Resize(&res, size) < 0)
6490            goto onError;
6491    }
6492
6493    Py_XDECREF(errorHandler);
6494    Py_XDECREF(exc);
6495    return res;
6496
6497  onError:
6498    Py_XDECREF(res);
6499    Py_XDECREF(errorHandler);
6500    Py_XDECREF(exc);
6501    return NULL;
6502}
6503
6504/* Deprecated */
6505PyObject *
6506PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6507                       Py_ssize_t size,
6508                       const char *errors)
6509{
6510    PyObject *result;
6511    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6512    if (unicode == NULL)
6513        return NULL;
6514    result = unicode_encode_ucs1(unicode, errors, 256);
6515    Py_DECREF(unicode);
6516    return result;
6517}
6518
6519PyObject *
6520_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6521{
6522    if (!PyUnicode_Check(unicode)) {
6523        PyErr_BadArgument();
6524        return NULL;
6525    }
6526    if (PyUnicode_READY(unicode) == -1)
6527        return NULL;
6528    /* Fast path: if it is a one-byte string, construct
6529       bytes object directly. */
6530    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6531        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6532                                         PyUnicode_GET_LENGTH(unicode));
6533    /* Non-Latin-1 characters present. Defer to above function to
6534       raise the exception. */
6535    return unicode_encode_ucs1(unicode, errors, 256);
6536}
6537
6538PyObject*
6539PyUnicode_AsLatin1String(PyObject *unicode)
6540{
6541    return _PyUnicode_AsLatin1String(unicode, NULL);
6542}
6543
6544/* --- 7-bit ASCII Codec -------------------------------------------------- */
6545
6546PyObject *
6547PyUnicode_DecodeASCII(const char *s,
6548                      Py_ssize_t size,
6549                      const char *errors)
6550{
6551    const char *starts = s;
6552    PyObject *unicode;
6553    int kind;
6554    void *data;
6555    Py_ssize_t startinpos;
6556    Py_ssize_t endinpos;
6557    Py_ssize_t outpos;
6558    const char *e;
6559    PyObject *errorHandler = NULL;
6560    PyObject *exc = NULL;
6561
6562    if (size == 0) {
6563        Py_INCREF(unicode_empty);
6564        return unicode_empty;
6565    }
6566
6567    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6568    if (size == 1 && (unsigned char)s[0] < 128)
6569        return get_latin1_char((unsigned char)s[0]);
6570
6571    unicode = PyUnicode_New(size, 127);
6572    if (unicode == NULL)
6573        goto onError;
6574
6575    e = s + size;
6576    data = PyUnicode_1BYTE_DATA(unicode);
6577    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6578    if (outpos == size)
6579        return unicode;
6580
6581    s += outpos;
6582    kind = PyUnicode_1BYTE_KIND;
6583    while (s < e) {
6584        register unsigned char c = (unsigned char)*s;
6585        if (c < 128) {
6586            PyUnicode_WRITE(kind, data, outpos++, c);
6587            ++s;
6588        }
6589        else {
6590            startinpos = s-starts;
6591            endinpos = startinpos + 1;
6592            if (unicode_decode_call_errorhandler(
6593                    errors, &errorHandler,
6594                    "ascii", "ordinal not in range(128)",
6595                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6596                    &unicode, &outpos))
6597                goto onError;
6598            kind = PyUnicode_KIND(unicode);
6599            data = PyUnicode_DATA(unicode);
6600        }
6601    }
6602    if (unicode_resize(&unicode, outpos) < 0)
6603        goto onError;
6604    Py_XDECREF(errorHandler);
6605    Py_XDECREF(exc);
6606    assert(_PyUnicode_CheckConsistency(unicode, 1));
6607    return unicode;
6608
6609  onError:
6610    Py_XDECREF(unicode);
6611    Py_XDECREF(errorHandler);
6612    Py_XDECREF(exc);
6613    return NULL;
6614}
6615
6616/* Deprecated */
6617PyObject *
6618PyUnicode_EncodeASCII(const Py_UNICODE *p,
6619                      Py_ssize_t size,
6620                      const char *errors)
6621{
6622    PyObject *result;
6623    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6624    if (unicode == NULL)
6625        return NULL;
6626    result = unicode_encode_ucs1(unicode, errors, 128);
6627    Py_DECREF(unicode);
6628    return result;
6629}
6630
6631PyObject *
6632_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6633{
6634    if (!PyUnicode_Check(unicode)) {
6635        PyErr_BadArgument();
6636        return NULL;
6637    }
6638    if (PyUnicode_READY(unicode) == -1)
6639        return NULL;
6640    /* Fast path: if it is an ASCII-only string, construct bytes object
6641       directly. Else defer to above function to raise the exception. */
6642    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6643        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6644                                         PyUnicode_GET_LENGTH(unicode));
6645    return unicode_encode_ucs1(unicode, errors, 128);
6646}
6647
6648PyObject *
6649PyUnicode_AsASCIIString(PyObject *unicode)
6650{
6651    return _PyUnicode_AsASCIIString(unicode, NULL);
6652}
6653
6654#ifdef HAVE_MBCS
6655
6656/* --- MBCS codecs for Windows -------------------------------------------- */
6657
/* When int is narrower than size_t, the code-page codecs below process
   their input in int-sized chunks (see the #ifdef NEED_RETRY sections);
   the per-chunk helpers take their lengths as int. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide
   WC_ERR_INVALID_CHARS. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6665
6666static char*
6667code_page_name(UINT code_page, PyObject **obj)
6668{
6669    *obj = NULL;
6670    if (code_page == CP_ACP)
6671        return "mbcs";
6672    if (code_page == CP_UTF7)
6673        return "CP_UTF7";
6674    if (code_page == CP_UTF8)
6675        return "CP_UTF8";
6676
6677    *obj = PyBytes_FromFormat("cp%u", code_page);
6678    if (*obj == NULL)
6679        return NULL;
6680    return PyBytes_AS_STRING(*obj);
6681}
6682
6683static int
6684is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6685{
6686    const char *curr = s + offset;
6687    const char *prev;
6688
6689    if (!IsDBCSLeadByteEx(code_page, *curr))
6690        return 0;
6691
6692    prev = CharPrevExA(code_page, s, curr, 0);
6693    if (prev == curr)
6694        return 1;
6695    /* FIXME: This code is limited to "true" double-byte encodings,
6696       as it assumes an incomplete character consists of a single
6697       byte. */
6698    if (curr - prev == 2)
6699        return 1;
6700    if (!IsDBCSLeadByteEx(code_page, *prev))
6701        return 1;
6702    return 0;
6703}
6704
6705static DWORD
6706decode_code_page_flags(UINT code_page)
6707{
6708    if (code_page == CP_UTF7) {
6709        /* The CP_UTF7 decoder only supports flags=0 */
6710        return 0;
6711    }
6712    else
6713        return MB_ERR_INVALID_CHARS;
6714}
6715
6716/*
6717 * Decode a byte string from a Windows code page into unicode object in strict
6718 * mode.
6719 *
6720 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6721 * WindowsError and returns -1 on other error.
6722 */
6723static int
6724decode_code_page_strict(UINT code_page,
6725                        PyObject **v,
6726                        const char *in,
6727                        int insize)
6728{
6729    const DWORD flags = decode_code_page_flags(code_page);
6730    wchar_t *out;
6731    DWORD outsize;
6732
6733    /* First get the size of the result */
6734    assert(insize > 0);
6735    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6736    if (outsize <= 0)
6737        goto error;
6738
6739    if (*v == NULL) {
6740        /* Create unicode object */
6741        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6742        *v = (PyObject*)_PyUnicode_New(outsize);
6743        if (*v == NULL)
6744            return -1;
6745        out = PyUnicode_AS_UNICODE(*v);
6746    }
6747    else {
6748        /* Extend unicode object */
6749        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6750        if (unicode_resize(v, n + outsize) < 0)
6751            return -1;
6752        out = PyUnicode_AS_UNICODE(*v) + n;
6753    }
6754
6755    /* Do the conversion */
6756    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6757    if (outsize <= 0)
6758        goto error;
6759    return insize;
6760
6761error:
6762    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6763        return -2;
6764    PyErr_SetFromWindowsErr(0);
6765    return -1;
6766}
6767
6768/*
6769 * Decode a byte string from a code page into unicode object with an error
6770 * handler.
6771 *
6772 * Returns consumed size if succeed, or raise a WindowsError or
6773 * UnicodeDecodeError exception and returns -1 on error.
6774 */
6775static int
6776decode_code_page_errors(UINT code_page,
6777                        PyObject **v,
6778                        const char *in, const int size,
6779                        const char *errors)
6780{
6781    const char *startin = in;
6782    const char *endin = in + size;
6783    const DWORD flags = decode_code_page_flags(code_page);
6784    /* Ideally, we should get reason from FormatMessage. This is the Windows
6785       2000 English version of the message. */
6786    const char *reason = "No mapping for the Unicode character exists "
6787                         "in the target code page.";
6788    /* each step cannot decode more than 1 character, but a character can be
6789       represented as a surrogate pair */
6790    wchar_t buffer[2], *startout, *out;
6791    int insize, outsize;
6792    PyObject *errorHandler = NULL;
6793    PyObject *exc = NULL;
6794    PyObject *encoding_obj = NULL;
6795    char *encoding;
6796    DWORD err;
6797    int ret = -1;
6798
6799    assert(size > 0);
6800
6801    encoding = code_page_name(code_page, &encoding_obj);
6802    if (encoding == NULL)
6803        return -1;
6804
6805    if (errors == NULL || strcmp(errors, "strict") == 0) {
6806        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6807           UnicodeDecodeError. */
6808        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6809        if (exc != NULL) {
6810            PyCodec_StrictErrors(exc);
6811            Py_CLEAR(exc);
6812        }
6813        goto error;
6814    }
6815
6816    if (*v == NULL) {
6817        /* Create unicode object */
6818        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6819            PyErr_NoMemory();
6820            goto error;
6821        }
6822        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6823        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6824        if (*v == NULL)
6825            goto error;
6826        startout = PyUnicode_AS_UNICODE(*v);
6827    }
6828    else {
6829        /* Extend unicode object */
6830        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6831        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6832            PyErr_NoMemory();
6833            goto error;
6834        }
6835        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6836            goto error;
6837        startout = PyUnicode_AS_UNICODE(*v) + n;
6838    }
6839
6840    /* Decode the byte string character per character */
6841    out = startout;
6842    while (in < endin)
6843    {
6844        /* Decode a character */
6845        insize = 1;
6846        do
6847        {
6848            outsize = MultiByteToWideChar(code_page, flags,
6849                                          in, insize,
6850                                          buffer, Py_ARRAY_LENGTH(buffer));
6851            if (outsize > 0)
6852                break;
6853            err = GetLastError();
6854            if (err != ERROR_NO_UNICODE_TRANSLATION
6855                && err != ERROR_INSUFFICIENT_BUFFER)
6856            {
6857                PyErr_SetFromWindowsErr(0);
6858                goto error;
6859            }
6860            insize++;
6861        }
6862        /* 4=maximum length of a UTF-8 sequence */
6863        while (insize <= 4 && (in + insize) <= endin);
6864
6865        if (outsize <= 0) {
6866            Py_ssize_t startinpos, endinpos, outpos;
6867
6868            startinpos = in - startin;
6869            endinpos = startinpos + 1;
6870            outpos = out - PyUnicode_AS_UNICODE(*v);
6871            if (unicode_decode_call_errorhandler(
6872                    errors, &errorHandler,
6873                    encoding, reason,
6874                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
6875                    v, &outpos))
6876            {
6877                goto error;
6878            }
6879            out = PyUnicode_AS_UNICODE(*v) + outpos;
6880        }
6881        else {
6882            in += insize;
6883            memcpy(out, buffer, outsize * sizeof(wchar_t));
6884            out += outsize;
6885        }
6886    }
6887
6888    /* write a NUL character at the end */
6889    *out = 0;
6890
6891    /* Extend unicode object */
6892    outsize = out - startout;
6893    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
6894    if (unicode_resize(v, outsize) < 0)
6895        goto error;
6896    ret = size;
6897
6898error:
6899    Py_XDECREF(encoding_obj);
6900    Py_XDECREF(errorHandler);
6901    Py_XDECREF(exc);
6902    return ret;
6903}
6904
6905static PyObject *
6906decode_code_page_stateful(int code_page,
6907                          const char *s, Py_ssize_t size,
6908                          const char *errors, Py_ssize_t *consumed)
6909{
6910    PyObject *v = NULL;
6911    int chunk_size, final, converted, done;
6912
6913    if (code_page < 0) {
6914        PyErr_SetString(PyExc_ValueError, "invalid code page number");
6915        return NULL;
6916    }
6917
6918    if (consumed)
6919        *consumed = 0;
6920
6921    do
6922    {
6923#ifdef NEED_RETRY
6924        if (size > INT_MAX) {
6925            chunk_size = INT_MAX;
6926            final = 0;
6927            done = 0;
6928        }
6929        else
6930#endif
6931        {
6932            chunk_size = (int)size;
6933            final = (consumed == NULL);
6934            done = 1;
6935        }
6936
6937        /* Skip trailing lead-byte unless 'final' is set */
6938        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6939            --chunk_size;
6940
6941        if (chunk_size == 0 && done) {
6942            if (v != NULL)
6943                break;
6944            Py_INCREF(unicode_empty);
6945            return unicode_empty;
6946        }
6947
6948
6949        converted = decode_code_page_strict(code_page, &v,
6950                                            s, chunk_size);
6951        if (converted == -2)
6952            converted = decode_code_page_errors(code_page, &v,
6953                                                s, chunk_size,
6954                                                errors);
6955        assert(converted != 0);
6956
6957        if (converted < 0) {
6958            Py_XDECREF(v);
6959            return NULL;
6960        }
6961
6962        if (consumed)
6963            *consumed += converted;
6964
6965        s += converted;
6966        size -= converted;
6967    } while (!done);
6968
6969    return unicode_result(v);
6970}
6971
6972PyObject *
6973PyUnicode_DecodeCodePageStateful(int code_page,
6974                                 const char *s,
6975                                 Py_ssize_t size,
6976                                 const char *errors,
6977                                 Py_ssize_t *consumed)
6978{
6979    return decode_code_page_stateful(code_page, s, size, errors, consumed);
6980}
6981
6982PyObject *
6983PyUnicode_DecodeMBCSStateful(const char *s,
6984                             Py_ssize_t size,
6985                             const char *errors,
6986                             Py_ssize_t *consumed)
6987{
6988    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6989}
6990
6991PyObject *
6992PyUnicode_DecodeMBCS(const char *s,
6993                     Py_ssize_t size,
6994                     const char *errors)
6995{
6996    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6997}
6998
6999static DWORD
7000encode_code_page_flags(UINT code_page, const char *errors)
7001{
7002    if (code_page == CP_UTF8) {
7003        if (winver.dwMajorVersion >= 6)
7004            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7005               and later */
7006            return WC_ERR_INVALID_CHARS;
7007        else
7008            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7009            return 0;
7010    }
7011    else if (code_page == CP_UTF7) {
7012        /* CP_UTF7 only supports flags=0 */
7013        return 0;
7014    }
7015    else {
7016        if (errors != NULL && strcmp(errors, "replace") == 0)
7017            return 0;
7018        else
7019            return WC_NO_BEST_FIT_CHARS;
7020    }
7021}
7022
7023/*
7024 * Encode a Unicode string to a Windows code page into a byte string in strict
7025 * mode.
7026 *
7027 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7028 * a WindowsError and returns -1 on other error.
7029 */
7030static int
7031encode_code_page_strict(UINT code_page, PyObject **outbytes,
7032                        PyObject *unicode, Py_ssize_t offset, int len,
7033                        const char* errors)
7034{
7035    BOOL usedDefaultChar = FALSE;
7036    BOOL *pusedDefaultChar = &usedDefaultChar;
7037    int outsize;
7038    PyObject *exc = NULL;
7039    wchar_t *p;
7040    Py_ssize_t size;
7041    const DWORD flags = encode_code_page_flags(code_page, NULL);
7042    char *out;
7043    /* Create a substring so that we can get the UTF-16 representation
7044       of just the slice under consideration. */
7045    PyObject *substring;
7046
7047    assert(len > 0);
7048
7049    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7050        pusedDefaultChar = &usedDefaultChar;
7051    else
7052        pusedDefaultChar = NULL;
7053
7054    substring = PyUnicode_Substring(unicode, offset, offset+len);
7055    if (substring == NULL)
7056        return -1;
7057    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7058    if (p == NULL) {
7059        Py_DECREF(substring);
7060        return -1;
7061    }
7062
7063    /* First get the size of the result */
7064    outsize = WideCharToMultiByte(code_page, flags,
7065                                  p, size,
7066                                  NULL, 0,
7067                                  NULL, pusedDefaultChar);
7068    if (outsize <= 0)
7069        goto error;
7070    /* If we used a default char, then we failed! */
7071    if (pusedDefaultChar && *pusedDefaultChar) {
7072        Py_DECREF(substring);
7073        return -2;
7074    }
7075
7076    if (*outbytes == NULL) {
7077        /* Create string object */
7078        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7079        if (*outbytes == NULL) {
7080            Py_DECREF(substring);
7081            return -1;
7082        }
7083        out = PyBytes_AS_STRING(*outbytes);
7084    }
7085    else {
7086        /* Extend string object */
7087        const Py_ssize_t n = PyBytes_Size(*outbytes);
7088        if (outsize > PY_SSIZE_T_MAX - n) {
7089            PyErr_NoMemory();
7090            Py_DECREF(substring);
7091            return -1;
7092        }
7093        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7094            Py_DECREF(substring);
7095            return -1;
7096        }
7097        out = PyBytes_AS_STRING(*outbytes) + n;
7098    }
7099
7100    /* Do the conversion */
7101    outsize = WideCharToMultiByte(code_page, flags,
7102                                  p, size,
7103                                  out, outsize,
7104                                  NULL, pusedDefaultChar);
7105    Py_CLEAR(substring);
7106    if (outsize <= 0)
7107        goto error;
7108    if (pusedDefaultChar && *pusedDefaultChar)
7109        return -2;
7110    return 0;
7111
7112error:
7113    Py_XDECREF(substring);
7114    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7115        return -2;
7116    PyErr_SetFromWindowsErr(0);
7117    return -1;
7118}
7119
/*
 * Encode a Unicode string to a Windows code page into a byte string using a
 * error handler.
 *
 * The `insize` characters (insize must be > 0) of `unicode` starting at
 * `unicode_offset` are encoded one at a time.  If *outbytes is NULL a new
 * bytes object is created, otherwise the encoded bytes are appended to the
 * existing object.  With errors == NULL or "strict" no encoding is
 * attempted: a UnicodeEncodeError is raised and -1 is returned.
 *
 * Returns 0 on success, or raise a WindowsError (or UnicodeEncodeError) and
 * returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    /* Encoding name used in exceptions; encoding_obj (if set) owns the
       returned buffer and is released on exit. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The "used default char" output is not requested for CP_UTF8 and
       CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve Py_ARRAY_LENGTH(buffer) output bytes per input character,
       checking the multiplication for overflow first. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Build the UTF-16 form of the character: one unit below 0x10000,
           a surrogate pair otherwise. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            ch -= 0x10000;
            chars[0] = 0xd800 + (ch >> 10);
            chars[1] = 0xdc00 + (ch & 0x3ff);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Success unless the conversion fell back to the default
               character, which is treated as an encoding error below. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* Unencodable character: ask the error handler for a replacement
           (bytes or str) and the position to resume from. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* Bytes replacement: copied to the output verbatim. */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the buffer by the difference from a one-byte
                   replacement; remember the write offset because the
                   resize may move the buffer. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* str replacement: every character must be ASCII and is
               written as a single byte. */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* Same buffer adjustment as for a bytes replacement. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                /* NOTE: this `ch` shadows the loop-level `ch` above. */
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* NOTE(review): `pos` was already advanced to `newpos`
                       above, so the reported range is the resume position,
                       not the character that failed -- confirm intent. */
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the over-allocated bytes object to the bytes actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Shared exit path for success and failure; `ret` tells them apart. */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7308
7309static PyObject *
7310encode_code_page(int code_page,
7311                 PyObject *unicode,
7312                 const char *errors)
7313{
7314    Py_ssize_t len;
7315    PyObject *outbytes = NULL;
7316    Py_ssize_t offset;
7317    int chunk_len, ret, done;
7318
7319    if (PyUnicode_READY(unicode) == -1)
7320        return NULL;
7321    len = PyUnicode_GET_LENGTH(unicode);
7322
7323    if (code_page < 0) {
7324        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7325        return NULL;
7326    }
7327
7328    if (len == 0)
7329        return PyBytes_FromStringAndSize(NULL, 0);
7330
7331    offset = 0;
7332    do
7333    {
7334#ifdef NEED_RETRY
7335        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7336           chunks. */
7337        if (len > INT_MAX/2) {
7338            chunk_len = INT_MAX/2;
7339            done = 0;
7340        }
7341        else
7342#endif
7343        {
7344            chunk_len = (int)len;
7345            done = 1;
7346        }
7347
7348        ret = encode_code_page_strict(code_page, &outbytes,
7349                                      unicode, offset, chunk_len,
7350                                      errors);
7351        if (ret == -2)
7352            ret = encode_code_page_errors(code_page, &outbytes,
7353                                          unicode, offset,
7354                                          chunk_len, errors);
7355        if (ret < 0) {
7356            Py_XDECREF(outbytes);
7357            return NULL;
7358        }
7359
7360        offset += chunk_len;
7361        len -= chunk_len;
7362    } while (!done);
7363
7364    return outbytes;
7365}
7366
7367PyObject *
7368PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7369                     Py_ssize_t size,
7370                     const char *errors)
7371{
7372    PyObject *unicode, *res;
7373    unicode = PyUnicode_FromUnicode(p, size);
7374    if (unicode == NULL)
7375        return NULL;
7376    res = encode_code_page(CP_ACP, unicode, errors);
7377    Py_DECREF(unicode);
7378    return res;
7379}
7380
7381PyObject *
7382PyUnicode_EncodeCodePage(int code_page,
7383                         PyObject *unicode,
7384                         const char *errors)
7385{
7386    return encode_code_page(code_page, unicode, errors);
7387}
7388
7389PyObject *
7390PyUnicode_AsMBCSString(PyObject *unicode)
7391{
7392    if (!PyUnicode_Check(unicode)) {
7393        PyErr_BadArgument();
7394        return NULL;
7395    }
7396    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7397}
7398
7399#undef NEED_RETRY
7400
7401#endif /* HAVE_MBCS */
7402
7403/* --- Character Mapping Codec -------------------------------------------- */
7404
7405PyObject *
7406PyUnicode_DecodeCharmap(const char *s,
7407                        Py_ssize_t size,
7408                        PyObject *mapping,
7409                        const char *errors)
7410{
7411    const char *starts = s;
7412    Py_ssize_t startinpos;
7413    Py_ssize_t endinpos;
7414    Py_ssize_t outpos;
7415    const char *e;
7416    PyObject *v;
7417    Py_ssize_t extrachars = 0;
7418    PyObject *errorHandler = NULL;
7419    PyObject *exc = NULL;
7420
7421    /* Default to Latin-1 */
7422    if (mapping == NULL)
7423        return PyUnicode_DecodeLatin1(s, size, errors);
7424
7425    v = PyUnicode_New(size, 127);
7426    if (v == NULL)
7427        goto onError;
7428    if (size == 0)
7429        return v;
7430    outpos = 0;
7431    e = s + size;
7432    if (PyUnicode_CheckExact(mapping)) {
7433        Py_ssize_t maplen;
7434        enum PyUnicode_Kind kind;
7435        void *data;
7436        Py_UCS4 x;
7437
7438        if (PyUnicode_READY(mapping) == -1)
7439            return NULL;
7440
7441        maplen = PyUnicode_GET_LENGTH(mapping);
7442        data = PyUnicode_DATA(mapping);
7443        kind = PyUnicode_KIND(mapping);
7444        while (s < e) {
7445            unsigned char ch = *s;
7446
7447            if (ch < maplen)
7448                x = PyUnicode_READ(kind, data, ch);
7449            else
7450                x = 0xfffe; /* invalid value */
7451
7452            if (x == 0xfffe)
7453            {
7454                /* undefined mapping */
7455                startinpos = s-starts;
7456                endinpos = startinpos+1;
7457                if (unicode_decode_call_errorhandler(
7458                        errors, &errorHandler,
7459                        "charmap", "character maps to <undefined>",
7460                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7461                        &v, &outpos)) {
7462                    goto onError;
7463                }
7464                continue;
7465            }
7466
7467            if (unicode_putchar(&v, &outpos, x) < 0)
7468                goto onError;
7469            ++s;
7470        }
7471    }
7472    else {
7473        while (s < e) {
7474            unsigned char ch = *s;
7475            PyObject *w, *x;
7476
7477            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7478            w = PyLong_FromLong((long)ch);
7479            if (w == NULL)
7480                goto onError;
7481            x = PyObject_GetItem(mapping, w);
7482            Py_DECREF(w);
7483            if (x == NULL) {
7484                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7485                    /* No mapping found means: mapping is undefined. */
7486                    PyErr_Clear();
7487                    x = Py_None;
7488                    Py_INCREF(x);
7489                } else
7490                    goto onError;
7491            }
7492
7493            /* Apply mapping */
7494            if (PyLong_Check(x)) {
7495                long value = PyLong_AS_LONG(x);
7496                if (value < 0 || value > 65535) {
7497                    PyErr_SetString(PyExc_TypeError,
7498                                    "character mapping must be in range(65536)");
7499                    Py_DECREF(x);
7500                    goto onError;
7501                }
7502                if (unicode_putchar(&v, &outpos, value) < 0)
7503                    goto onError;
7504            }
7505            else if (x == Py_None) {
7506                /* undefined mapping */
7507                startinpos = s-starts;
7508                endinpos = startinpos+1;
7509                if (unicode_decode_call_errorhandler(
7510                        errors, &errorHandler,
7511                        "charmap", "character maps to <undefined>",
7512                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7513                        &v, &outpos)) {
7514                    Py_DECREF(x);
7515                    goto onError;
7516                }
7517                Py_DECREF(x);
7518                continue;
7519            }
7520            else if (PyUnicode_Check(x)) {
7521                Py_ssize_t targetsize;
7522
7523                if (PyUnicode_READY(x) == -1)
7524                    goto onError;
7525                targetsize = PyUnicode_GET_LENGTH(x);
7526
7527                if (targetsize == 1) {
7528                    /* 1-1 mapping */
7529                    if (unicode_putchar(&v, &outpos,
7530                                        PyUnicode_READ_CHAR(x, 0)) < 0)
7531                        goto onError;
7532                }
7533                else if (targetsize > 1) {
7534                    /* 1-n mapping */
7535                    if (targetsize > extrachars) {
7536                        /* resize first */
7537                        Py_ssize_t needed = (targetsize - extrachars) + \
7538                            (targetsize << 2);
7539                        extrachars += needed;
7540                        /* XXX overflow detection missing */
7541                        if (unicode_resize(&v,
7542                                           PyUnicode_GET_LENGTH(v) + needed) < 0)
7543                        {
7544                            Py_DECREF(x);
7545                            goto onError;
7546                        }
7547                    }
7548                    if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7549                        goto onError;
7550                    PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7551                    outpos += targetsize;
7552                    extrachars -= targetsize;
7553                }
7554                /* 1-0 mapping: skip the character */
7555            }
7556            else {
7557                /* wrong return value */
7558                PyErr_SetString(PyExc_TypeError,
7559                                "character mapping must return integer, None or str");
7560                Py_DECREF(x);
7561                goto onError;
7562            }
7563            Py_DECREF(x);
7564            ++s;
7565        }
7566    }
7567    if (unicode_resize(&v, outpos) < 0)
7568        goto onError;
7569    Py_XDECREF(errorHandler);
7570    Py_XDECREF(exc);
7571    return unicode_result(v);
7572
7573  onError:
7574    Py_XDECREF(errorHandler);
7575    Py_XDECREF(exc);
7576    Py_XDECREF(v);
7577    return NULL;
7578}
7579
7580/* Charmap encoding: the lookup table */
7581
7582struct encoding_map {
7583    PyObject_HEAD
7584    unsigned char level1[32];
7585    int count2, count3;
7586    unsigned char level23[1];
7587};
7588
7589static PyObject*
7590encoding_map_size(PyObject *obj, PyObject* args)
7591{
7592    struct encoding_map *map = (struct encoding_map*)obj;
7593    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7594                           128*map->count3);
7595}
7596
7597static PyMethodDef encoding_map_methods[] = {
7598    {"size", encoding_map_size, METH_NOARGS,
7599     PyDoc_STR("Return the size (in bytes) of this object") },
7600    { 0 }
7601};
7602
7603static void
7604encoding_map_dealloc(PyObject* o)
7605{
7606    PyObject_FREE(o);
7607}
7608
7609static PyTypeObject EncodingMapType = {
7610    PyVarObject_HEAD_INIT(NULL, 0)
7611    "EncodingMap",          /*tp_name*/
7612    sizeof(struct encoding_map),   /*tp_basicsize*/
7613    0,                      /*tp_itemsize*/
7614    /* methods */
7615    encoding_map_dealloc,   /*tp_dealloc*/
7616    0,                      /*tp_print*/
7617    0,                      /*tp_getattr*/
7618    0,                      /*tp_setattr*/
7619    0,                      /*tp_reserved*/
7620    0,                      /*tp_repr*/
7621    0,                      /*tp_as_number*/
7622    0,                      /*tp_as_sequence*/
7623    0,                      /*tp_as_mapping*/
7624    0,                      /*tp_hash*/
7625    0,                      /*tp_call*/
7626    0,                      /*tp_str*/
7627    0,                      /*tp_getattro*/
7628    0,                      /*tp_setattro*/
7629    0,                      /*tp_as_buffer*/
7630    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7631    0,                      /*tp_doc*/
7632    0,                      /*tp_traverse*/
7633    0,                      /*tp_clear*/
7634    0,                      /*tp_richcompare*/
7635    0,                      /*tp_weaklistoffset*/
7636    0,                      /*tp_iter*/
7637    0,                      /*tp_iternext*/
7638    encoding_map_methods,   /*tp_methods*/
7639    0,                      /*tp_members*/
7640    0,                      /*tp_getset*/
7641    0,                      /*tp_base*/
7642    0,                      /*tp_dict*/
7643    0,                      /*tp_descr_get*/
7644    0,                      /*tp_descr_set*/
7645    0,                      /*tp_dictoffset*/
7646    0,                      /*tp_init*/
7647    0,                      /*tp_alloc*/
7648    0,                      /*tp_new*/
7649    0,                      /*tp_free*/
7650    0,                      /*tp_is_gc*/
7651};
7652
7653PyObject*
7654PyUnicode_BuildEncodingMap(PyObject* string)
7655{
7656    PyObject *result;
7657    struct encoding_map *mresult;
7658    int i;
7659    int need_dict = 0;
7660    unsigned char level1[32];
7661    unsigned char level2[512];
7662    unsigned char *mlevel1, *mlevel2, *mlevel3;
7663    int count2 = 0, count3 = 0;
7664    int kind;
7665    void *data;
7666    Py_UCS4 ch;
7667
7668    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7669        PyErr_BadArgument();
7670        return NULL;
7671    }
7672    kind = PyUnicode_KIND(string);
7673    data = PyUnicode_DATA(string);
7674    memset(level1, 0xFF, sizeof level1);
7675    memset(level2, 0xFF, sizeof level2);
7676
7677    /* If there isn't a one-to-one mapping of NULL to \0,
7678       or if there are non-BMP characters, we need to use
7679       a mapping dictionary. */
7680    if (PyUnicode_READ(kind, data, 0) != 0)
7681        need_dict = 1;
7682    for (i = 1; i < 256; i++) {
7683        int l1, l2;
7684        ch = PyUnicode_READ(kind, data, i);
7685        if (ch == 0 || ch > 0xFFFF) {
7686            need_dict = 1;
7687            break;
7688        }
7689        if (ch == 0xFFFE)
7690            /* unmapped character */
7691            continue;
7692        l1 = ch >> 11;
7693        l2 = ch >> 7;
7694        if (level1[l1] == 0xFF)
7695            level1[l1] = count2++;
7696        if (level2[l2] == 0xFF)
7697            level2[l2] = count3++;
7698    }
7699
7700    if (count2 >= 0xFF || count3 >= 0xFF)
7701        need_dict = 1;
7702
7703    if (need_dict) {
7704        PyObject *result = PyDict_New();
7705        PyObject *key, *value;
7706        if (!result)
7707            return NULL;
7708        for (i = 0; i < 256; i++) {
7709            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7710            value = PyLong_FromLong(i);
7711            if (!key || !value)
7712                goto failed1;
7713            if (PyDict_SetItem(result, key, value) == -1)
7714                goto failed1;
7715            Py_DECREF(key);
7716            Py_DECREF(value);
7717        }
7718        return result;
7719      failed1:
7720        Py_XDECREF(key);
7721        Py_XDECREF(value);
7722        Py_DECREF(result);
7723        return NULL;
7724    }
7725
7726    /* Create a three-level trie */
7727    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7728                             16*count2 + 128*count3 - 1);
7729    if (!result)
7730        return PyErr_NoMemory();
7731    PyObject_Init(result, &EncodingMapType);
7732    mresult = (struct encoding_map*)result;
7733    mresult->count2 = count2;
7734    mresult->count3 = count3;
7735    mlevel1 = mresult->level1;
7736    mlevel2 = mresult->level23;
7737    mlevel3 = mresult->level23 + 16*count2;
7738    memcpy(mlevel1, level1, 32);
7739    memset(mlevel2, 0xFF, 16*count2);
7740    memset(mlevel3, 0, 128*count3);
7741    count3 = 0;
7742    for (i = 1; i < 256; i++) {
7743        int o1, o2, o3, i2, i3;
7744        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7745            /* unmapped character */
7746            continue;
7747        o1 = PyUnicode_READ(kind, data, i)>>11;
7748        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7749        i2 = 16*mlevel1[o1] + o2;
7750        if (mlevel2[i2] == 0xFF)
7751            mlevel2[i2] = count3++;
7752        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7753        i3 = 128*mlevel2[i2] + o3;
7754        mlevel3[i3] = i;
7755    }
7756    return result;
7757}
7758
7759static int
7760encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7761{
7762    struct encoding_map *map = (struct encoding_map*)mapping;
7763    int l1 = c>>11;
7764    int l2 = (c>>7) & 0xF;
7765    int l3 = c & 0x7F;
7766    int i;
7767
7768    if (c > 0xFFFF)
7769        return -1;
7770    if (c == 0)
7771        return 0;
7772    /* level 1*/
7773    i = map->level1[l1];
7774    if (i == 0xFF) {
7775        return -1;
7776    }
7777    /* level 2*/
7778    i = map->level23[16*i+l2];
7779    if (i == 0xFF) {
7780        return -1;
7781    }
7782    /* level 3 */
7783    i = map->level23[16*map->count2 + 128*i + l3];
7784    if (i == 0) {
7785        return -1;
7786    }
7787    return i;
7788}
7789
7790/* Lookup the character ch in the mapping. If the character
7791   can't be found, Py_None is returned (or NULL, if another
7792   error occurred). */
7793static PyObject *
7794charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7795{
7796    PyObject *w = PyLong_FromLong((long)c);
7797    PyObject *x;
7798
7799    if (w == NULL)
7800        return NULL;
7801    x = PyObject_GetItem(mapping, w);
7802    Py_DECREF(w);
7803    if (x == NULL) {
7804        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7805            /* No mapping found means: mapping is undefined. */
7806            PyErr_Clear();
7807            x = Py_None;
7808            Py_INCREF(x);
7809            return x;
7810        } else
7811            return NULL;
7812    }
7813    else if (x == Py_None)
7814        return x;
7815    else if (PyLong_Check(x)) {
7816        long value = PyLong_AS_LONG(x);
7817        if (value < 0 || value > 255) {
7818            PyErr_SetString(PyExc_TypeError,
7819                            "character mapping must be in range(256)");
7820            Py_DECREF(x);
7821            return NULL;
7822        }
7823        return x;
7824    }
7825    else if (PyBytes_Check(x))
7826        return x;
7827    else {
7828        /* wrong return value */
7829        PyErr_Format(PyExc_TypeError,
7830                     "character mapping must return integer, bytes or None, not %.400s",
7831                     x->ob_type->tp_name);
7832        Py_DECREF(x);
7833        return NULL;
7834    }
7835}
7836
7837static int
7838charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7839{
7840    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7841    /* exponentially overallocate to minimize reallocations */
7842    if (requiredsize < 2*outsize)
7843        requiredsize = 2*outsize;
7844    if (_PyBytes_Resize(outobj, requiredsize))
7845        return -1;
7846    return 0;
7847}
7848
/* Outcome of trying to encode one character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
7852/* lookup the character, put the result in the output string and adjust
7853   various state variables. Resize the output bytes object if not enough
7854   space is available. Return a new reference to the object that
7855   was put in the output buffer, or Py_None, if the mapping was undefined
7856   (in which case no character was written) or NULL, if a
7857   reallocation error occurred. The caller must decref the result */
7858static charmapencode_result
7859charmapencode_output(Py_UCS4 c, PyObject *mapping,
7860                     PyObject **outobj, Py_ssize_t *outpos)
7861{
7862    PyObject *rep;
7863    char *outstart;
7864    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7865
7866    if (Py_TYPE(mapping) == &EncodingMapType) {
7867        int res = encoding_map_lookup(c, mapping);
7868        Py_ssize_t requiredsize = *outpos+1;
7869        if (res == -1)
7870            return enc_FAILED;
7871        if (outsize<requiredsize)
7872            if (charmapencode_resize(outobj, outpos, requiredsize))
7873                return enc_EXCEPTION;
7874        outstart = PyBytes_AS_STRING(*outobj);
7875        outstart[(*outpos)++] = (char)res;
7876        return enc_SUCCESS;
7877    }
7878
7879    rep = charmapencode_lookup(c, mapping);
7880    if (rep==NULL)
7881        return enc_EXCEPTION;
7882    else if (rep==Py_None) {
7883        Py_DECREF(rep);
7884        return enc_FAILED;
7885    } else {
7886        if (PyLong_Check(rep)) {
7887            Py_ssize_t requiredsize = *outpos+1;
7888            if (outsize<requiredsize)
7889                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7890                    Py_DECREF(rep);
7891                    return enc_EXCEPTION;
7892                }
7893            outstart = PyBytes_AS_STRING(*outobj);
7894            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7895        }
7896        else {
7897            const char *repchars = PyBytes_AS_STRING(rep);
7898            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7899            Py_ssize_t requiredsize = *outpos+repsize;
7900            if (outsize<requiredsize)
7901                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7902                    Py_DECREF(rep);
7903                    return enc_EXCEPTION;
7904                }
7905            outstart = PyBytes_AS_STRING(*outobj);
7906            memcpy(outstart + *outpos, repchars, repsize);
7907            *outpos += repsize;
7908        }
7909    }
7910    Py_DECREF(rep);
7911    return enc_SUCCESS;
7912}
7913
7914/* handle an error in PyUnicode_EncodeCharmap
7915   Return 0 on success, -1 on error */
7916static int
7917charmap_encoding_error(
7918    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
7919    PyObject **exceptionObject,
7920    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7921    PyObject **res, Py_ssize_t *respos)
7922{
7923    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7924    Py_ssize_t size, repsize;
7925    Py_ssize_t newpos;
7926    enum PyUnicode_Kind kind;
7927    void *data;
7928    Py_ssize_t index;
7929    /* startpos for collecting unencodable chars */
7930    Py_ssize_t collstartpos = *inpos;
7931    Py_ssize_t collendpos = *inpos+1;
7932    Py_ssize_t collpos;
7933    char *encoding = "charmap";
7934    char *reason = "character maps to <undefined>";
7935    charmapencode_result x;
7936    Py_UCS4 ch;
7937    int val;
7938
7939    if (PyUnicode_READY(unicode) == -1)
7940        return -1;
7941    size = PyUnicode_GET_LENGTH(unicode);
7942    /* find all unencodable characters */
7943    while (collendpos < size) {
7944        PyObject *rep;
7945        if (Py_TYPE(mapping) == &EncodingMapType) {
7946            ch = PyUnicode_READ_CHAR(unicode, collendpos);
7947            val = encoding_map_lookup(ch, mapping);
7948            if (val != -1)
7949                break;
7950            ++collendpos;
7951            continue;
7952        }
7953
7954        ch = PyUnicode_READ_CHAR(unicode, collendpos);
7955        rep = charmapencode_lookup(ch, mapping);
7956        if (rep==NULL)
7957            return -1;
7958        else if (rep!=Py_None) {
7959            Py_DECREF(rep);
7960            break;
7961        }
7962        Py_DECREF(rep);
7963        ++collendpos;
7964    }
7965    /* cache callback name lookup
7966     * (if not done yet, i.e. it's the first error) */
7967    if (*known_errorHandler==-1) {
7968        if ((errors==NULL) || (!strcmp(errors, "strict")))
7969            *known_errorHandler = 1;
7970        else if (!strcmp(errors, "replace"))
7971            *known_errorHandler = 2;
7972        else if (!strcmp(errors, "ignore"))
7973            *known_errorHandler = 3;
7974        else if (!strcmp(errors, "xmlcharrefreplace"))
7975            *known_errorHandler = 4;
7976        else
7977            *known_errorHandler = 0;
7978    }
7979    switch (*known_errorHandler) {
7980    case 1: /* strict */
7981        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
7982        return -1;
7983    case 2: /* replace */
7984        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
7985            x = charmapencode_output('?', mapping, res, respos);
7986            if (x==enc_EXCEPTION) {
7987                return -1;
7988            }
7989            else if (x==enc_FAILED) {
7990                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
7991                return -1;
7992            }
7993        }
7994        /* fall through */
7995    case 3: /* ignore */
7996        *inpos = collendpos;
7997        break;
7998    case 4: /* xmlcharrefreplace */
7999        /* generate replacement (temporarily (mis)uses p) */
8000        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8001            char buffer[2+29+1+1];
8002            char *cp;
8003            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8004            for (cp = buffer; *cp; ++cp) {
8005                x = charmapencode_output(*cp, mapping, res, respos);
8006                if (x==enc_EXCEPTION)
8007                    return -1;
8008                else if (x==enc_FAILED) {
8009                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8010                    return -1;
8011                }
8012            }
8013        }
8014        *inpos = collendpos;
8015        break;
8016    default:
8017        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
8018                                                      encoding, reason, unicode, exceptionObject,
8019                                                      collstartpos, collendpos, &newpos);
8020        if (repunicode == NULL)
8021            return -1;
8022        if (PyBytes_Check(repunicode)) {
8023            /* Directly copy bytes result to output. */
8024            Py_ssize_t outsize = PyBytes_Size(*res);
8025            Py_ssize_t requiredsize;
8026            repsize = PyBytes_Size(repunicode);
8027            requiredsize = *respos + repsize;
8028            if (requiredsize > outsize)
8029                /* Make room for all additional bytes. */
8030                if (charmapencode_resize(res, respos, requiredsize)) {
8031                    Py_DECREF(repunicode);
8032                    return -1;
8033                }
8034            memcpy(PyBytes_AsString(*res) + *respos,
8035                   PyBytes_AsString(repunicode),  repsize);
8036            *respos += repsize;
8037            *inpos = newpos;
8038            Py_DECREF(repunicode);
8039            break;
8040        }
8041        /* generate replacement  */
8042        if (PyUnicode_READY(repunicode) == -1) {
8043            Py_DECREF(repunicode);
8044            return -1;
8045        }
8046        repsize = PyUnicode_GET_LENGTH(repunicode);
8047        data = PyUnicode_DATA(repunicode);
8048        kind = PyUnicode_KIND(repunicode);
8049        for (index = 0; index < repsize; index++) {
8050            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8051            x = charmapencode_output(repch, mapping, res, respos);
8052            if (x==enc_EXCEPTION) {
8053                Py_DECREF(repunicode);
8054                return -1;
8055            }
8056            else if (x==enc_FAILED) {
8057                Py_DECREF(repunicode);
8058                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8059                return -1;
8060            }
8061        }
8062        *inpos = newpos;
8063        Py_DECREF(repunicode);
8064    }
8065    return 0;
8066}
8067
8068PyObject *
8069_PyUnicode_EncodeCharmap(PyObject *unicode,
8070                         PyObject *mapping,
8071                         const char *errors)
8072{
8073    /* output object */
8074    PyObject *res = NULL;
8075    /* current input position */
8076    Py_ssize_t inpos = 0;
8077    Py_ssize_t size;
8078    /* current output position */
8079    Py_ssize_t respos = 0;
8080    PyObject *errorHandler = NULL;
8081    PyObject *exc = NULL;
8082    /* the following variable is used for caching string comparisons
8083     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8084     * 3=ignore, 4=xmlcharrefreplace */
8085    int known_errorHandler = -1;
8086
8087    if (PyUnicode_READY(unicode) == -1)
8088        return NULL;
8089    size = PyUnicode_GET_LENGTH(unicode);
8090
8091    /* Default to Latin-1 */
8092    if (mapping == NULL)
8093        return unicode_encode_ucs1(unicode, errors, 256);
8094
8095    /* allocate enough for a simple encoding without
8096       replacements, if we need more, we'll resize */
8097    res = PyBytes_FromStringAndSize(NULL, size);
8098    if (res == NULL)
8099        goto onError;
8100    if (size == 0)
8101        return res;
8102
8103    while (inpos<size) {
8104        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
8105        /* try to encode it */
8106        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8107        if (x==enc_EXCEPTION) /* error */
8108            goto onError;
8109        if (x==enc_FAILED) { /* unencodable character */
8110            if (charmap_encoding_error(unicode, &inpos, mapping,
8111                                       &exc,
8112                                       &known_errorHandler, &errorHandler, errors,
8113                                       &res, &respos)) {
8114                goto onError;
8115            }
8116        }
8117        else
8118            /* done with this character => adjust input position */
8119            ++inpos;
8120    }
8121
8122    /* Resize if we allocated to much */
8123    if (respos<PyBytes_GET_SIZE(res))
8124        if (_PyBytes_Resize(&res, respos) < 0)
8125            goto onError;
8126
8127    Py_XDECREF(exc);
8128    Py_XDECREF(errorHandler);
8129    return res;
8130
8131  onError:
8132    Py_XDECREF(res);
8133    Py_XDECREF(exc);
8134    Py_XDECREF(errorHandler);
8135    return NULL;
8136}
8137
8138/* Deprecated */
8139PyObject *
8140PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8141                        Py_ssize_t size,
8142                        PyObject *mapping,
8143                        const char *errors)
8144{
8145    PyObject *result;
8146    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8147    if (unicode == NULL)
8148        return NULL;
8149    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8150    Py_DECREF(unicode);
8151    return result;
8152}
8153
8154PyObject *
8155PyUnicode_AsCharmapString(PyObject *unicode,
8156                          PyObject *mapping)
8157{
8158    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8159        PyErr_BadArgument();
8160        return NULL;
8161    }
8162    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8163}
8164
8165/* create or adjust a UnicodeTranslateError */
8166static void
8167make_translate_exception(PyObject **exceptionObject,
8168                         PyObject *unicode,
8169                         Py_ssize_t startpos, Py_ssize_t endpos,
8170                         const char *reason)
8171{
8172    if (*exceptionObject == NULL) {
8173        *exceptionObject = _PyUnicodeTranslateError_Create(
8174            unicode, startpos, endpos, reason);
8175    }
8176    else {
8177        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8178            goto onError;
8179        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8180            goto onError;
8181        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8182            goto onError;
8183        return;
8184      onError:
8185        Py_DECREF(*exceptionObject);
8186        *exceptionObject = NULL;
8187    }
8188}
8189
8190/* raises a UnicodeTranslateError */
8191static void
8192raise_translate_exception(PyObject **exceptionObject,
8193                          PyObject *unicode,
8194                          Py_ssize_t startpos, Py_ssize_t endpos,
8195                          const char *reason)
8196{
8197    make_translate_exception(exceptionObject,
8198                             unicode, startpos, endpos, reason);
8199    if (*exceptionObject != NULL)
8200        PyCodec_StrictErrors(*exceptionObject);
8201}
8202
8203/* error handling callback helper:
8204   build arguments, call the callback and check the arguments,
8205   put the result into newpos and return the replacement string, which
8206   has to be freed by the caller */
8207static PyObject *
8208unicode_translate_call_errorhandler(const char *errors,
8209                                    PyObject **errorHandler,
8210                                    const char *reason,
8211                                    PyObject *unicode, PyObject **exceptionObject,
8212                                    Py_ssize_t startpos, Py_ssize_t endpos,
8213                                    Py_ssize_t *newpos)
8214{
8215    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8216
8217    Py_ssize_t i_newpos;
8218    PyObject *restuple;
8219    PyObject *resunicode;
8220
8221    if (*errorHandler == NULL) {
8222        *errorHandler = PyCodec_LookupError(errors);
8223        if (*errorHandler == NULL)
8224            return NULL;
8225    }
8226
8227    make_translate_exception(exceptionObject,
8228                             unicode, startpos, endpos, reason);
8229    if (*exceptionObject == NULL)
8230        return NULL;
8231
8232    restuple = PyObject_CallFunctionObjArgs(
8233        *errorHandler, *exceptionObject, NULL);
8234    if (restuple == NULL)
8235        return NULL;
8236    if (!PyTuple_Check(restuple)) {
8237        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8238        Py_DECREF(restuple);
8239        return NULL;
8240    }
8241    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8242                          &resunicode, &i_newpos)) {
8243        Py_DECREF(restuple);
8244        return NULL;
8245    }
8246    if (i_newpos<0)
8247        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8248    else
8249        *newpos = i_newpos;
8250    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8251        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8252        Py_DECREF(restuple);
8253        return NULL;
8254    }
8255    Py_INCREF(resunicode);
8256    Py_DECREF(restuple);
8257    return resunicode;
8258}
8259
8260/* Lookup the character ch in the mapping and put the result in result,
8261   which must be decrefed by the caller.
8262   Return 0 on success, -1 on error */
8263static int
8264charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8265{
8266    PyObject *w = PyLong_FromLong((long)c);
8267    PyObject *x;
8268
8269    if (w == NULL)
8270        return -1;
8271    x = PyObject_GetItem(mapping, w);
8272    Py_DECREF(w);
8273    if (x == NULL) {
8274        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8275            /* No mapping found means: use 1:1 mapping. */
8276            PyErr_Clear();
8277            *result = NULL;
8278            return 0;
8279        } else
8280            return -1;
8281    }
8282    else if (x == Py_None) {
8283        *result = x;
8284        return 0;
8285    }
8286    else if (PyLong_Check(x)) {
8287        long value = PyLong_AS_LONG(x);
8288        long max = PyUnicode_GetMax();
8289        if (value < 0 || value > max) {
8290            PyErr_Format(PyExc_TypeError,
8291                         "character mapping must be in range(0x%x)", max+1);
8292            Py_DECREF(x);
8293            return -1;
8294        }
8295        *result = x;
8296        return 0;
8297    }
8298    else if (PyUnicode_Check(x)) {
8299        *result = x;
8300        return 0;
8301    }
8302    else {
8303        /* wrong return value */
8304        PyErr_SetString(PyExc_TypeError,
8305                        "character mapping must return integer, None or str");
8306        Py_DECREF(x);
8307        return -1;
8308    }
8309}
8310/* ensure that *outobj is at least requiredsize characters long,
8311   if not reallocate and adjust various state variables.
8312   Return 0 on success, -1 on error */
8313static int
8314charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8315                               Py_ssize_t requiredsize)
8316{
8317    Py_ssize_t oldsize = *psize;
8318    Py_UCS4 *new_outobj;
8319    if (requiredsize > oldsize) {
8320        /* exponentially overallocate to minimize reallocations */
8321        if (requiredsize < 2 * oldsize)
8322            requiredsize = 2 * oldsize;
8323        new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8324        if (new_outobj == 0)
8325            return -1;
8326        *outobj = new_outobj;
8327        *psize = requiredsize;
8328    }
8329    return 0;
8330}
8331/* lookup the character, put the result in the output string and adjust
8332   various state variables. Return a new reference to the object that
8333   was put in the output buffer in *result, or Py_None, if the mapping was
8334   undefined (in which case no character was written).
8335   The called must decref result.
8336   Return 0 on success, -1 on error. */
8337static int
8338charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8339                        PyObject *mapping, Py_UCS4 **output,
8340                        Py_ssize_t *osize, Py_ssize_t *opos,
8341                        PyObject **res)
8342{
8343    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8344    if (charmaptranslate_lookup(curinp, mapping, res))
8345        return -1;
8346    if (*res==NULL) {
8347        /* not found => default to 1:1 mapping */
8348        (*output)[(*opos)++] = curinp;
8349    }
8350    else if (*res==Py_None)
8351        ;
8352    else if (PyLong_Check(*res)) {
8353        /* no overflow check, because we know that the space is enough */
8354        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8355    }
8356    else if (PyUnicode_Check(*res)) {
8357        Py_ssize_t repsize;
8358        if (PyUnicode_READY(*res) == -1)
8359            return -1;
8360        repsize = PyUnicode_GET_LENGTH(*res);
8361        if (repsize==1) {
8362            /* no overflow check, because we know that the space is enough */
8363            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8364        }
8365        else if (repsize!=0) {
8366            /* more than one character */
8367            Py_ssize_t requiredsize = *opos +
8368                (PyUnicode_GET_LENGTH(input) - ipos) +
8369                repsize - 1;
8370            Py_ssize_t i;
8371            if (charmaptranslate_makespace(output, osize, requiredsize))
8372                return -1;
8373            for(i = 0; i < repsize; i++)
8374                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8375        }
8376    }
8377    else
8378        return -1;
8379    return 0;
8380}
8381
8382PyObject *
8383_PyUnicode_TranslateCharmap(PyObject *input,
8384                            PyObject *mapping,
8385                            const char *errors)
8386{
8387    /* input object */
8388    char *idata;
8389    Py_ssize_t size, i;
8390    int kind;
8391    /* output buffer */
8392    Py_UCS4 *output = NULL;
8393    Py_ssize_t osize;
8394    PyObject *res;
8395    /* current output position */
8396    Py_ssize_t opos;
8397    char *reason = "character maps to <undefined>";
8398    PyObject *errorHandler = NULL;
8399    PyObject *exc = NULL;
8400    /* the following variable is used for caching string comparisons
8401     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8402     * 3=ignore, 4=xmlcharrefreplace */
8403    int known_errorHandler = -1;
8404
8405    if (mapping == NULL) {
8406        PyErr_BadArgument();
8407        return NULL;
8408    }
8409
8410    if (PyUnicode_READY(input) == -1)
8411        return NULL;
8412    idata = (char*)PyUnicode_DATA(input);
8413    kind = PyUnicode_KIND(input);
8414    size = PyUnicode_GET_LENGTH(input);
8415    i = 0;
8416
8417    if (size == 0) {
8418        Py_INCREF(input);
8419        return input;
8420    }
8421
8422    /* allocate enough for a simple 1:1 translation without
8423       replacements, if we need more, we'll resize */
8424    osize = size;
8425    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8426    opos = 0;
8427    if (output == NULL) {
8428        PyErr_NoMemory();
8429        goto onError;
8430    }
8431
8432    while (i<size) {
8433        /* try to encode it */
8434        PyObject *x = NULL;
8435        if (charmaptranslate_output(input, i, mapping,
8436                                    &output, &osize, &opos, &x)) {
8437            Py_XDECREF(x);
8438            goto onError;
8439        }
8440        Py_XDECREF(x);
8441        if (x!=Py_None) /* it worked => adjust input pointer */
8442            ++i;
8443        else { /* untranslatable character */
8444            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8445            Py_ssize_t repsize;
8446            Py_ssize_t newpos;
8447            Py_ssize_t uni2;
8448            /* startpos for collecting untranslatable chars */
8449            Py_ssize_t collstart = i;
8450            Py_ssize_t collend = i+1;
8451            Py_ssize_t coll;
8452
8453            /* find all untranslatable characters */
8454            while (collend < size) {
8455                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8456                    goto onError;
8457                Py_XDECREF(x);
8458                if (x!=Py_None)
8459                    break;
8460                ++collend;
8461            }
8462            /* cache callback name lookup
8463             * (if not done yet, i.e. it's the first error) */
8464            if (known_errorHandler==-1) {
8465                if ((errors==NULL) || (!strcmp(errors, "strict")))
8466                    known_errorHandler = 1;
8467                else if (!strcmp(errors, "replace"))
8468                    known_errorHandler = 2;
8469                else if (!strcmp(errors, "ignore"))
8470                    known_errorHandler = 3;
8471                else if (!strcmp(errors, "xmlcharrefreplace"))
8472                    known_errorHandler = 4;
8473                else
8474                    known_errorHandler = 0;
8475            }
8476            switch (known_errorHandler) {
8477            case 1: /* strict */
8478                raise_translate_exception(&exc, input, collstart,
8479                                          collend, reason);
8480                goto onError;
8481            case 2: /* replace */
8482                /* No need to check for space, this is a 1:1 replacement */
8483                for (coll = collstart; coll<collend; coll++)
8484                    output[opos++] = '?';
8485                /* fall through */
8486            case 3: /* ignore */
8487                i = collend;
8488                break;
8489            case 4: /* xmlcharrefreplace */
8490                /* generate replacement (temporarily (mis)uses i) */
8491                for (i = collstart; i < collend; ++i) {
8492                    char buffer[2+29+1+1];
8493                    char *cp;
8494                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8495                    if (charmaptranslate_makespace(&output, &osize,
8496                                                   opos+strlen(buffer)+(size-collend)))
8497                        goto onError;
8498                    for (cp = buffer; *cp; ++cp)
8499                        output[opos++] = *cp;
8500                }
8501                i = collend;
8502                break;
8503            default:
8504                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8505                                                                 reason, input, &exc,
8506                                                                 collstart, collend, &newpos);
8507                if (repunicode == NULL)
8508                    goto onError;
8509                if (PyUnicode_READY(repunicode) == -1) {
8510                    Py_DECREF(repunicode);
8511                    goto onError;
8512                }
8513                /* generate replacement  */
8514                repsize = PyUnicode_GET_LENGTH(repunicode);
8515                if (charmaptranslate_makespace(&output, &osize,
8516                                               opos+repsize+(size-collend))) {
8517                    Py_DECREF(repunicode);
8518                    goto onError;
8519                }
8520                for (uni2 = 0; repsize-->0; ++uni2)
8521                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8522                i = newpos;
8523                Py_DECREF(repunicode);
8524            }
8525        }
8526    }
8527    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8528    if (!res)
8529        goto onError;
8530    PyMem_Free(output);
8531    Py_XDECREF(exc);
8532    Py_XDECREF(errorHandler);
8533    return res;
8534
8535  onError:
8536    PyMem_Free(output);
8537    Py_XDECREF(exc);
8538    Py_XDECREF(errorHandler);
8539    return NULL;
8540}
8541
8542/* Deprecated. Use PyUnicode_Translate instead. */
8543PyObject *
8544PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8545                           Py_ssize_t size,
8546                           PyObject *mapping,
8547                           const char *errors)
8548{
8549    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8550    if (!unicode)
8551        return NULL;
8552    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8553}
8554
8555PyObject *
8556PyUnicode_Translate(PyObject *str,
8557                    PyObject *mapping,
8558                    const char *errors)
8559{
8560    PyObject *result;
8561
8562    str = PyUnicode_FromObject(str);
8563    if (str == NULL)
8564        goto onError;
8565    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8566    Py_DECREF(str);
8567    return result;
8568
8569  onError:
8570    Py_XDECREF(str);
8571    return NULL;
8572}
8573
8574static Py_UCS4
8575fix_decimal_and_space_to_ascii(PyObject *self)
8576{
8577    /* No need to call PyUnicode_READY(self) because this function is only
8578       called as a callback from fixup() which does it already. */
8579    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8580    const int kind = PyUnicode_KIND(self);
8581    void *data = PyUnicode_DATA(self);
8582    Py_UCS4 maxchar = 127, ch, fixed;
8583    int modified = 0;
8584    Py_ssize_t i;
8585
8586    for (i = 0; i < len; ++i) {
8587        ch = PyUnicode_READ(kind, data, i);
8588        fixed = 0;
8589        if (ch > 127) {
8590            if (Py_UNICODE_ISSPACE(ch))
8591                fixed = ' ';
8592            else {
8593                const int decimal = Py_UNICODE_TODECIMAL(ch);
8594                if (decimal >= 0)
8595                    fixed = '0' + decimal;
8596            }
8597            if (fixed != 0) {
8598                modified = 1;
8599                maxchar = MAX_MAXCHAR(maxchar, fixed);
8600                PyUnicode_WRITE(kind, data, i, fixed);
8601            }
8602            else
8603                maxchar = MAX_MAXCHAR(maxchar, ch);
8604        }
8605    }
8606
8607    return (modified) ? maxchar : 0;
8608}
8609
8610PyObject *
8611_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8612{
8613    if (!PyUnicode_Check(unicode)) {
8614        PyErr_BadInternalCall();
8615        return NULL;
8616    }
8617    if (PyUnicode_READY(unicode) == -1)
8618        return NULL;
8619    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8620        /* If the string is already ASCII, just return the same string */
8621        Py_INCREF(unicode);
8622        return unicode;
8623    }
8624    return fixup(unicode, fix_decimal_and_space_to_ascii);
8625}
8626
8627PyObject *
8628PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8629                                  Py_ssize_t length)
8630{
8631    PyObject *decimal;
8632    Py_ssize_t i;
8633    Py_UCS4 maxchar;
8634    enum PyUnicode_Kind kind;
8635    void *data;
8636
8637    maxchar = 127;
8638    for (i = 0; i < length; i++) {
8639        Py_UNICODE ch = s[i];
8640        if (ch > 127) {
8641            int decimal = Py_UNICODE_TODECIMAL(ch);
8642            if (decimal >= 0)
8643                ch = '0' + decimal;
8644            maxchar = MAX_MAXCHAR(maxchar, ch);
8645        }
8646    }
8647
8648    /* Copy to a new string */
8649    decimal = PyUnicode_New(length, maxchar);
8650    if (decimal == NULL)
8651        return decimal;
8652    kind = PyUnicode_KIND(decimal);
8653    data = PyUnicode_DATA(decimal);
8654    /* Iterate over code points */
8655    for (i = 0; i < length; i++) {
8656        Py_UNICODE ch = s[i];
8657        if (ch > 127) {
8658            int decimal = Py_UNICODE_TODECIMAL(ch);
8659            if (decimal >= 0)
8660                ch = '0' + decimal;
8661        }
8662        PyUnicode_WRITE(kind, data, i, ch);
8663    }
8664    return unicode_result(decimal);
8665}
8666/* --- Decimal Encoder ---------------------------------------------------- */
8667
8668int
8669PyUnicode_EncodeDecimal(Py_UNICODE *s,
8670                        Py_ssize_t length,
8671                        char *output,
8672                        const char *errors)
8673{
8674    PyObject *unicode;
8675    Py_ssize_t i;
8676    enum PyUnicode_Kind kind;
8677    void *data;
8678
8679    if (output == NULL) {
8680        PyErr_BadArgument();
8681        return -1;
8682    }
8683
8684    unicode = PyUnicode_FromUnicode(s, length);
8685    if (unicode == NULL)
8686        return -1;
8687
8688    if (PyUnicode_READY(unicode) == -1) {
8689        Py_DECREF(unicode);
8690        return -1;
8691    }
8692    kind = PyUnicode_KIND(unicode);
8693    data = PyUnicode_DATA(unicode);
8694
8695    for (i=0; i < length; ) {
8696        PyObject *exc;
8697        Py_UCS4 ch;
8698        int decimal;
8699        Py_ssize_t startpos;
8700
8701        ch = PyUnicode_READ(kind, data, i);
8702
8703        if (Py_UNICODE_ISSPACE(ch)) {
8704            *output++ = ' ';
8705            i++;
8706            continue;
8707        }
8708        decimal = Py_UNICODE_TODECIMAL(ch);
8709        if (decimal >= 0) {
8710            *output++ = '0' + decimal;
8711            i++;
8712            continue;
8713        }
8714        if (0 < ch && ch < 256) {
8715            *output++ = (char)ch;
8716            i++;
8717            continue;
8718        }
8719
8720        startpos = i;
8721        exc = NULL;
8722        raise_encode_exception(&exc, "decimal", unicode,
8723                               startpos, startpos+1,
8724                               "invalid decimal Unicode string");
8725        Py_XDECREF(exc);
8726        Py_DECREF(unicode);
8727        return -1;
8728    }
8729    /* 0-terminate the output string */
8730    *output++ = '\0';
8731    Py_DECREF(unicode);
8732    return 0;
8733}
8734
8735/* --- Helpers ------------------------------------------------------------ */
8736
8737static Py_ssize_t
8738any_find_slice(int direction, PyObject* s1, PyObject* s2,
8739               Py_ssize_t start,
8740               Py_ssize_t end)
8741{
8742    int kind1, kind2, kind;
8743    void *buf1, *buf2;
8744    Py_ssize_t len1, len2, result;
8745
8746    kind1 = PyUnicode_KIND(s1);
8747    kind2 = PyUnicode_KIND(s2);
8748    kind = kind1 > kind2 ? kind1 : kind2;
8749    buf1 = PyUnicode_DATA(s1);
8750    buf2 = PyUnicode_DATA(s2);
8751    if (kind1 != kind)
8752        buf1 = _PyUnicode_AsKind(s1, kind);
8753    if (!buf1)
8754        return -2;
8755    if (kind2 != kind)
8756        buf2 = _PyUnicode_AsKind(s2, kind);
8757    if (!buf2) {
8758        if (kind1 != kind) PyMem_Free(buf1);
8759        return -2;
8760    }
8761    len1 = PyUnicode_GET_LENGTH(s1);
8762    len2 = PyUnicode_GET_LENGTH(s2);
8763
8764    if (direction > 0) {
8765        switch (kind) {
8766        case PyUnicode_1BYTE_KIND:
8767            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8768                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8769            else
8770                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8771            break;
8772        case PyUnicode_2BYTE_KIND:
8773            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8774            break;
8775        case PyUnicode_4BYTE_KIND:
8776            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8777            break;
8778        default:
8779            assert(0); result = -2;
8780        }
8781    }
8782    else {
8783        switch (kind) {
8784        case PyUnicode_1BYTE_KIND:
8785            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8786                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8787            else
8788                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8789            break;
8790        case PyUnicode_2BYTE_KIND:
8791            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8792            break;
8793        case PyUnicode_4BYTE_KIND:
8794            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8795            break;
8796        default:
8797            assert(0); result = -2;
8798        }
8799    }
8800
8801    if (kind1 != kind)
8802        PyMem_Free(buf1);
8803    if (kind2 != kind)
8804        PyMem_Free(buf2);
8805
8806    return result;
8807}
8808
/* Kind-dispatching wrapper around the stringlib InsertThousandsGrouping
   implementations.

   unicode/index select the destination: when unicode is not NULL the
   destination starts index characters into its data; when unicode is NULL
   a NULL destination of 1-byte kind is passed down and *maxchar is filled
   in afterwards.  n_buffer, digits, n_digits, min_width and grouping are
   forwarded unchanged; thousands_sep supplies the separator characters.

   Return the value of the stringlib call, or -1 if a kind conversion
   buffer could not be obtained. */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    PyObject *unicode, Py_ssize_t index,
    Py_ssize_t n_buffer,
    void *digits, Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping, PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    unsigned int kind, thousands_sep_kind;
    char *data, *thousands_sep_data;
    Py_ssize_t thousands_sep_len;
    Py_ssize_t len;

    if (unicode != NULL) {
        kind = PyUnicode_KIND(unicode);
        /* the kind value doubles as the character size in bytes here */
        data = (char *) PyUnicode_DATA(unicode) + index * kind;
    }
    else {
        kind = PyUnicode_1BYTE_KIND;
        data = NULL;
    }
    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    thousands_sep_data = PyUnicode_DATA(thousands_sep);
    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind) {
            /* widen the separator to the destination's kind */
            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
            if (!thousands_sep_data)
                return -1;
        }
        else {
            /* NOTE(review): this replaces data with a temporary copy of the
               whole string in the separator's kind, but kind itself is not
               updated, the index offset is not re-applied, and the copy is
               freed below without being written back.  This looks
               unintended - confirm against callers. */
            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
            if (!data)
                return -1;
        }
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
            len = asciilib_InsertThousandsGrouping(
                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        else
            len = ucs1lib_InsertThousandsGrouping(
                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_2BYTE_KIND:
        len = ucs2lib_InsertThousandsGrouping(
            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        len = ucs4lib_InsertThousandsGrouping(
            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
        break;
    default:
        assert(0);
        return -1;
    }
    /* release whichever temporary conversion buffer was created above */
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind)
            PyMem_Free(thousands_sep_data);
        else
            PyMem_Free(data);
    }
    if (unicode == NULL) {
        /* No destination was given: report the maximum character instead.
           It starts at 127 and takes the separator's maximum character into
           account only when the returned length differs from n_digits. */
        *maxchar = 127;
        if (len != n_digits) {
            *maxchar = MAX_MAXCHAR(*maxchar,
                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
        }
    }
    return len;
}
8891
8892
/* Helper macro to fix up start/end slice values against a sequence of
   length len.  start and end must be modifiable lvalues and are updated
   in place: end is clipped to len, negative values are taken relative to
   len and then clipped to 0.  A start greater than len is left unchanged.
   The arguments are evaluated more than once, so they must be free of
   side effects.  The body is wrapped in do { } while (0) so that the macro
   behaves as a single statement (e.g. under an unbraced if). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8907
8908Py_ssize_t
8909PyUnicode_Count(PyObject *str,
8910                PyObject *substr,
8911                Py_ssize_t start,
8912                Py_ssize_t end)
8913{
8914    Py_ssize_t result;
8915    PyObject* str_obj;
8916    PyObject* sub_obj;
8917    int kind1, kind2, kind;
8918    void *buf1 = NULL, *buf2 = NULL;
8919    Py_ssize_t len1, len2;
8920
8921    str_obj = PyUnicode_FromObject(str);
8922    if (!str_obj)
8923        return -1;
8924    sub_obj = PyUnicode_FromObject(substr);
8925    if (!sub_obj) {
8926        Py_DECREF(str_obj);
8927        return -1;
8928    }
8929    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
8930        Py_DECREF(sub_obj);
8931        Py_DECREF(str_obj);
8932        return -1;
8933    }
8934
8935    kind1 = PyUnicode_KIND(str_obj);
8936    kind2 = PyUnicode_KIND(sub_obj);
8937    kind = kind1;
8938    buf1 = PyUnicode_DATA(str_obj);
8939    buf2 = PyUnicode_DATA(sub_obj);
8940    if (kind2 != kind) {
8941        if (kind2 > kind) {
8942            Py_DECREF(sub_obj);
8943            Py_DECREF(str_obj);
8944            return 0;
8945        }
8946        buf2 = _PyUnicode_AsKind(sub_obj, kind);
8947    }
8948    if (!buf2)
8949        goto onError;
8950    len1 = PyUnicode_GET_LENGTH(str_obj);
8951    len2 = PyUnicode_GET_LENGTH(sub_obj);
8952
8953    ADJUST_INDICES(start, end, len1);
8954    switch (kind) {
8955    case PyUnicode_1BYTE_KIND:
8956        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8957            result = asciilib_count(
8958                ((Py_UCS1*)buf1) + start, end - start,
8959                buf2, len2, PY_SSIZE_T_MAX
8960                );
8961        else
8962            result = ucs1lib_count(
8963                ((Py_UCS1*)buf1) + start, end - start,
8964                buf2, len2, PY_SSIZE_T_MAX
8965                );
8966        break;
8967    case PyUnicode_2BYTE_KIND:
8968        result = ucs2lib_count(
8969            ((Py_UCS2*)buf1) + start, end - start,
8970            buf2, len2, PY_SSIZE_T_MAX
8971            );
8972        break;
8973    case PyUnicode_4BYTE_KIND:
8974        result = ucs4lib_count(
8975            ((Py_UCS4*)buf1) + start, end - start,
8976            buf2, len2, PY_SSIZE_T_MAX
8977            );
8978        break;
8979    default:
8980        assert(0); result = 0;
8981    }
8982
8983    Py_DECREF(sub_obj);
8984    Py_DECREF(str_obj);
8985
8986    if (kind2 != kind)
8987        PyMem_Free(buf2);
8988
8989    return result;
8990  onError:
8991    Py_DECREF(sub_obj);
8992    Py_DECREF(str_obj);
8993    if (kind2 != kind && buf2)
8994        PyMem_Free(buf2);
8995    return -1;
8996}
8997
8998Py_ssize_t
8999PyUnicode_Find(PyObject *str,
9000               PyObject *sub,
9001               Py_ssize_t start,
9002               Py_ssize_t end,
9003               int direction)
9004{
9005    Py_ssize_t result;
9006
9007    str = PyUnicode_FromObject(str);
9008    if (!str)
9009        return -2;
9010    sub = PyUnicode_FromObject(sub);
9011    if (!sub) {
9012        Py_DECREF(str);
9013        return -2;
9014    }
9015    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9016        Py_DECREF(sub);
9017        Py_DECREF(str);
9018        return -2;
9019    }
9020
9021    result = any_find_slice(direction,
9022        str, sub, start, end
9023        );
9024
9025    Py_DECREF(str);
9026    Py_DECREF(sub);
9027
9028    return result;
9029}
9030
9031Py_ssize_t
9032PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9033                   Py_ssize_t start, Py_ssize_t end,
9034                   int direction)
9035{
9036    int kind;
9037    Py_ssize_t result;
9038    if (PyUnicode_READY(str) == -1)
9039        return -2;
9040    if (start < 0 || end < 0) {
9041        PyErr_SetString(PyExc_IndexError, "string index out of range");
9042        return -2;
9043    }
9044    if (end > PyUnicode_GET_LENGTH(str))
9045        end = PyUnicode_GET_LENGTH(str);
9046    kind = PyUnicode_KIND(str);
9047    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9048                      kind, end-start, ch, direction);
9049    if (result == -1)
9050        return -1;
9051    else
9052        return start + result;
9053}
9054
9055static int
9056tailmatch(PyObject *self,
9057          PyObject *substring,
9058          Py_ssize_t start,
9059          Py_ssize_t end,
9060          int direction)
9061{
9062    int kind_self;
9063    int kind_sub;
9064    void *data_self;
9065    void *data_sub;
9066    Py_ssize_t offset;
9067    Py_ssize_t i;
9068    Py_ssize_t end_sub;
9069
9070    if (PyUnicode_READY(self) == -1 ||
9071        PyUnicode_READY(substring) == -1)
9072        return 0;
9073
9074    if (PyUnicode_GET_LENGTH(substring) == 0)
9075        return 1;
9076
9077    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9078    end -= PyUnicode_GET_LENGTH(substring);
9079    if (end < start)
9080        return 0;
9081
9082    kind_self = PyUnicode_KIND(self);
9083    data_self = PyUnicode_DATA(self);
9084    kind_sub = PyUnicode_KIND(substring);
9085    data_sub = PyUnicode_DATA(substring);
9086    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9087
9088    if (direction > 0)
9089        offset = end;
9090    else
9091        offset = start;
9092
9093    if (PyUnicode_READ(kind_self, data_self, offset) ==
9094        PyUnicode_READ(kind_sub, data_sub, 0) &&
9095        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9096        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9097        /* If both are of the same kind, memcmp is sufficient */
9098        if (kind_self == kind_sub) {
9099            return ! memcmp((char *)data_self +
9100                                (offset * PyUnicode_KIND(substring)),
9101                            data_sub,
9102                            PyUnicode_GET_LENGTH(substring) *
9103                                PyUnicode_KIND(substring));
9104        }
9105        /* otherwise we have to compare each character by first accesing it */
9106        else {
9107            /* We do not need to compare 0 and len(substring)-1 because
9108               the if statement above ensured already that they are equal
9109               when we end up here. */
9110            // TODO: honor direction and do a forward or backwards search
9111            for (i = 1; i < end_sub; ++i) {
9112                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9113                    PyUnicode_READ(kind_sub, data_sub, i))
9114                    return 0;
9115            }
9116            return 1;
9117        }
9118    }
9119
9120    return 0;
9121}
9122
9123Py_ssize_t
9124PyUnicode_Tailmatch(PyObject *str,
9125                    PyObject *substr,
9126                    Py_ssize_t start,
9127                    Py_ssize_t end,
9128                    int direction)
9129{
9130    Py_ssize_t result;
9131
9132    str = PyUnicode_FromObject(str);
9133    if (str == NULL)
9134        return -1;
9135    substr = PyUnicode_FromObject(substr);
9136    if (substr == NULL) {
9137        Py_DECREF(str);
9138        return -1;
9139    }
9140
9141    result = tailmatch(str, substr,
9142                       start, end, direction);
9143    Py_DECREF(str);
9144    Py_DECREF(substr);
9145    return result;
9146}
9147
9148/* Apply fixfct filter to the Unicode object self and return a
9149   reference to the modified object */
9150
9151static PyObject *
9152fixup(PyObject *self,
9153      Py_UCS4 (*fixfct)(PyObject *s))
9154{
9155    PyObject *u;
9156    Py_UCS4 maxchar_old, maxchar_new = 0;
9157    PyObject *v;
9158
9159    u = _PyUnicode_Copy(self);
9160    if (u == NULL)
9161        return NULL;
9162    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9163
9164    /* fix functions return the new maximum character in a string,
9165       if the kind of the resulting unicode object does not change,
9166       everything is fine.  Otherwise we need to change the string kind
9167       and re-run the fix function. */
9168    maxchar_new = fixfct(u);
9169
9170    if (maxchar_new == 0) {
9171        /* no changes */;
9172        if (PyUnicode_CheckExact(self)) {
9173            Py_DECREF(u);
9174            Py_INCREF(self);
9175            return self;
9176        }
9177        else
9178            return u;
9179    }
9180
9181    maxchar_new = align_maxchar(maxchar_new);
9182
9183    if (maxchar_new == maxchar_old)
9184        return u;
9185
9186    /* In case the maximum character changed, we need to
9187       convert the string to the new category. */
9188    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9189    if (v == NULL) {
9190        Py_DECREF(u);
9191        return NULL;
9192    }
9193    if (maxchar_new > maxchar_old) {
9194        /* If the maxchar increased so that the kind changed, not all
9195           characters are representable anymore and we need to fix the
9196           string again. This only happens in very few cases. */
9197        _PyUnicode_FastCopyCharacters(v, 0,
9198                                      self, 0, PyUnicode_GET_LENGTH(self));
9199        maxchar_old = fixfct(v);
9200        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9201    }
9202    else {
9203        _PyUnicode_FastCopyCharacters(v, 0,
9204                                      u, 0, PyUnicode_GET_LENGTH(self));
9205    }
9206    Py_DECREF(u);
9207    assert(_PyUnicode_CheckConsistency(v, 1));
9208    return v;
9209}
9210
9211static PyObject *
9212ascii_upper_or_lower(PyObject *self, int lower)
9213{
9214    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9215    char *resdata, *data = PyUnicode_DATA(self);
9216    PyObject *res;
9217
9218    res = PyUnicode_New(len, 127);
9219    if (res == NULL)
9220        return NULL;
9221    resdata = PyUnicode_DATA(res);
9222    if (lower)
9223        _Py_bytes_lower(resdata, data, len);
9224    else
9225        _Py_bytes_upper(resdata, data, len);
9226    return res;
9227}
9228
9229static Py_UCS4
9230handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9231{
9232    Py_ssize_t j;
9233    int final_sigma;
9234    Py_UCS4 c;
9235    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9236
9237     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9238
9239    where ! is a negation and \p{xxx} is a character with property xxx.
9240    */
9241    for (j = i - 1; j >= 0; j--) {
9242        c = PyUnicode_READ(kind, data, j);
9243        if (!_PyUnicode_IsCaseIgnorable(c))
9244            break;
9245    }
9246    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9247    if (final_sigma) {
9248        for (j = i + 1; j < length; j++) {
9249            c = PyUnicode_READ(kind, data, j);
9250            if (!_PyUnicode_IsCaseIgnorable(c))
9251                break;
9252        }
9253        final_sigma = j == length || !_PyUnicode_IsCased(c);
9254    }
9255    return (final_sigma) ? 0x3C2 : 0x3C3;
9256}
9257
9258static int
9259lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9260           Py_UCS4 c, Py_UCS4 *mapped)
9261{
9262    /* Obscure special case. */
9263    if (c == 0x3A3) {
9264        mapped[0] = handle_capital_sigma(kind, data, length, i);
9265        return 1;
9266    }
9267    return _PyUnicode_ToLowerFull(c, mapped);
9268}
9269
9270static Py_ssize_t
9271do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9272{
9273    Py_ssize_t i, k = 0;
9274    int n_res, j;
9275    Py_UCS4 c, mapped[3];
9276
9277    c = PyUnicode_READ(kind, data, 0);
9278    n_res = _PyUnicode_ToUpperFull(c, mapped);
9279    for (j = 0; j < n_res; j++) {
9280        *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9281        res[k++] = mapped[j];
9282    }
9283    for (i = 1; i < length; i++) {
9284        c = PyUnicode_READ(kind, data, i);
9285        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9286        for (j = 0; j < n_res; j++) {
9287            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9288            res[k++] = mapped[j];
9289        }
9290    }
9291    return k;
9292}
9293
9294static Py_ssize_t
9295do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9296    Py_ssize_t i, k = 0;
9297
9298    for (i = 0; i < length; i++) {
9299        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9300        int n_res, j;
9301        if (Py_UNICODE_ISUPPER(c)) {
9302            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9303        }
9304        else if (Py_UNICODE_ISLOWER(c)) {
9305            n_res = _PyUnicode_ToUpperFull(c, mapped);
9306        }
9307        else {
9308            n_res = 1;
9309            mapped[0] = c;
9310        }
9311        for (j = 0; j < n_res; j++) {
9312            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9313            res[k++] = mapped[j];
9314        }
9315    }
9316    return k;
9317}
9318
9319static Py_ssize_t
9320do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9321                  Py_UCS4 *maxchar, int lower)
9322{
9323    Py_ssize_t i, k = 0;
9324
9325    for (i = 0; i < length; i++) {
9326        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9327        int n_res, j;
9328        if (lower)
9329            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9330        else
9331            n_res = _PyUnicode_ToUpperFull(c, mapped);
9332        for (j = 0; j < n_res; j++) {
9333            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9334            res[k++] = mapped[j];
9335        }
9336    }
9337    return k;
9338}
9339
9340static Py_ssize_t
9341do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9342{
9343    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9344}
9345
9346static Py_ssize_t
9347do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9348{
9349    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9350}
9351
9352static Py_ssize_t
9353do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9354{
9355    Py_ssize_t i, k = 0;
9356
9357    for (i = 0; i < length; i++) {
9358        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9359        Py_UCS4 mapped[3];
9360        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9361        for (j = 0; j < n_res; j++) {
9362            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9363            res[k++] = mapped[j];
9364        }
9365    }
9366    return k;
9367}
9368
9369static Py_ssize_t
9370do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9371{
9372    Py_ssize_t i, k = 0;
9373    int previous_is_cased;
9374
9375    previous_is_cased = 0;
9376    for (i = 0; i < length; i++) {
9377        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9378        Py_UCS4 mapped[3];
9379        int n_res, j;
9380
9381        if (previous_is_cased)
9382            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9383        else
9384            n_res = _PyUnicode_ToTitleFull(c, mapped);
9385
9386        for (j = 0; j < n_res; j++) {
9387            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9388            res[k++] = mapped[j];
9389        }
9390
9391        previous_is_cased = _PyUnicode_IsCased(c);
9392    }
9393    return k;
9394}
9395
9396static PyObject *
9397case_operation(PyObject *self,
9398               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9399{
9400    PyObject *res = NULL;
9401    Py_ssize_t length, newlength = 0;
9402    int kind, outkind;
9403    void *data, *outdata;
9404    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9405
9406    assert(PyUnicode_IS_READY(self));
9407
9408    kind = PyUnicode_KIND(self);
9409    data = PyUnicode_DATA(self);
9410    length = PyUnicode_GET_LENGTH(self);
9411    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9412    if (tmp == NULL)
9413        return PyErr_NoMemory();
9414    newlength = perform(kind, data, length, tmp, &maxchar);
9415    res = PyUnicode_New(newlength, maxchar);
9416    if (res == NULL)
9417        goto leave;
9418    tmpend = tmp + newlength;
9419    outdata = PyUnicode_DATA(res);
9420    outkind = PyUnicode_KIND(res);
9421    switch (outkind) {
9422    case PyUnicode_1BYTE_KIND:
9423        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9424        break;
9425    case PyUnicode_2BYTE_KIND:
9426        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9427        break;
9428    case PyUnicode_4BYTE_KIND:
9429        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9430        break;
9431    default:
9432        assert(0);
9433        break;
9434    }
9435  leave:
9436    PyMem_FREE(tmp);
9437    return res;
9438}
9439
9440PyObject *
9441PyUnicode_Join(PyObject *separator, PyObject *seq)
9442{
9443    PyObject *sep = NULL;
9444    Py_ssize_t seplen;
9445    PyObject *res = NULL; /* the result */
9446    PyObject *fseq;          /* PySequence_Fast(seq) */
9447    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9448    PyObject **items;
9449    PyObject *item;
9450    Py_ssize_t sz, i, res_offset;
9451    Py_UCS4 maxchar;
9452    Py_UCS4 item_maxchar;
9453    int use_memcpy;
9454    unsigned char *res_data = NULL, *sep_data = NULL;
9455    PyObject *last_obj;
9456    unsigned int kind = 0;
9457
9458    fseq = PySequence_Fast(seq, "");
9459    if (fseq == NULL) {
9460        return NULL;
9461    }
9462
9463    /* NOTE: the following code can't call back into Python code,
9464     * so we are sure that fseq won't be mutated.
9465     */
9466
9467    seqlen = PySequence_Fast_GET_SIZE(fseq);
9468    /* If empty sequence, return u"". */
9469    if (seqlen == 0) {
9470        Py_DECREF(fseq);
9471        Py_INCREF(unicode_empty);
9472        res = unicode_empty;
9473        return res;
9474    }
9475
9476    /* If singleton sequence with an exact Unicode, return that. */
9477    last_obj = NULL;
9478    items = PySequence_Fast_ITEMS(fseq);
9479    if (seqlen == 1) {
9480        if (PyUnicode_CheckExact(items[0])) {
9481            res = items[0];
9482            Py_INCREF(res);
9483            Py_DECREF(fseq);
9484            return res;
9485        }
9486        seplen = 0;
9487        maxchar = 0;
9488    }
9489    else {
9490        /* Set up sep and seplen */
9491        if (separator == NULL) {
9492            /* fall back to a blank space separator */
9493            sep = PyUnicode_FromOrdinal(' ');
9494            if (!sep)
9495                goto onError;
9496            seplen = 1;
9497            maxchar = 32;
9498        }
9499        else {
9500            if (!PyUnicode_Check(separator)) {
9501                PyErr_Format(PyExc_TypeError,
9502                             "separator: expected str instance,"
9503                             " %.80s found",
9504                             Py_TYPE(separator)->tp_name);
9505                goto onError;
9506            }
9507            if (PyUnicode_READY(separator))
9508                goto onError;
9509            sep = separator;
9510            seplen = PyUnicode_GET_LENGTH(separator);
9511            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9512            /* inc refcount to keep this code path symmetric with the
9513               above case of a blank separator */
9514            Py_INCREF(sep);
9515        }
9516        last_obj = sep;
9517    }
9518
9519    /* There are at least two things to join, or else we have a subclass
9520     * of str in the sequence.
9521     * Do a pre-pass to figure out the total amount of space we'll
9522     * need (sz), and see whether all argument are strings.
9523     */
9524    sz = 0;
9525#ifdef Py_DEBUG
9526    use_memcpy = 0;
9527#else
9528    use_memcpy = 1;
9529#endif
9530    for (i = 0; i < seqlen; i++) {
9531        const Py_ssize_t old_sz = sz;
9532        item = items[i];
9533        if (!PyUnicode_Check(item)) {
9534            PyErr_Format(PyExc_TypeError,
9535                         "sequence item %zd: expected str instance,"
9536                         " %.80s found",
9537                         i, Py_TYPE(item)->tp_name);
9538            goto onError;
9539        }
9540        if (PyUnicode_READY(item) == -1)
9541            goto onError;
9542        sz += PyUnicode_GET_LENGTH(item);
9543        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9544        maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
9545        if (i != 0)
9546            sz += seplen;
9547        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9548            PyErr_SetString(PyExc_OverflowError,
9549                            "join() result is too long for a Python string");
9550            goto onError;
9551        }
9552        if (use_memcpy && last_obj != NULL) {
9553            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9554                use_memcpy = 0;
9555        }
9556        last_obj = item;
9557    }
9558
9559    res = PyUnicode_New(sz, maxchar);
9560    if (res == NULL)
9561        goto onError;
9562
9563    /* Catenate everything. */
9564#ifdef Py_DEBUG
9565    use_memcpy = 0;
9566#else
9567    if (use_memcpy) {
9568        res_data = PyUnicode_1BYTE_DATA(res);
9569        kind = PyUnicode_KIND(res);
9570        if (seplen != 0)
9571            sep_data = PyUnicode_1BYTE_DATA(sep);
9572    }
9573#endif
9574    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9575        Py_ssize_t itemlen;
9576        item = items[i];
9577        /* Copy item, and maybe the separator. */
9578        if (i && seplen != 0) {
9579            if (use_memcpy) {
9580                Py_MEMCPY(res_data,
9581                          sep_data,
9582                          kind * seplen);
9583                res_data += kind * seplen;
9584            }
9585            else {
9586                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9587                res_offset += seplen;
9588            }
9589        }
9590        itemlen = PyUnicode_GET_LENGTH(item);
9591        if (itemlen != 0) {
9592            if (use_memcpy) {
9593                Py_MEMCPY(res_data,
9594                          PyUnicode_DATA(item),
9595                          kind * itemlen);
9596                res_data += kind * itemlen;
9597            }
9598            else {
9599                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9600                res_offset += itemlen;
9601            }
9602        }
9603    }
9604    if (use_memcpy)
9605        assert(res_data == PyUnicode_1BYTE_DATA(res)
9606                           + kind * PyUnicode_GET_LENGTH(res));
9607    else
9608        assert(res_offset == PyUnicode_GET_LENGTH(res));
9609
9610    Py_DECREF(fseq);
9611    Py_XDECREF(sep);
9612    assert(_PyUnicode_CheckConsistency(res, 1));
9613    return res;
9614
9615  onError:
9616    Py_DECREF(fseq);
9617    Py_XDECREF(sep);
9618    Py_XDECREF(res);
9619    return NULL;
9620}
9621
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character index
   `start`.  `kind` selects the character width (1, 2 or 4 bytes) and must
   not be PyUnicode_WCHAR_KIND.

   Every argument is parenthesized in the expansion, so expressions such as
   `a ? b : c` may be passed safely.  Note that `kind`, `value` and `length`
   are evaluated more than once; pass expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9645
9646void
9647_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9648                    Py_UCS4 fill_char)
9649{
9650    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9651    const void *data = PyUnicode_DATA(unicode);
9652    assert(PyUnicode_IS_READY(unicode));
9653    assert(unicode_modifiable(unicode));
9654    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9655    assert(start >= 0);
9656    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9657    FILL(kind, data, fill_char, start, length);
9658}
9659
9660Py_ssize_t
9661PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9662               Py_UCS4 fill_char)
9663{
9664    Py_ssize_t maxlen;
9665
9666    if (!PyUnicode_Check(unicode)) {
9667        PyErr_BadInternalCall();
9668        return -1;
9669    }
9670    if (PyUnicode_READY(unicode) == -1)
9671        return -1;
9672    if (unicode_check_modifiable(unicode))
9673        return -1;
9674
9675    if (start < 0) {
9676        PyErr_SetString(PyExc_IndexError, "string index out of range");
9677        return -1;
9678    }
9679    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9680        PyErr_SetString(PyExc_ValueError,
9681                         "fill character is bigger than "
9682                         "the string maximum character");
9683        return -1;
9684    }
9685
9686    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9687    length = Py_MIN(maxlen, length);
9688    if (length <= 0)
9689        return 0;
9690
9691    _PyUnicode_FastFill(unicode, start, length, fill_char);
9692    return length;
9693}
9694
9695static PyObject *
9696pad(PyObject *self,
9697    Py_ssize_t left,
9698    Py_ssize_t right,
9699    Py_UCS4 fill)
9700{
9701    PyObject *u;
9702    Py_UCS4 maxchar;
9703    int kind;
9704    void *data;
9705
9706    if (left < 0)
9707        left = 0;
9708    if (right < 0)
9709        right = 0;
9710
9711    if (left == 0 && right == 0)
9712        return unicode_result_unchanged(self);
9713
9714    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9715        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9716        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9717        return NULL;
9718    }
9719    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9720    maxchar = MAX_MAXCHAR(maxchar, fill);
9721    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9722    if (!u)
9723        return NULL;
9724
9725    kind = PyUnicode_KIND(u);
9726    data = PyUnicode_DATA(u);
9727    if (left)
9728        FILL(kind, data, fill, 0, left);
9729    if (right)
9730        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9731    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9732    assert(_PyUnicode_CheckConsistency(u, 1));
9733    return u;
9734}
9735
9736PyObject *
9737PyUnicode_Splitlines(PyObject *string, int keepends)
9738{
9739    PyObject *list;
9740
9741    string = PyUnicode_FromObject(string);
9742    if (string == NULL)
9743        return NULL;
9744    if (PyUnicode_READY(string) == -1) {
9745        Py_DECREF(string);
9746        return NULL;
9747    }
9748
9749    switch (PyUnicode_KIND(string)) {
9750    case PyUnicode_1BYTE_KIND:
9751        if (PyUnicode_IS_ASCII(string))
9752            list = asciilib_splitlines(
9753                string, PyUnicode_1BYTE_DATA(string),
9754                PyUnicode_GET_LENGTH(string), keepends);
9755        else
9756            list = ucs1lib_splitlines(
9757                string, PyUnicode_1BYTE_DATA(string),
9758                PyUnicode_GET_LENGTH(string), keepends);
9759        break;
9760    case PyUnicode_2BYTE_KIND:
9761        list = ucs2lib_splitlines(
9762            string, PyUnicode_2BYTE_DATA(string),
9763            PyUnicode_GET_LENGTH(string), keepends);
9764        break;
9765    case PyUnicode_4BYTE_KIND:
9766        list = ucs4lib_splitlines(
9767            string, PyUnicode_4BYTE_DATA(string),
9768            PyUnicode_GET_LENGTH(string), keepends);
9769        break;
9770    default:
9771        assert(0);
9772        list = 0;
9773    }
9774    Py_DECREF(string);
9775    return list;
9776}
9777
9778static PyObject *
9779split(PyObject *self,
9780      PyObject *substring,
9781      Py_ssize_t maxcount)
9782{
9783    int kind1, kind2, kind;
9784    void *buf1, *buf2;
9785    Py_ssize_t len1, len2;
9786    PyObject* out;
9787
9788    if (maxcount < 0)
9789        maxcount = PY_SSIZE_T_MAX;
9790
9791    if (PyUnicode_READY(self) == -1)
9792        return NULL;
9793
9794    if (substring == NULL)
9795        switch (PyUnicode_KIND(self)) {
9796        case PyUnicode_1BYTE_KIND:
9797            if (PyUnicode_IS_ASCII(self))
9798                return asciilib_split_whitespace(
9799                    self,  PyUnicode_1BYTE_DATA(self),
9800                    PyUnicode_GET_LENGTH(self), maxcount
9801                    );
9802            else
9803                return ucs1lib_split_whitespace(
9804                    self,  PyUnicode_1BYTE_DATA(self),
9805                    PyUnicode_GET_LENGTH(self), maxcount
9806                    );
9807        case PyUnicode_2BYTE_KIND:
9808            return ucs2lib_split_whitespace(
9809                self,  PyUnicode_2BYTE_DATA(self),
9810                PyUnicode_GET_LENGTH(self), maxcount
9811                );
9812        case PyUnicode_4BYTE_KIND:
9813            return ucs4lib_split_whitespace(
9814                self,  PyUnicode_4BYTE_DATA(self),
9815                PyUnicode_GET_LENGTH(self), maxcount
9816                );
9817        default:
9818            assert(0);
9819            return NULL;
9820        }
9821
9822    if (PyUnicode_READY(substring) == -1)
9823        return NULL;
9824
9825    kind1 = PyUnicode_KIND(self);
9826    kind2 = PyUnicode_KIND(substring);
9827    kind = kind1 > kind2 ? kind1 : kind2;
9828    buf1 = PyUnicode_DATA(self);
9829    buf2 = PyUnicode_DATA(substring);
9830    if (kind1 != kind)
9831        buf1 = _PyUnicode_AsKind(self, kind);
9832    if (!buf1)
9833        return NULL;
9834    if (kind2 != kind)
9835        buf2 = _PyUnicode_AsKind(substring, kind);
9836    if (!buf2) {
9837        if (kind1 != kind) PyMem_Free(buf1);
9838        return NULL;
9839    }
9840    len1 = PyUnicode_GET_LENGTH(self);
9841    len2 = PyUnicode_GET_LENGTH(substring);
9842
9843    switch (kind) {
9844    case PyUnicode_1BYTE_KIND:
9845        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9846            out = asciilib_split(
9847                self,  buf1, len1, buf2, len2, maxcount);
9848        else
9849            out = ucs1lib_split(
9850                self,  buf1, len1, buf2, len2, maxcount);
9851        break;
9852    case PyUnicode_2BYTE_KIND:
9853        out = ucs2lib_split(
9854            self,  buf1, len1, buf2, len2, maxcount);
9855        break;
9856    case PyUnicode_4BYTE_KIND:
9857        out = ucs4lib_split(
9858            self,  buf1, len1, buf2, len2, maxcount);
9859        break;
9860    default:
9861        out = NULL;
9862    }
9863    if (kind1 != kind)
9864        PyMem_Free(buf1);
9865    if (kind2 != kind)
9866        PyMem_Free(buf2);
9867    return out;
9868}
9869
9870static PyObject *
9871rsplit(PyObject *self,
9872       PyObject *substring,
9873       Py_ssize_t maxcount)
9874{
9875    int kind1, kind2, kind;
9876    void *buf1, *buf2;
9877    Py_ssize_t len1, len2;
9878    PyObject* out;
9879
9880    if (maxcount < 0)
9881        maxcount = PY_SSIZE_T_MAX;
9882
9883    if (PyUnicode_READY(self) == -1)
9884        return NULL;
9885
9886    if (substring == NULL)
9887        switch (PyUnicode_KIND(self)) {
9888        case PyUnicode_1BYTE_KIND:
9889            if (PyUnicode_IS_ASCII(self))
9890                return asciilib_rsplit_whitespace(
9891                    self,  PyUnicode_1BYTE_DATA(self),
9892                    PyUnicode_GET_LENGTH(self), maxcount
9893                    );
9894            else
9895                return ucs1lib_rsplit_whitespace(
9896                    self,  PyUnicode_1BYTE_DATA(self),
9897                    PyUnicode_GET_LENGTH(self), maxcount
9898                    );
9899        case PyUnicode_2BYTE_KIND:
9900            return ucs2lib_rsplit_whitespace(
9901                self,  PyUnicode_2BYTE_DATA(self),
9902                PyUnicode_GET_LENGTH(self), maxcount
9903                );
9904        case PyUnicode_4BYTE_KIND:
9905            return ucs4lib_rsplit_whitespace(
9906                self,  PyUnicode_4BYTE_DATA(self),
9907                PyUnicode_GET_LENGTH(self), maxcount
9908                );
9909        default:
9910            assert(0);
9911            return NULL;
9912        }
9913
9914    if (PyUnicode_READY(substring) == -1)
9915        return NULL;
9916
9917    kind1 = PyUnicode_KIND(self);
9918    kind2 = PyUnicode_KIND(substring);
9919    kind = kind1 > kind2 ? kind1 : kind2;
9920    buf1 = PyUnicode_DATA(self);
9921    buf2 = PyUnicode_DATA(substring);
9922    if (kind1 != kind)
9923        buf1 = _PyUnicode_AsKind(self, kind);
9924    if (!buf1)
9925        return NULL;
9926    if (kind2 != kind)
9927        buf2 = _PyUnicode_AsKind(substring, kind);
9928    if (!buf2) {
9929        if (kind1 != kind) PyMem_Free(buf1);
9930        return NULL;
9931    }
9932    len1 = PyUnicode_GET_LENGTH(self);
9933    len2 = PyUnicode_GET_LENGTH(substring);
9934
9935    switch (kind) {
9936    case PyUnicode_1BYTE_KIND:
9937        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9938            out = asciilib_rsplit(
9939                self,  buf1, len1, buf2, len2, maxcount);
9940        else
9941            out = ucs1lib_rsplit(
9942                self,  buf1, len1, buf2, len2, maxcount);
9943        break;
9944    case PyUnicode_2BYTE_KIND:
9945        out = ucs2lib_rsplit(
9946            self,  buf1, len1, buf2, len2, maxcount);
9947        break;
9948    case PyUnicode_4BYTE_KIND:
9949        out = ucs4lib_rsplit(
9950            self,  buf1, len1, buf2, len2, maxcount);
9951        break;
9952    default:
9953        out = NULL;
9954    }
9955    if (kind1 != kind)
9956        PyMem_Free(buf1);
9957    if (kind2 != kind)
9958        PyMem_Free(buf2);
9959    return out;
9960}
9961
9962static Py_ssize_t
9963anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9964            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9965{
9966    switch (kind) {
9967    case PyUnicode_1BYTE_KIND:
9968        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9969            return asciilib_find(buf1, len1, buf2, len2, offset);
9970        else
9971            return ucs1lib_find(buf1, len1, buf2, len2, offset);
9972    case PyUnicode_2BYTE_KIND:
9973        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9974    case PyUnicode_4BYTE_KIND:
9975        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9976    }
9977    assert(0);
9978    return -1;
9979}
9980
9981static Py_ssize_t
9982anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9983             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9984{
9985    switch (kind) {
9986    case PyUnicode_1BYTE_KIND:
9987        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9988            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9989        else
9990            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9991    case PyUnicode_2BYTE_KIND:
9992        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9993    case PyUnicode_4BYTE_KIND:
9994        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9995    }
9996    assert(0);
9997    return 0;
9998}
9999
10000static PyObject *
10001replace(PyObject *self, PyObject *str1,
10002        PyObject *str2, Py_ssize_t maxcount)
10003{
10004    PyObject *u;
10005    char *sbuf = PyUnicode_DATA(self);
10006    char *buf1 = PyUnicode_DATA(str1);
10007    char *buf2 = PyUnicode_DATA(str2);
10008    int srelease = 0, release1 = 0, release2 = 0;
10009    int skind = PyUnicode_KIND(self);
10010    int kind1 = PyUnicode_KIND(str1);
10011    int kind2 = PyUnicode_KIND(str2);
10012    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10013    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10014    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10015    int mayshrink;
10016    Py_UCS4 maxchar, maxchar_str2;
10017
10018    if (maxcount < 0)
10019        maxcount = PY_SSIZE_T_MAX;
10020    else if (maxcount == 0 || slen == 0)
10021        goto nothing;
10022
10023    if (str1 == str2)
10024        goto nothing;
10025    if (skind < kind1)
10026        /* substring too wide to be present */
10027        goto nothing;
10028
10029    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10030    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10031    /* Replacing str1 with str2 may cause a maxchar reduction in the
10032       result string. */
10033    mayshrink = (maxchar_str2 < maxchar);
10034    maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
10035
10036    if (len1 == len2) {
10037        /* same length */
10038        if (len1 == 0)
10039            goto nothing;
10040        if (len1 == 1) {
10041            /* replace characters */
10042            Py_UCS4 u1, u2;
10043            int rkind;
10044            Py_ssize_t index, pos;
10045            char *src;
10046
10047            u1 = PyUnicode_READ_CHAR(str1, 0);
10048            pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10049            if (pos < 0)
10050                goto nothing;
10051            u2 = PyUnicode_READ_CHAR(str2, 0);
10052            u = PyUnicode_New(slen, maxchar);
10053            if (!u)
10054                goto error;
10055            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10056            rkind = PyUnicode_KIND(u);
10057
10058            PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10059            index = 0;
10060            src = sbuf;
10061            while (--maxcount)
10062            {
10063                pos++;
10064                src += pos * PyUnicode_KIND(self);
10065                slen -= pos;
10066                index += pos;
10067                pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10068                if (pos < 0)
10069                    break;
10070                PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10071            }
10072        }
10073        else {
10074            int rkind = skind;
10075            char *res;
10076            Py_ssize_t i;
10077
10078            if (kind1 < rkind) {
10079                /* widen substring */
10080                buf1 = _PyUnicode_AsKind(str1, rkind);
10081                if (!buf1) goto error;
10082                release1 = 1;
10083            }
10084            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10085            if (i < 0)
10086                goto nothing;
10087            if (rkind > kind2) {
10088                /* widen replacement */
10089                buf2 = _PyUnicode_AsKind(str2, rkind);
10090                if (!buf2) goto error;
10091                release2 = 1;
10092            }
10093            else if (rkind < kind2) {
10094                /* widen self and buf1 */
10095                rkind = kind2;
10096                if (release1) PyMem_Free(buf1);
10097                sbuf = _PyUnicode_AsKind(self, rkind);
10098                if (!sbuf) goto error;
10099                srelease = 1;
10100                buf1 = _PyUnicode_AsKind(str1, rkind);
10101                if (!buf1) goto error;
10102                release1 = 1;
10103            }
10104            u = PyUnicode_New(slen, maxchar);
10105            if (!u)
10106                goto error;
10107            assert(PyUnicode_KIND(u) == rkind);
10108            res = PyUnicode_DATA(u);
10109
10110            memcpy(res, sbuf, rkind * slen);
10111            /* change everything in-place, starting with this one */
10112            memcpy(res + rkind * i,
10113                   buf2,
10114                   rkind * len2);
10115            i += len1;
10116
10117            while ( --maxcount > 0) {
10118                i = anylib_find(rkind, self,
10119                                sbuf+rkind*i, slen-i,
10120                                str1, buf1, len1, i);
10121                if (i == -1)
10122                    break;
10123                memcpy(res + rkind * i,
10124                       buf2,
10125                       rkind * len2);
10126                i += len1;
10127            }
10128        }
10129    }
10130    else {
10131        Py_ssize_t n, i, j, ires;
10132        Py_ssize_t product, new_size;
10133        int rkind = skind;
10134        char *res;
10135
10136        if (kind1 < rkind) {
10137            /* widen substring */
10138            buf1 = _PyUnicode_AsKind(str1, rkind);
10139            if (!buf1) goto error;
10140            release1 = 1;
10141        }
10142        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10143        if (n == 0)
10144            goto nothing;
10145        if (kind2 < rkind) {
10146            /* widen replacement */
10147            buf2 = _PyUnicode_AsKind(str2, rkind);
10148            if (!buf2) goto error;
10149            release2 = 1;
10150        }
10151        else if (kind2 > rkind) {
10152            /* widen self and buf1 */
10153            rkind = kind2;
10154            sbuf = _PyUnicode_AsKind(self, rkind);
10155            if (!sbuf) goto error;
10156            srelease = 1;
10157            if (release1) PyMem_Free(buf1);
10158            buf1 = _PyUnicode_AsKind(str1, rkind);
10159            if (!buf1) goto error;
10160            release1 = 1;
10161        }
10162        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10163           PyUnicode_GET_LENGTH(str1))); */
10164        product = n * (len2-len1);
10165        if ((product / (len2-len1)) != n) {
10166                PyErr_SetString(PyExc_OverflowError,
10167                                "replace string is too long");
10168                goto error;
10169        }
10170        new_size = slen + product;
10171        if (new_size == 0) {
10172            Py_INCREF(unicode_empty);
10173            u = unicode_empty;
10174            goto done;
10175        }
10176        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10177            PyErr_SetString(PyExc_OverflowError,
10178                            "replace string is too long");
10179            goto error;
10180        }
10181        u = PyUnicode_New(new_size, maxchar);
10182        if (!u)
10183            goto error;
10184        assert(PyUnicode_KIND(u) == rkind);
10185        res = PyUnicode_DATA(u);
10186        ires = i = 0;
10187        if (len1 > 0) {
10188            while (n-- > 0) {
10189                /* look for next match */
10190                j = anylib_find(rkind, self,
10191                                sbuf + rkind * i, slen-i,
10192                                str1, buf1, len1, i);
10193                if (j == -1)
10194                    break;
10195                else if (j > i) {
10196                    /* copy unchanged part [i:j] */
10197                    memcpy(res + rkind * ires,
10198                           sbuf + rkind * i,
10199                           rkind * (j-i));
10200                    ires += j - i;
10201                }
10202                /* copy substitution string */
10203                if (len2 > 0) {
10204                    memcpy(res + rkind * ires,
10205                           buf2,
10206                           rkind * len2);
10207                    ires += len2;
10208                }
10209                i = j + len1;
10210            }
10211            if (i < slen)
10212                /* copy tail [i:] */
10213                memcpy(res + rkind * ires,
10214                       sbuf + rkind * i,
10215                       rkind * (slen-i));
10216        }
10217        else {
10218            /* interleave */
10219            while (n > 0) {
10220                memcpy(res + rkind * ires,
10221                       buf2,
10222                       rkind * len2);
10223                ires += len2;
10224                if (--n <= 0)
10225                    break;
10226                memcpy(res + rkind * ires,
10227                       sbuf + rkind * i,
10228                       rkind);
10229                ires++;
10230                i++;
10231            }
10232            memcpy(res + rkind * ires,
10233                   sbuf + rkind * i,
10234                   rkind * (slen-i));
10235        }
10236    }
10237
10238    if (mayshrink) {
10239        unicode_adjust_maxchar(&u);
10240        if (u == NULL)
10241            goto error;
10242    }
10243
10244  done:
10245    if (srelease)
10246        PyMem_FREE(sbuf);
10247    if (release1)
10248        PyMem_FREE(buf1);
10249    if (release2)
10250        PyMem_FREE(buf2);
10251    assert(_PyUnicode_CheckConsistency(u, 1));
10252    return u;
10253
10254  nothing:
10255    /* nothing to replace; return original string (when possible) */
10256    if (srelease)
10257        PyMem_FREE(sbuf);
10258    if (release1)
10259        PyMem_FREE(buf1);
10260    if (release2)
10261        PyMem_FREE(buf2);
10262    return unicode_result_unchanged(self);
10263
10264  error:
10265    if (srelease && sbuf)
10266        PyMem_FREE(sbuf);
10267    if (release1 && buf1)
10268        PyMem_FREE(buf1);
10269    if (release2 && buf2)
10270        PyMem_FREE(buf2);
10271    return NULL;
10272}
10273
10274/* --- Unicode Object Methods --------------------------------------------- */
10275
10276PyDoc_STRVAR(title__doc__,
10277             "S.title() -> str\n\
10278\n\
10279Return a titlecased version of S, i.e. words start with title case\n\
10280characters, all remaining cased characters have lower case.");
10281
10282static PyObject*
10283unicode_title(PyObject *self)
10284{
10285    if (PyUnicode_READY(self) == -1)
10286        return NULL;
10287    return case_operation(self, do_title);
10288}
10289
10290PyDoc_STRVAR(capitalize__doc__,
10291             "S.capitalize() -> str\n\
10292\n\
10293Return a capitalized version of S, i.e. make the first character\n\
10294have upper case and the rest lower case.");
10295
10296static PyObject*
10297unicode_capitalize(PyObject *self)
10298{
10299    if (PyUnicode_READY(self) == -1)
10300        return NULL;
10301    if (PyUnicode_GET_LENGTH(self) == 0)
10302        return unicode_result_unchanged(self);
10303    return case_operation(self, do_capitalize);
10304}
10305
10306PyDoc_STRVAR(casefold__doc__,
10307             "S.casefold() -> str\n\
10308\n\
10309Return a version of S suitable for caseless comparisons.");
10310
10311static PyObject *
10312unicode_casefold(PyObject *self)
10313{
10314    if (PyUnicode_READY(self) == -1)
10315        return NULL;
10316    if (PyUnicode_IS_ASCII(self))
10317        return ascii_upper_or_lower(self, 1);
10318    return case_operation(self, do_casefold);
10319}
10320
10321
10322/* Argument converter.  Coerces to a single unicode character */
10323
10324static int
10325convert_uc(PyObject *obj, void *addr)
10326{
10327    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10328    PyObject *uniobj;
10329
10330    uniobj = PyUnicode_FromObject(obj);
10331    if (uniobj == NULL) {
10332        PyErr_SetString(PyExc_TypeError,
10333                        "The fill character cannot be converted to Unicode");
10334        return 0;
10335    }
10336    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10337        PyErr_SetString(PyExc_TypeError,
10338                        "The fill character must be exactly one character long");
10339        Py_DECREF(uniobj);
10340        return 0;
10341    }
10342    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10343    Py_DECREF(uniobj);
10344    return 1;
10345}
10346
10347PyDoc_STRVAR(center__doc__,
10348             "S.center(width[, fillchar]) -> str\n\
10349\n\
10350Return S centered in a string of length width. Padding is\n\
10351done using the specified fill character (default is a space)");
10352
10353static PyObject *
10354unicode_center(PyObject *self, PyObject *args)
10355{
10356    Py_ssize_t marg, left;
10357    Py_ssize_t width;
10358    Py_UCS4 fillchar = ' ';
10359
10360    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10361        return NULL;
10362
10363    if (PyUnicode_READY(self) == -1)
10364        return NULL;
10365
10366    if (PyUnicode_GET_LENGTH(self) >= width)
10367        return unicode_result_unchanged(self);
10368
10369    marg = width - PyUnicode_GET_LENGTH(self);
10370    left = marg / 2 + (marg & width & 1);
10371
10372    return pad(self, left, marg - left, fillchar);
10373}
10374
10375/* This function assumes that str1 and str2 are readied by the caller. */
10376
10377static int
10378unicode_compare(PyObject *str1, PyObject *str2)
10379{
10380    int kind1, kind2;
10381    void *data1, *data2;
10382    Py_ssize_t len1, len2, i;
10383
10384    kind1 = PyUnicode_KIND(str1);
10385    kind2 = PyUnicode_KIND(str2);
10386    data1 = PyUnicode_DATA(str1);
10387    data2 = PyUnicode_DATA(str2);
10388    len1 = PyUnicode_GET_LENGTH(str1);
10389    len2 = PyUnicode_GET_LENGTH(str2);
10390
10391    for (i = 0; i < len1 && i < len2; ++i) {
10392        Py_UCS4 c1, c2;
10393        c1 = PyUnicode_READ(kind1, data1, i);
10394        c2 = PyUnicode_READ(kind2, data2, i);
10395
10396        if (c1 != c2)
10397            return (c1 < c2) ? -1 : 1;
10398    }
10399
10400    return (len1 < len2) ? -1 : (len1 != len2);
10401}
10402
10403int
10404PyUnicode_Compare(PyObject *left, PyObject *right)
10405{
10406    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10407        if (PyUnicode_READY(left) == -1 ||
10408            PyUnicode_READY(right) == -1)
10409            return -1;
10410        return unicode_compare(left, right);
10411    }
10412    PyErr_Format(PyExc_TypeError,
10413                 "Can't compare %.100s and %.100s",
10414                 left->ob_type->tp_name,
10415                 right->ob_type->tp_name);
10416    return -1;
10417}
10418
10419int
10420PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10421{
10422    Py_ssize_t i;
10423    int kind;
10424    void *data;
10425    Py_UCS4 chr;
10426
10427    assert(_PyUnicode_CHECK(uni));
10428    if (PyUnicode_READY(uni) == -1)
10429        return -1;
10430    kind = PyUnicode_KIND(uni);
10431    data = PyUnicode_DATA(uni);
10432    /* Compare Unicode string and source character set string */
10433    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10434        if (chr != str[i])
10435            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10436    /* This check keeps Python strings that end in '\0' from comparing equal
10437     to C strings identical up to that point. */
10438    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10439        return 1; /* uni is longer */
10440    if (str[i])
10441        return -1; /* str is longer */
10442    return 0;
10443}
10444
10445
10446#define TEST_COND(cond)                         \
10447    ((cond) ? Py_True : Py_False)
10448
10449PyObject *
10450PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10451{
10452    int result;
10453
10454    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10455        PyObject *v;
10456        if (PyUnicode_READY(left) == -1 ||
10457            PyUnicode_READY(right) == -1)
10458            return NULL;
10459        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10460            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
10461            if (op == Py_EQ) {
10462                Py_INCREF(Py_False);
10463                return Py_False;
10464            }
10465            if (op == Py_NE) {
10466                Py_INCREF(Py_True);
10467                return Py_True;
10468            }
10469        }
10470        if (left == right)
10471            result = 0;
10472        else
10473            result = unicode_compare(left, right);
10474
10475        /* Convert the return value to a Boolean */
10476        switch (op) {
10477        case Py_EQ:
10478            v = TEST_COND(result == 0);
10479            break;
10480        case Py_NE:
10481            v = TEST_COND(result != 0);
10482            break;
10483        case Py_LE:
10484            v = TEST_COND(result <= 0);
10485            break;
10486        case Py_GE:
10487            v = TEST_COND(result >= 0);
10488            break;
10489        case Py_LT:
10490            v = TEST_COND(result == -1);
10491            break;
10492        case Py_GT:
10493            v = TEST_COND(result == 1);
10494            break;
10495        default:
10496            PyErr_BadArgument();
10497            return NULL;
10498        }
10499        Py_INCREF(v);
10500        return v;
10501    }
10502
10503    Py_RETURN_NOTIMPLEMENTED;
10504}
10505
10506int
10507PyUnicode_Contains(PyObject *container, PyObject *element)
10508{
10509    PyObject *str, *sub;
10510    int kind1, kind2, kind;
10511    void *buf1, *buf2;
10512    Py_ssize_t len1, len2;
10513    int result;
10514
10515    /* Coerce the two arguments */
10516    sub = PyUnicode_FromObject(element);
10517    if (!sub) {
10518        PyErr_Format(PyExc_TypeError,
10519                     "'in <string>' requires string as left operand, not %s",
10520                     element->ob_type->tp_name);
10521        return -1;
10522    }
10523
10524    str = PyUnicode_FromObject(container);
10525    if (!str) {
10526        Py_DECREF(sub);
10527        return -1;
10528    }
10529    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10530        Py_DECREF(sub);
10531        Py_DECREF(str);
10532    }
10533
10534    kind1 = PyUnicode_KIND(str);
10535    kind2 = PyUnicode_KIND(sub);
10536    kind = kind1;
10537    buf1 = PyUnicode_DATA(str);
10538    buf2 = PyUnicode_DATA(sub);
10539    if (kind2 != kind) {
10540        if (kind2 > kind) {
10541            Py_DECREF(sub);
10542            Py_DECREF(str);
10543            return 0;
10544        }
10545        buf2 = _PyUnicode_AsKind(sub, kind);
10546    }
10547    if (!buf2) {
10548        Py_DECREF(sub);
10549        Py_DECREF(str);
10550        return -1;
10551    }
10552    len1 = PyUnicode_GET_LENGTH(str);
10553    len2 = PyUnicode_GET_LENGTH(sub);
10554
10555    switch (kind) {
10556    case PyUnicode_1BYTE_KIND:
10557        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10558        break;
10559    case PyUnicode_2BYTE_KIND:
10560        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10561        break;
10562    case PyUnicode_4BYTE_KIND:
10563        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10564        break;
10565    default:
10566        result = -1;
10567        assert(0);
10568    }
10569
10570    Py_DECREF(str);
10571    Py_DECREF(sub);
10572
10573    if (kind2 != kind)
10574        PyMem_Free(buf2);
10575
10576    return result;
10577}
10578
10579/* Concat to string or Unicode object giving a new Unicode object. */
10580
10581PyObject *
10582PyUnicode_Concat(PyObject *left, PyObject *right)
10583{
10584    PyObject *u = NULL, *v = NULL, *w;
10585    Py_UCS4 maxchar, maxchar2;
10586    Py_ssize_t u_len, v_len, new_len;
10587
10588    /* Coerce the two arguments */
10589    u = PyUnicode_FromObject(left);
10590    if (u == NULL)
10591        goto onError;
10592    v = PyUnicode_FromObject(right);
10593    if (v == NULL)
10594        goto onError;
10595
10596    /* Shortcuts */
10597    if (v == unicode_empty) {
10598        Py_DECREF(v);
10599        return u;
10600    }
10601    if (u == unicode_empty) {
10602        Py_DECREF(u);
10603        return v;
10604    }
10605
10606    u_len = PyUnicode_GET_LENGTH(u);
10607    v_len = PyUnicode_GET_LENGTH(v);
10608    if (u_len > PY_SSIZE_T_MAX - v_len) {
10609        PyErr_SetString(PyExc_OverflowError,
10610                        "strings are too large to concat");
10611        goto onError;
10612    }
10613    new_len = u_len + v_len;
10614
10615    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10616    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10617    maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10618
10619    /* Concat the two Unicode strings */
10620    w = PyUnicode_New(new_len, maxchar);
10621    if (w == NULL)
10622        goto onError;
10623    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10624    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
10625    Py_DECREF(u);
10626    Py_DECREF(v);
10627    assert(_PyUnicode_CheckConsistency(w, 1));
10628    return w;
10629
10630  onError:
10631    Py_XDECREF(u);
10632    Py_XDECREF(v);
10633    return NULL;
10634}
10635
10636void
10637PyUnicode_Append(PyObject **p_left, PyObject *right)
10638{
10639    PyObject *left, *res;
10640    Py_UCS4 maxchar, maxchar2;
10641    Py_ssize_t left_len, right_len, new_len;
10642
10643    if (p_left == NULL) {
10644        if (!PyErr_Occurred())
10645            PyErr_BadInternalCall();
10646        return;
10647    }
10648    left = *p_left;
10649    if (right == NULL || !PyUnicode_Check(left)) {
10650        if (!PyErr_Occurred())
10651            PyErr_BadInternalCall();
10652        goto error;
10653    }
10654
10655    if (PyUnicode_READY(left) == -1)
10656        goto error;
10657    if (PyUnicode_READY(right) == -1)
10658        goto error;
10659
10660    /* Shortcuts */
10661    if (left == unicode_empty) {
10662        Py_DECREF(left);
10663        Py_INCREF(right);
10664        *p_left = right;
10665        return;
10666    }
10667    if (right == unicode_empty)
10668        return;
10669
10670    left_len = PyUnicode_GET_LENGTH(left);
10671    right_len = PyUnicode_GET_LENGTH(right);
10672    if (left_len > PY_SSIZE_T_MAX - right_len) {
10673        PyErr_SetString(PyExc_OverflowError,
10674                        "strings are too large to concat");
10675        goto error;
10676    }
10677    new_len = left_len + right_len;
10678
10679    if (unicode_modifiable(left)
10680        && PyUnicode_CheckExact(right)
10681        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
10682        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10683           to change the structure size, but characters are stored just after
10684           the structure, and so it requires to move all characters which is
10685           not so different than duplicating the string. */
10686        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10687    {
10688        /* append inplace */
10689        if (unicode_resize(p_left, new_len) != 0) {
10690            /* XXX if _PyUnicode_Resize() fails, 'left' has been
10691             * deallocated so it cannot be put back into
10692             * 'variable'.  The MemoryError is raised when there
10693             * is no value in 'variable', which might (very
10694             * remotely) be a cause of incompatibilities.
10695             */
10696            goto error;
10697        }
10698        /* copy 'right' into the newly allocated area of 'left' */
10699        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
10700    }
10701    else {
10702        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10703        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10704        maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10705
10706        /* Concat the two Unicode strings */
10707        res = PyUnicode_New(new_len, maxchar);
10708        if (res == NULL)
10709            goto error;
10710        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10711        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
10712        Py_DECREF(left);
10713        *p_left = res;
10714    }
10715    assert(_PyUnicode_CheckConsistency(*p_left, 1));
10716    return;
10717
10718error:
10719    Py_CLEAR(*p_left);
10720}
10721
10722void
10723PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10724{
10725    PyUnicode_Append(pleft, right);
10726    Py_XDECREF(right);
10727}
10728
10729PyDoc_STRVAR(count__doc__,
10730             "S.count(sub[, start[, end]]) -> int\n\
10731\n\
10732Return the number of non-overlapping occurrences of substring sub in\n\
10733string S[start:end].  Optional arguments start and end are\n\
10734interpreted as in slice notation.");
10735
10736static PyObject *
10737unicode_count(PyObject *self, PyObject *args)
10738{
10739    PyObject *substring;
10740    Py_ssize_t start = 0;
10741    Py_ssize_t end = PY_SSIZE_T_MAX;
10742    PyObject *result;
10743    int kind1, kind2, kind;
10744    void *buf1, *buf2;
10745    Py_ssize_t len1, len2, iresult;
10746
10747    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10748                                            &start, &end))
10749        return NULL;
10750
10751    kind1 = PyUnicode_KIND(self);
10752    kind2 = PyUnicode_KIND(substring);
10753    if (kind2 > kind1)
10754        return PyLong_FromLong(0);
10755    kind = kind1;
10756    buf1 = PyUnicode_DATA(self);
10757    buf2 = PyUnicode_DATA(substring);
10758    if (kind2 != kind)
10759        buf2 = _PyUnicode_AsKind(substring, kind);
10760    if (!buf2) {
10761        Py_DECREF(substring);
10762        return NULL;
10763    }
10764    len1 = PyUnicode_GET_LENGTH(self);
10765    len2 = PyUnicode_GET_LENGTH(substring);
10766
10767    ADJUST_INDICES(start, end, len1);
10768    switch (kind) {
10769    case PyUnicode_1BYTE_KIND:
10770        iresult = ucs1lib_count(
10771            ((Py_UCS1*)buf1) + start, end - start,
10772            buf2, len2, PY_SSIZE_T_MAX
10773            );
10774        break;
10775    case PyUnicode_2BYTE_KIND:
10776        iresult = ucs2lib_count(
10777            ((Py_UCS2*)buf1) + start, end - start,
10778            buf2, len2, PY_SSIZE_T_MAX
10779            );
10780        break;
10781    case PyUnicode_4BYTE_KIND:
10782        iresult = ucs4lib_count(
10783            ((Py_UCS4*)buf1) + start, end - start,
10784            buf2, len2, PY_SSIZE_T_MAX
10785            );
10786        break;
10787    default:
10788        assert(0); iresult = 0;
10789    }
10790
10791    result = PyLong_FromSsize_t(iresult);
10792
10793    if (kind2 != kind)
10794        PyMem_Free(buf2);
10795
10796    Py_DECREF(substring);
10797
10798    return result;
10799}
10800
10801PyDoc_STRVAR(encode__doc__,
10802             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10803\n\
10804Encode S using the codec registered for encoding. Default encoding\n\
10805is 'utf-8'. errors may be given to set a different error\n\
10806handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10807a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10808'xmlcharrefreplace' as well as any other name registered with\n\
10809codecs.register_error that can handle UnicodeEncodeErrors.");
10810
10811static PyObject *
10812unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10813{
10814    static char *kwlist[] = {"encoding", "errors", 0};
10815    char *encoding = NULL;
10816    char *errors = NULL;
10817
10818    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10819                                     kwlist, &encoding, &errors))
10820        return NULL;
10821    return PyUnicode_AsEncodedString(self, encoding, errors);
10822}
10823
10824PyDoc_STRVAR(expandtabs__doc__,
10825             "S.expandtabs([tabsize]) -> str\n\
10826\n\
10827Return a copy of S where all tab characters are expanded using spaces.\n\
10828If tabsize is not given, a tab size of 8 characters is assumed.");
10829
10830static PyObject*
10831unicode_expandtabs(PyObject *self, PyObject *args)
10832{
10833    Py_ssize_t i, j, line_pos, src_len, incr;
10834    Py_UCS4 ch;
10835    PyObject *u;
10836    void *src_data, *dest_data;
10837    int tabsize = 8;
10838    int kind;
10839    int found;
10840
10841    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10842        return NULL;
10843
10844    if (PyUnicode_READY(self) == -1)
10845        return NULL;
10846
10847    /* First pass: determine size of output string */
10848    src_len = PyUnicode_GET_LENGTH(self);
10849    i = j = line_pos = 0;
10850    kind = PyUnicode_KIND(self);
10851    src_data = PyUnicode_DATA(self);
10852    found = 0;
10853    for (; i < src_len; i++) {
10854        ch = PyUnicode_READ(kind, src_data, i);
10855        if (ch == '\t') {
10856            found = 1;
10857            if (tabsize > 0) {
10858                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10859                if (j > PY_SSIZE_T_MAX - incr)
10860                    goto overflow;
10861                line_pos += incr;
10862                j += incr;
10863            }
10864        }
10865        else {
10866            if (j > PY_SSIZE_T_MAX - 1)
10867                goto overflow;
10868            line_pos++;
10869            j++;
10870            if (ch == '\n' || ch == '\r')
10871                line_pos = 0;
10872        }
10873    }
10874    if (!found)
10875        return unicode_result_unchanged(self);
10876
10877    /* Second pass: create output string and fill it */
10878    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10879    if (!u)
10880        return NULL;
10881    dest_data = PyUnicode_DATA(u);
10882
10883    i = j = line_pos = 0;
10884
10885    for (; i < src_len; i++) {
10886        ch = PyUnicode_READ(kind, src_data, i);
10887        if (ch == '\t') {
10888            if (tabsize > 0) {
10889                incr = tabsize - (line_pos % tabsize);
10890                line_pos += incr;
10891                FILL(kind, dest_data, ' ', j, incr);
10892                j += incr;
10893            }
10894        }
10895        else {
10896            line_pos++;
10897            PyUnicode_WRITE(kind, dest_data, j, ch);
10898            j++;
10899            if (ch == '\n' || ch == '\r')
10900                line_pos = 0;
10901        }
10902    }
10903    assert (j == PyUnicode_GET_LENGTH(u));
10904    return unicode_result(u);
10905
10906  overflow:
10907    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10908    return NULL;
10909}
10910
10911PyDoc_STRVAR(find__doc__,
10912             "S.find(sub[, start[, end]]) -> int\n\
10913\n\
10914Return the lowest index in S where substring sub is found,\n\
10915such that sub is contained within S[start:end].  Optional\n\
10916arguments start and end are interpreted as in slice notation.\n\
10917\n\
10918Return -1 on failure.");
10919
10920static PyObject *
10921unicode_find(PyObject *self, PyObject *args)
10922{
10923    PyObject *substring;
10924    Py_ssize_t start;
10925    Py_ssize_t end;
10926    Py_ssize_t result;
10927
10928    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10929                                            &start, &end))
10930        return NULL;
10931
10932    if (PyUnicode_READY(self) == -1)
10933        return NULL;
10934    if (PyUnicode_READY(substring) == -1)
10935        return NULL;
10936
10937    result = any_find_slice(1, self, substring, start, end);
10938
10939    Py_DECREF(substring);
10940
10941    if (result == -2)
10942        return NULL;
10943
10944    return PyLong_FromSsize_t(result);
10945}
10946
10947static PyObject *
10948unicode_getitem(PyObject *self, Py_ssize_t index)
10949{
10950    void *data;
10951    enum PyUnicode_Kind kind;
10952    Py_UCS4 ch;
10953    PyObject *res;
10954
10955    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10956        PyErr_BadArgument();
10957        return NULL;
10958    }
10959    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10960        PyErr_SetString(PyExc_IndexError, "string index out of range");
10961        return NULL;
10962    }
10963    kind = PyUnicode_KIND(self);
10964    data = PyUnicode_DATA(self);
10965    ch = PyUnicode_READ(kind, data, index);
10966    if (ch < 256)
10967        return get_latin1_char(ch);
10968
10969    res = PyUnicode_New(1, ch);
10970    if (res == NULL)
10971        return NULL;
10972    kind = PyUnicode_KIND(res);
10973    data = PyUnicode_DATA(res);
10974    PyUnicode_WRITE(kind, data, 0, ch);
10975    assert(_PyUnicode_CheckConsistency(res, 1));
10976    return res;
10977}
10978
10979/* Believe it or not, this produces the same value for ASCII strings
10980   as bytes_hash(). */
10981static Py_hash_t
10982unicode_hash(PyObject *self)
10983{
10984    Py_ssize_t len;
10985    Py_uhash_t x;
10986
10987#ifdef Py_DEBUG
10988    assert(_Py_HashSecret_Initialized);
10989#endif
10990    if (_PyUnicode_HASH(self) != -1)
10991        return _PyUnicode_HASH(self);
10992    if (PyUnicode_READY(self) == -1)
10993        return -1;
10994    len = PyUnicode_GET_LENGTH(self);
10995    /*
10996      We make the hash of the empty string be 0, rather than using
10997      (prefix ^ suffix), since this slightly obfuscates the hash secret
10998    */
10999    if (len == 0) {
11000        _PyUnicode_HASH(self) = 0;
11001        return 0;
11002    }
11003
11004    /* The hash function as a macro, gets expanded three times below. */
11005#define HASH(P)                                            \
11006    x ^= (Py_uhash_t) *P << 7;                             \
11007    while (--len >= 0)                                     \
11008        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;  \
11009
11010    x = (Py_uhash_t) _Py_HashSecret.prefix;
11011    switch (PyUnicode_KIND(self)) {
11012    case PyUnicode_1BYTE_KIND: {
11013        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11014        HASH(c);
11015        break;
11016    }
11017    case PyUnicode_2BYTE_KIND: {
11018        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11019        HASH(s);
11020        break;
11021    }
11022    default: {
11023        Py_UCS4 *l;
11024        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11025               "Impossible switch case in unicode_hash");
11026        l = PyUnicode_4BYTE_DATA(self);
11027        HASH(l);
11028        break;
11029    }
11030    }
11031    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11032    x ^= (Py_uhash_t) _Py_HashSecret.suffix;
11033
11034    if (x == -1)
11035        x = -2;
11036    _PyUnicode_HASH(self) = x;
11037    return x;
11038}
11039#undef HASH
11040
11041PyDoc_STRVAR(index__doc__,
11042             "S.index(sub[, start[, end]]) -> int\n\
11043\n\
11044Like S.find() but raise ValueError when the substring is not found.");
11045
11046static PyObject *
11047unicode_index(PyObject *self, PyObject *args)
11048{
11049    Py_ssize_t result;
11050    PyObject *substring;
11051    Py_ssize_t start;
11052    Py_ssize_t end;
11053
11054    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11055                                            &start, &end))
11056        return NULL;
11057
11058    if (PyUnicode_READY(self) == -1)
11059        return NULL;
11060    if (PyUnicode_READY(substring) == -1)
11061        return NULL;
11062
11063    result = any_find_slice(1, self, substring, start, end);
11064
11065    Py_DECREF(substring);
11066
11067    if (result == -2)
11068        return NULL;
11069
11070    if (result < 0) {
11071        PyErr_SetString(PyExc_ValueError, "substring not found");
11072        return NULL;
11073    }
11074
11075    return PyLong_FromSsize_t(result);
11076}
11077
11078PyDoc_STRVAR(islower__doc__,
11079             "S.islower() -> bool\n\
11080\n\
11081Return True if all cased characters in S are lowercase and there is\n\
11082at least one cased character in S, False otherwise.");
11083
11084static PyObject*
11085unicode_islower(PyObject *self)
11086{
11087    Py_ssize_t i, length;
11088    int kind;
11089    void *data;
11090    int cased;
11091
11092    if (PyUnicode_READY(self) == -1)
11093        return NULL;
11094    length = PyUnicode_GET_LENGTH(self);
11095    kind = PyUnicode_KIND(self);
11096    data = PyUnicode_DATA(self);
11097
11098    /* Shortcut for single character strings */
11099    if (length == 1)
11100        return PyBool_FromLong(
11101            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11102
11103    /* Special case for empty strings */
11104    if (length == 0)
11105        return PyBool_FromLong(0);
11106
11107    cased = 0;
11108    for (i = 0; i < length; i++) {
11109        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11110
11111        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11112            return PyBool_FromLong(0);
11113        else if (!cased && Py_UNICODE_ISLOWER(ch))
11114            cased = 1;
11115    }
11116    return PyBool_FromLong(cased);
11117}
11118
11119PyDoc_STRVAR(isupper__doc__,
11120             "S.isupper() -> bool\n\
11121\n\
11122Return True if all cased characters in S are uppercase and there is\n\
11123at least one cased character in S, False otherwise.");
11124
11125static PyObject*
11126unicode_isupper(PyObject *self)
11127{
11128    Py_ssize_t i, length;
11129    int kind;
11130    void *data;
11131    int cased;
11132
11133    if (PyUnicode_READY(self) == -1)
11134        return NULL;
11135    length = PyUnicode_GET_LENGTH(self);
11136    kind = PyUnicode_KIND(self);
11137    data = PyUnicode_DATA(self);
11138
11139    /* Shortcut for single character strings */
11140    if (length == 1)
11141        return PyBool_FromLong(
11142            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11143
11144    /* Special case for empty strings */
11145    if (length == 0)
11146        return PyBool_FromLong(0);
11147
11148    cased = 0;
11149    for (i = 0; i < length; i++) {
11150        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11151
11152        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11153            return PyBool_FromLong(0);
11154        else if (!cased && Py_UNICODE_ISUPPER(ch))
11155            cased = 1;
11156    }
11157    return PyBool_FromLong(cased);
11158}
11159
11160PyDoc_STRVAR(istitle__doc__,
11161             "S.istitle() -> bool\n\
11162\n\
11163Return True if S is a titlecased string and there is at least one\n\
11164character in S, i.e. upper- and titlecase characters may only\n\
11165follow uncased characters and lowercase characters only cased ones.\n\
11166Return False otherwise.");
11167
11168static PyObject*
11169unicode_istitle(PyObject *self)
11170{
11171    Py_ssize_t i, length;
11172    int kind;
11173    void *data;
11174    int cased, previous_is_cased;
11175
11176    if (PyUnicode_READY(self) == -1)
11177        return NULL;
11178    length = PyUnicode_GET_LENGTH(self);
11179    kind = PyUnicode_KIND(self);
11180    data = PyUnicode_DATA(self);
11181
11182    /* Shortcut for single character strings */
11183    if (length == 1) {
11184        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11185        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11186                               (Py_UNICODE_ISUPPER(ch) != 0));
11187    }
11188
11189    /* Special case for empty strings */
11190    if (length == 0)
11191        return PyBool_FromLong(0);
11192
11193    cased = 0;
11194    previous_is_cased = 0;
11195    for (i = 0; i < length; i++) {
11196        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11197
11198        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11199            if (previous_is_cased)
11200                return PyBool_FromLong(0);
11201            previous_is_cased = 1;
11202            cased = 1;
11203        }
11204        else if (Py_UNICODE_ISLOWER(ch)) {
11205            if (!previous_is_cased)
11206                return PyBool_FromLong(0);
11207            previous_is_cased = 1;
11208            cased = 1;
11209        }
11210        else
11211            previous_is_cased = 0;
11212    }
11213    return PyBool_FromLong(cased);
11214}
11215
11216PyDoc_STRVAR(isspace__doc__,
11217             "S.isspace() -> bool\n\
11218\n\
11219Return True if all characters in S are whitespace\n\
11220and there is at least one character in S, False otherwise.");
11221
11222static PyObject*
11223unicode_isspace(PyObject *self)
11224{
11225    Py_ssize_t i, length;
11226    int kind;
11227    void *data;
11228
11229    if (PyUnicode_READY(self) == -1)
11230        return NULL;
11231    length = PyUnicode_GET_LENGTH(self);
11232    kind = PyUnicode_KIND(self);
11233    data = PyUnicode_DATA(self);
11234
11235    /* Shortcut for single character strings */
11236    if (length == 1)
11237        return PyBool_FromLong(
11238            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11239
11240    /* Special case for empty strings */
11241    if (length == 0)
11242        return PyBool_FromLong(0);
11243
11244    for (i = 0; i < length; i++) {
11245        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11246        if (!Py_UNICODE_ISSPACE(ch))
11247            return PyBool_FromLong(0);
11248    }
11249    return PyBool_FromLong(1);
11250}
11251
11252PyDoc_STRVAR(isalpha__doc__,
11253             "S.isalpha() -> bool\n\
11254\n\
11255Return True if all characters in S are alphabetic\n\
11256and there is at least one character in S, False otherwise.");
11257
11258static PyObject*
11259unicode_isalpha(PyObject *self)
11260{
11261    Py_ssize_t i, length;
11262    int kind;
11263    void *data;
11264
11265    if (PyUnicode_READY(self) == -1)
11266        return NULL;
11267    length = PyUnicode_GET_LENGTH(self);
11268    kind = PyUnicode_KIND(self);
11269    data = PyUnicode_DATA(self);
11270
11271    /* Shortcut for single character strings */
11272    if (length == 1)
11273        return PyBool_FromLong(
11274            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11275
11276    /* Special case for empty strings */
11277    if (length == 0)
11278        return PyBool_FromLong(0);
11279
11280    for (i = 0; i < length; i++) {
11281        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11282            return PyBool_FromLong(0);
11283    }
11284    return PyBool_FromLong(1);
11285}
11286
11287PyDoc_STRVAR(isalnum__doc__,
11288             "S.isalnum() -> bool\n\
11289\n\
11290Return True if all characters in S are alphanumeric\n\
11291and there is at least one character in S, False otherwise.");
11292
11293static PyObject*
11294unicode_isalnum(PyObject *self)
11295{
11296    int kind;
11297    void *data;
11298    Py_ssize_t len, i;
11299
11300    if (PyUnicode_READY(self) == -1)
11301        return NULL;
11302
11303    kind = PyUnicode_KIND(self);
11304    data = PyUnicode_DATA(self);
11305    len = PyUnicode_GET_LENGTH(self);
11306
11307    /* Shortcut for single character strings */
11308    if (len == 1) {
11309        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11310        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11311    }
11312
11313    /* Special case for empty strings */
11314    if (len == 0)
11315        return PyBool_FromLong(0);
11316
11317    for (i = 0; i < len; i++) {
11318        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11319        if (!Py_UNICODE_ISALNUM(ch))
11320            return PyBool_FromLong(0);
11321    }
11322    return PyBool_FromLong(1);
11323}
11324
11325PyDoc_STRVAR(isdecimal__doc__,
11326             "S.isdecimal() -> bool\n\
11327\n\
11328Return True if there are only decimal characters in S,\n\
11329False otherwise.");
11330
11331static PyObject*
11332unicode_isdecimal(PyObject *self)
11333{
11334    Py_ssize_t i, length;
11335    int kind;
11336    void *data;
11337
11338    if (PyUnicode_READY(self) == -1)
11339        return NULL;
11340    length = PyUnicode_GET_LENGTH(self);
11341    kind = PyUnicode_KIND(self);
11342    data = PyUnicode_DATA(self);
11343
11344    /* Shortcut for single character strings */
11345    if (length == 1)
11346        return PyBool_FromLong(
11347            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11348
11349    /* Special case for empty strings */
11350    if (length == 0)
11351        return PyBool_FromLong(0);
11352
11353    for (i = 0; i < length; i++) {
11354        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11355            return PyBool_FromLong(0);
11356    }
11357    return PyBool_FromLong(1);
11358}
11359
11360PyDoc_STRVAR(isdigit__doc__,
11361             "S.isdigit() -> bool\n\
11362\n\
11363Return True if all characters in S are digits\n\
11364and there is at least one character in S, False otherwise.");
11365
11366static PyObject*
11367unicode_isdigit(PyObject *self)
11368{
11369    Py_ssize_t i, length;
11370    int kind;
11371    void *data;
11372
11373    if (PyUnicode_READY(self) == -1)
11374        return NULL;
11375    length = PyUnicode_GET_LENGTH(self);
11376    kind = PyUnicode_KIND(self);
11377    data = PyUnicode_DATA(self);
11378
11379    /* Shortcut for single character strings */
11380    if (length == 1) {
11381        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11382        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11383    }
11384
11385    /* Special case for empty strings */
11386    if (length == 0)
11387        return PyBool_FromLong(0);
11388
11389    for (i = 0; i < length; i++) {
11390        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11391            return PyBool_FromLong(0);
11392    }
11393    return PyBool_FromLong(1);
11394}
11395
11396PyDoc_STRVAR(isnumeric__doc__,
11397             "S.isnumeric() -> bool\n\
11398\n\
11399Return True if there are only numeric characters in S,\n\
11400False otherwise.");
11401
11402static PyObject*
11403unicode_isnumeric(PyObject *self)
11404{
11405    Py_ssize_t i, length;
11406    int kind;
11407    void *data;
11408
11409    if (PyUnicode_READY(self) == -1)
11410        return NULL;
11411    length = PyUnicode_GET_LENGTH(self);
11412    kind = PyUnicode_KIND(self);
11413    data = PyUnicode_DATA(self);
11414
11415    /* Shortcut for single character strings */
11416    if (length == 1)
11417        return PyBool_FromLong(
11418            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11419
11420    /* Special case for empty strings */
11421    if (length == 0)
11422        return PyBool_FromLong(0);
11423
11424    for (i = 0; i < length; i++) {
11425        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11426            return PyBool_FromLong(0);
11427    }
11428    return PyBool_FromLong(1);
11429}
11430
11431int
11432PyUnicode_IsIdentifier(PyObject *self)
11433{
11434    int kind;
11435    void *data;
11436    Py_ssize_t i;
11437    Py_UCS4 first;
11438
11439    if (PyUnicode_READY(self) == -1) {
11440        Py_FatalError("identifier not ready");
11441        return 0;
11442    }
11443
11444    /* Special case for empty strings */
11445    if (PyUnicode_GET_LENGTH(self) == 0)
11446        return 0;
11447    kind = PyUnicode_KIND(self);
11448    data = PyUnicode_DATA(self);
11449
11450    /* PEP 3131 says that the first character must be in
11451       XID_Start and subsequent characters in XID_Continue,
11452       and for the ASCII range, the 2.x rules apply (i.e
11453       start with letters and underscore, continue with
11454       letters, digits, underscore). However, given the current
11455       definition of XID_Start and XID_Continue, it is sufficient
11456       to check just for these, except that _ must be allowed
11457       as starting an identifier.  */
11458    first = PyUnicode_READ(kind, data, 0);
11459    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11460        return 0;
11461
11462    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11463        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11464            return 0;
11465    return 1;
11466}
11467
11468PyDoc_STRVAR(isidentifier__doc__,
11469             "S.isidentifier() -> bool\n\
11470\n\
11471Return True if S is a valid identifier according\n\
11472to the language definition.");
11473
11474static PyObject*
11475unicode_isidentifier(PyObject *self)
11476{
11477    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11478}
11479
11480PyDoc_STRVAR(isprintable__doc__,
11481             "S.isprintable() -> bool\n\
11482\n\
11483Return True if all characters in S are considered\n\
11484printable in repr() or S is empty, False otherwise.");
11485
11486static PyObject*
11487unicode_isprintable(PyObject *self)
11488{
11489    Py_ssize_t i, length;
11490    int kind;
11491    void *data;
11492
11493    if (PyUnicode_READY(self) == -1)
11494        return NULL;
11495    length = PyUnicode_GET_LENGTH(self);
11496    kind = PyUnicode_KIND(self);
11497    data = PyUnicode_DATA(self);
11498
11499    /* Shortcut for single character strings */
11500    if (length == 1)
11501        return PyBool_FromLong(
11502            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11503
11504    for (i = 0; i < length; i++) {
11505        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11506            Py_RETURN_FALSE;
11507        }
11508    }
11509    Py_RETURN_TRUE;
11510}
11511
11512PyDoc_STRVAR(join__doc__,
11513             "S.join(iterable) -> str\n\
11514\n\
11515Return a string which is the concatenation of the strings in the\n\
11516iterable.  The separator between elements is S.");
11517
11518static PyObject*
11519unicode_join(PyObject *self, PyObject *data)
11520{
11521    return PyUnicode_Join(self, data);
11522}
11523
11524static Py_ssize_t
11525unicode_length(PyObject *self)
11526{
11527    if (PyUnicode_READY(self) == -1)
11528        return -1;
11529    return PyUnicode_GET_LENGTH(self);
11530}
11531
11532PyDoc_STRVAR(ljust__doc__,
11533             "S.ljust(width[, fillchar]) -> str\n\
11534\n\
11535Return S left-justified in a Unicode string of length width. Padding is\n\
11536done using the specified fill character (default is a space).");
11537
11538static PyObject *
11539unicode_ljust(PyObject *self, PyObject *args)
11540{
11541    Py_ssize_t width;
11542    Py_UCS4 fillchar = ' ';
11543
11544    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11545        return NULL;
11546
11547    if (PyUnicode_READY(self) == -1)
11548        return NULL;
11549
11550    if (PyUnicode_GET_LENGTH(self) >= width)
11551        return unicode_result_unchanged(self);
11552
11553    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11554}
11555
11556PyDoc_STRVAR(lower__doc__,
11557             "S.lower() -> str\n\
11558\n\
11559Return a copy of the string S converted to lowercase.");
11560
11561static PyObject*
11562unicode_lower(PyObject *self)
11563{
11564    if (PyUnicode_READY(self) == -1)
11565        return NULL;
11566    if (PyUnicode_IS_ASCII(self))
11567        return ascii_upper_or_lower(self, 1);
11568    return case_operation(self, do_lower);
11569}
11570
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
11579
11580/* externally visible for str.strip(unicode) */
11581PyObject *
11582_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11583{
11584    void *data;
11585    int kind;
11586    Py_ssize_t i, j, len;
11587    BLOOM_MASK sepmask;
11588
11589    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11590        return NULL;
11591
11592    kind = PyUnicode_KIND(self);
11593    data = PyUnicode_DATA(self);
11594    len = PyUnicode_GET_LENGTH(self);
11595    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11596                              PyUnicode_DATA(sepobj),
11597                              PyUnicode_GET_LENGTH(sepobj));
11598
11599    i = 0;
11600    if (striptype != RIGHTSTRIP) {
11601        while (i < len &&
11602               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11603            i++;
11604        }
11605    }
11606
11607    j = len;
11608    if (striptype != LEFTSTRIP) {
11609        do {
11610            j--;
11611        } while (j >= i &&
11612                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11613        j++;
11614    }
11615
11616    return PyUnicode_Substring(self, i, j);
11617}
11618
11619PyObject*
11620PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11621{
11622    unsigned char *data;
11623    int kind;
11624    Py_ssize_t length;
11625
11626    if (PyUnicode_READY(self) == -1)
11627        return NULL;
11628
11629    length = PyUnicode_GET_LENGTH(self);
11630    end = Py_MIN(end, length);
11631
11632    if (start == 0 && end == length)
11633        return unicode_result_unchanged(self);
11634
11635    if (start < 0 || end < 0) {
11636        PyErr_SetString(PyExc_IndexError, "string index out of range");
11637        return NULL;
11638    }
11639    if (start >= length || end < start) {
11640        Py_INCREF(unicode_empty);
11641        return unicode_empty;
11642    }
11643
11644    length = end - start;
11645    if (PyUnicode_IS_ASCII(self)) {
11646        data = PyUnicode_1BYTE_DATA(self);
11647        return _PyUnicode_FromASCII((char*)(data + start), length);
11648    }
11649    else {
11650        kind = PyUnicode_KIND(self);
11651        data = PyUnicode_1BYTE_DATA(self);
11652        return PyUnicode_FromKindAndData(kind,
11653                                         data + kind * start,
11654                                         length);
11655    }
11656}
11657
11658static PyObject *
11659do_strip(PyObject *self, int striptype)
11660{
11661    int kind;
11662    void *data;
11663    Py_ssize_t len, i, j;
11664
11665    if (PyUnicode_READY(self) == -1)
11666        return NULL;
11667
11668    kind = PyUnicode_KIND(self);
11669    data = PyUnicode_DATA(self);
11670    len = PyUnicode_GET_LENGTH(self);
11671
11672    i = 0;
11673    if (striptype != RIGHTSTRIP) {
11674        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11675            i++;
11676        }
11677    }
11678
11679    j = len;
11680    if (striptype != LEFTSTRIP) {
11681        do {
11682            j--;
11683        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11684        j++;
11685    }
11686
11687    return PyUnicode_Substring(self, i, j);
11688}
11689
11690
11691static PyObject *
11692do_argstrip(PyObject *self, int striptype, PyObject *args)
11693{
11694    PyObject *sep = NULL;
11695
11696    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11697        return NULL;
11698
11699    if (sep != NULL && sep != Py_None) {
11700        if (PyUnicode_Check(sep))
11701            return _PyUnicode_XStrip(self, striptype, sep);
11702        else {
11703            PyErr_Format(PyExc_TypeError,
11704                         "%s arg must be None or str",
11705                         STRIPNAME(striptype));
11706            return NULL;
11707        }
11708    }
11709
11710    return do_strip(self, striptype);
11711}
11712
11713
11714PyDoc_STRVAR(strip__doc__,
11715             "S.strip([chars]) -> str\n\
11716\n\
11717Return a copy of the string S with leading and trailing\n\
11718whitespace removed.\n\
11719If chars is given and not None, remove characters in chars instead.");
11720
11721static PyObject *
11722unicode_strip(PyObject *self, PyObject *args)
11723{
11724    if (PyTuple_GET_SIZE(args) == 0)
11725        return do_strip(self, BOTHSTRIP); /* Common case */
11726    else
11727        return do_argstrip(self, BOTHSTRIP, args);
11728}
11729
11730
11731PyDoc_STRVAR(lstrip__doc__,
11732             "S.lstrip([chars]) -> str\n\
11733\n\
11734Return a copy of the string S with leading whitespace removed.\n\
11735If chars is given and not None, remove characters in chars instead.");
11736
11737static PyObject *
11738unicode_lstrip(PyObject *self, PyObject *args)
11739{
11740    if (PyTuple_GET_SIZE(args) == 0)
11741        return do_strip(self, LEFTSTRIP); /* Common case */
11742    else
11743        return do_argstrip(self, LEFTSTRIP, args);
11744}
11745
11746
11747PyDoc_STRVAR(rstrip__doc__,
11748             "S.rstrip([chars]) -> str\n\
11749\n\
11750Return a copy of the string S with trailing whitespace removed.\n\
11751If chars is given and not None, remove characters in chars instead.");
11752
11753static PyObject *
11754unicode_rstrip(PyObject *self, PyObject *args)
11755{
11756    if (PyTuple_GET_SIZE(args) == 0)
11757        return do_strip(self, RIGHTSTRIP); /* Common case */
11758    else
11759        return do_argstrip(self, RIGHTSTRIP, args);
11760}
11761
11762
11763static PyObject*
11764unicode_repeat(PyObject *str, Py_ssize_t len)
11765{
11766    PyObject *u;
11767    Py_ssize_t nchars, n;
11768
11769    if (len < 1) {
11770        Py_INCREF(unicode_empty);
11771        return unicode_empty;
11772    }
11773
11774    /* no repeat, return original string */
11775    if (len == 1)
11776        return unicode_result_unchanged(str);
11777
11778    if (PyUnicode_READY(str) == -1)
11779        return NULL;
11780
11781    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11782        PyErr_SetString(PyExc_OverflowError,
11783                        "repeated string is too long");
11784        return NULL;
11785    }
11786    nchars = len * PyUnicode_GET_LENGTH(str);
11787
11788    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11789    if (!u)
11790        return NULL;
11791    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11792
11793    if (PyUnicode_GET_LENGTH(str) == 1) {
11794        const int kind = PyUnicode_KIND(str);
11795        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11796        if (kind == PyUnicode_1BYTE_KIND) {
11797            void *to = PyUnicode_DATA(u);
11798            memset(to, (unsigned char)fill_char, len);
11799        }
11800        else if (kind == PyUnicode_2BYTE_KIND) {
11801            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
11802            for (n = 0; n < len; ++n)
11803                ucs2[n] = fill_char;
11804        } else {
11805            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11806            assert(kind == PyUnicode_4BYTE_KIND);
11807            for (n = 0; n < len; ++n)
11808                ucs4[n] = fill_char;
11809        }
11810    }
11811    else {
11812        /* number of characters copied this far */
11813        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11814        const Py_ssize_t char_size = PyUnicode_KIND(str);
11815        char *to = (char *) PyUnicode_DATA(u);
11816        Py_MEMCPY(to, PyUnicode_DATA(str),
11817                  PyUnicode_GET_LENGTH(str) * char_size);
11818        while (done < nchars) {
11819            n = (done <= nchars-done) ? done : nchars-done;
11820            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11821            done += n;
11822        }
11823    }
11824
11825    assert(_PyUnicode_CheckConsistency(u, 1));
11826    return u;
11827}
11828
11829PyObject *
11830PyUnicode_Replace(PyObject *obj,
11831                  PyObject *subobj,
11832                  PyObject *replobj,
11833                  Py_ssize_t maxcount)
11834{
11835    PyObject *self;
11836    PyObject *str1;
11837    PyObject *str2;
11838    PyObject *result;
11839
11840    self = PyUnicode_FromObject(obj);
11841    if (self == NULL)
11842        return NULL;
11843    str1 = PyUnicode_FromObject(subobj);
11844    if (str1 == NULL) {
11845        Py_DECREF(self);
11846        return NULL;
11847    }
11848    str2 = PyUnicode_FromObject(replobj);
11849    if (str2 == NULL) {
11850        Py_DECREF(self);
11851        Py_DECREF(str1);
11852        return NULL;
11853    }
11854    if (PyUnicode_READY(self) == -1 ||
11855        PyUnicode_READY(str1) == -1 ||
11856        PyUnicode_READY(str2) == -1)
11857        result = NULL;
11858    else
11859        result = replace(self, str1, str2, maxcount);
11860    Py_DECREF(self);
11861    Py_DECREF(str1);
11862    Py_DECREF(str2);
11863    return result;
11864}
11865
11866PyDoc_STRVAR(replace__doc__,
11867             "S.replace(old, new[, count]) -> str\n\
11868\n\
11869Return a copy of S with all occurrences of substring\n\
11870old replaced by new.  If the optional argument count is\n\
11871given, only the first count occurrences are replaced.");
11872
11873static PyObject*
11874unicode_replace(PyObject *self, PyObject *args)
11875{
11876    PyObject *str1;
11877    PyObject *str2;
11878    Py_ssize_t maxcount = -1;
11879    PyObject *result;
11880
11881    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11882        return NULL;
11883    if (PyUnicode_READY(self) == -1)
11884        return NULL;
11885    str1 = PyUnicode_FromObject(str1);
11886    if (str1 == NULL)
11887        return NULL;
11888    str2 = PyUnicode_FromObject(str2);
11889    if (str2 == NULL) {
11890        Py_DECREF(str1);
11891        return NULL;
11892    }
11893    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11894        result = NULL;
11895    else
11896        result = replace(self, str1, str2, maxcount);
11897
11898    Py_DECREF(str1);
11899    Py_DECREF(str2);
11900    return result;
11901}
11902
11903static PyObject *
11904unicode_repr(PyObject *unicode)
11905{
11906    PyObject *repr;
11907    Py_ssize_t isize;
11908    Py_ssize_t osize, squote, dquote, i, o;
11909    Py_UCS4 max, quote;
11910    int ikind, okind;
11911    void *idata, *odata;
11912
11913    if (PyUnicode_READY(unicode) == -1)
11914        return NULL;
11915
11916    isize = PyUnicode_GET_LENGTH(unicode);
11917    idata = PyUnicode_DATA(unicode);
11918
11919    /* Compute length of output, quote characters, and
11920       maximum character */
11921    osize = 2; /* quotes */
11922    max = 127;
11923    squote = dquote = 0;
11924    ikind = PyUnicode_KIND(unicode);
11925    for (i = 0; i < isize; i++) {
11926        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11927        switch (ch) {
11928        case '\'': squote++; osize++; break;
11929        case '"':  dquote++; osize++; break;
11930        case '\\': case '\t': case '\r': case '\n':
11931            osize += 2; break;
11932        default:
11933            /* Fast-path ASCII */
11934            if (ch < ' ' || ch == 0x7f)
11935                osize += 4; /* \xHH */
11936            else if (ch < 0x7f)
11937                osize++;
11938            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11939                osize++;
11940                max = ch > max ? ch : max;
11941            }
11942            else if (ch < 0x100)
11943                osize += 4; /* \xHH */
11944            else if (ch < 0x10000)
11945                osize += 6; /* \uHHHH */
11946            else
11947                osize += 10; /* \uHHHHHHHH */
11948        }
11949    }
11950
11951    quote = '\'';
11952    if (squote) {
11953        if (dquote)
11954            /* Both squote and dquote present. Use squote,
11955               and escape them */
11956            osize += squote;
11957        else
11958            quote = '"';
11959    }
11960
11961    repr = PyUnicode_New(osize, max);
11962    if (repr == NULL)
11963        return NULL;
11964    okind = PyUnicode_KIND(repr);
11965    odata = PyUnicode_DATA(repr);
11966
11967    PyUnicode_WRITE(okind, odata, 0, quote);
11968    PyUnicode_WRITE(okind, odata, osize-1, quote);
11969
11970    for (i = 0, o = 1; i < isize; i++) {
11971        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11972
11973        /* Escape quotes and backslashes */
11974        if ((ch == quote) || (ch == '\\')) {
11975            PyUnicode_WRITE(okind, odata, o++, '\\');
11976            PyUnicode_WRITE(okind, odata, o++, ch);
11977            continue;
11978        }
11979
11980        /* Map special whitespace to '\t', \n', '\r' */
11981        if (ch == '\t') {
11982            PyUnicode_WRITE(okind, odata, o++, '\\');
11983            PyUnicode_WRITE(okind, odata, o++, 't');
11984        }
11985        else if (ch == '\n') {
11986            PyUnicode_WRITE(okind, odata, o++, '\\');
11987            PyUnicode_WRITE(okind, odata, o++, 'n');
11988        }
11989        else if (ch == '\r') {
11990            PyUnicode_WRITE(okind, odata, o++, '\\');
11991            PyUnicode_WRITE(okind, odata, o++, 'r');
11992        }
11993
11994        /* Map non-printable US ASCII to '\xhh' */
11995        else if (ch < ' ' || ch == 0x7F) {
11996            PyUnicode_WRITE(okind, odata, o++, '\\');
11997            PyUnicode_WRITE(okind, odata, o++, 'x');
11998            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11999            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12000        }
12001
12002        /* Copy ASCII characters as-is */
12003        else if (ch < 0x7F) {
12004            PyUnicode_WRITE(okind, odata, o++, ch);
12005        }
12006
12007        /* Non-ASCII characters */
12008        else {
12009            /* Map Unicode whitespace and control characters
12010               (categories Z* and C* except ASCII space)
12011            */
12012            if (!Py_UNICODE_ISPRINTABLE(ch)) {
12013                PyUnicode_WRITE(okind, odata, o++, '\\');
12014                /* Map 8-bit characters to '\xhh' */
12015                if (ch <= 0xff) {
12016                    PyUnicode_WRITE(okind, odata, o++, 'x');
12017                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12018                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12019                }
12020                /* Map 16-bit characters to '\uxxxx' */
12021                else if (ch <= 0xffff) {
12022                    PyUnicode_WRITE(okind, odata, o++, 'u');
12023                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12024                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12025                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12026                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12027                }
12028                /* Map 21-bit characters to '\U00xxxxxx' */
12029                else {
12030                    PyUnicode_WRITE(okind, odata, o++, 'U');
12031                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12032                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12033                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12034                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12035                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12036                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12037                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12038                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12039                }
12040            }
12041            /* Copy characters as-is */
12042            else {
12043                PyUnicode_WRITE(okind, odata, o++, ch);
12044            }
12045        }
12046    }
12047    /* Closing quote already added at the beginning */
12048    assert(_PyUnicode_CheckConsistency(repr, 1));
12049    return repr;
12050}
12051
12052PyDoc_STRVAR(rfind__doc__,
12053             "S.rfind(sub[, start[, end]]) -> int\n\
12054\n\
12055Return the highest index in S where substring sub is found,\n\
12056such that sub is contained within S[start:end].  Optional\n\
12057arguments start and end are interpreted as in slice notation.\n\
12058\n\
12059Return -1 on failure.");
12060
12061static PyObject *
12062unicode_rfind(PyObject *self, PyObject *args)
12063{
12064    PyObject *substring;
12065    Py_ssize_t start;
12066    Py_ssize_t end;
12067    Py_ssize_t result;
12068
12069    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12070                                            &start, &end))
12071        return NULL;
12072
12073    if (PyUnicode_READY(self) == -1)
12074        return NULL;
12075    if (PyUnicode_READY(substring) == -1)
12076        return NULL;
12077
12078    result = any_find_slice(-1, self, substring, start, end);
12079
12080    Py_DECREF(substring);
12081
12082    if (result == -2)
12083        return NULL;
12084
12085    return PyLong_FromSsize_t(result);
12086}
12087
12088PyDoc_STRVAR(rindex__doc__,
12089             "S.rindex(sub[, start[, end]]) -> int\n\
12090\n\
12091Like S.rfind() but raise ValueError when the substring is not found.");
12092
12093static PyObject *
12094unicode_rindex(PyObject *self, PyObject *args)
12095{
12096    PyObject *substring;
12097    Py_ssize_t start;
12098    Py_ssize_t end;
12099    Py_ssize_t result;
12100
12101    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12102                                            &start, &end))
12103        return NULL;
12104
12105    if (PyUnicode_READY(self) == -1)
12106        return NULL;
12107    if (PyUnicode_READY(substring) == -1)
12108        return NULL;
12109
12110    result = any_find_slice(-1, self, substring, start, end);
12111
12112    Py_DECREF(substring);
12113
12114    if (result == -2)
12115        return NULL;
12116
12117    if (result < 0) {
12118        PyErr_SetString(PyExc_ValueError, "substring not found");
12119        return NULL;
12120    }
12121
12122    return PyLong_FromSsize_t(result);
12123}
12124
12125PyDoc_STRVAR(rjust__doc__,
12126             "S.rjust(width[, fillchar]) -> str\n\
12127\n\
12128Return S right-justified in a string of length width. Padding is\n\
12129done using the specified fill character (default is a space).");
12130
12131static PyObject *
12132unicode_rjust(PyObject *self, PyObject *args)
12133{
12134    Py_ssize_t width;
12135    Py_UCS4 fillchar = ' ';
12136
12137    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12138        return NULL;
12139
12140    if (PyUnicode_READY(self) == -1)
12141        return NULL;
12142
12143    if (PyUnicode_GET_LENGTH(self) >= width)
12144        return unicode_result_unchanged(self);
12145
12146    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12147}
12148
12149PyObject *
12150PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12151{
12152    PyObject *result;
12153
12154    s = PyUnicode_FromObject(s);
12155    if (s == NULL)
12156        return NULL;
12157    if (sep != NULL) {
12158        sep = PyUnicode_FromObject(sep);
12159        if (sep == NULL) {
12160            Py_DECREF(s);
12161            return NULL;
12162        }
12163    }
12164
12165    result = split(s, sep, maxsplit);
12166
12167    Py_DECREF(s);
12168    Py_XDECREF(sep);
12169    return result;
12170}
12171
12172PyDoc_STRVAR(split__doc__,
12173             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12174\n\
12175Return a list of the words in S, using sep as the\n\
12176delimiter string.  If maxsplit is given, at most maxsplit\n\
12177splits are done. If sep is not specified or is None, any\n\
12178whitespace string is a separator and empty strings are\n\
12179removed from the result.");
12180
12181static PyObject*
12182unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12183{
12184    static char *kwlist[] = {"sep", "maxsplit", 0};
12185    PyObject *substring = Py_None;
12186    Py_ssize_t maxcount = -1;
12187
12188    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12189                                     kwlist, &substring, &maxcount))
12190        return NULL;
12191
12192    if (substring == Py_None)
12193        return split(self, NULL, maxcount);
12194    else if (PyUnicode_Check(substring))
12195        return split(self, substring, maxcount);
12196    else
12197        return PyUnicode_Split(self, substring, maxcount);
12198}
12199
12200PyObject *
12201PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12202{
12203    PyObject* str_obj;
12204    PyObject* sep_obj;
12205    PyObject* out;
12206    int kind1, kind2, kind;
12207    void *buf1 = NULL, *buf2 = NULL;
12208    Py_ssize_t len1, len2;
12209
12210    str_obj = PyUnicode_FromObject(str_in);
12211    if (!str_obj)
12212        return NULL;
12213    sep_obj = PyUnicode_FromObject(sep_in);
12214    if (!sep_obj) {
12215        Py_DECREF(str_obj);
12216        return NULL;
12217    }
12218    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12219        Py_DECREF(sep_obj);
12220        Py_DECREF(str_obj);
12221        return NULL;
12222    }
12223
12224    kind1 = PyUnicode_KIND(str_obj);
12225    kind2 = PyUnicode_KIND(sep_obj);
12226    kind = Py_MAX(kind1, kind2);
12227    buf1 = PyUnicode_DATA(str_obj);
12228    if (kind1 != kind)
12229        buf1 = _PyUnicode_AsKind(str_obj, kind);
12230    if (!buf1)
12231        goto onError;
12232    buf2 = PyUnicode_DATA(sep_obj);
12233    if (kind2 != kind)
12234        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12235    if (!buf2)
12236        goto onError;
12237    len1 = PyUnicode_GET_LENGTH(str_obj);
12238    len2 = PyUnicode_GET_LENGTH(sep_obj);
12239
12240    switch (PyUnicode_KIND(str_obj)) {
12241    case PyUnicode_1BYTE_KIND:
12242        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12243            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12244        else
12245            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12246        break;
12247    case PyUnicode_2BYTE_KIND:
12248        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12249        break;
12250    case PyUnicode_4BYTE_KIND:
12251        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12252        break;
12253    default:
12254        assert(0);
12255        out = 0;
12256    }
12257
12258    Py_DECREF(sep_obj);
12259    Py_DECREF(str_obj);
12260    if (kind1 != kind)
12261        PyMem_Free(buf1);
12262    if (kind2 != kind)
12263        PyMem_Free(buf2);
12264
12265    return out;
12266  onError:
12267    Py_DECREF(sep_obj);
12268    Py_DECREF(str_obj);
12269    if (kind1 != kind && buf1)
12270        PyMem_Free(buf1);
12271    if (kind2 != kind && buf2)
12272        PyMem_Free(buf2);
12273    return NULL;
12274}
12275
12276
12277PyObject *
12278PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12279{
12280    PyObject* str_obj;
12281    PyObject* sep_obj;
12282    PyObject* out;
12283    int kind1, kind2, kind;
12284    void *buf1 = NULL, *buf2 = NULL;
12285    Py_ssize_t len1, len2;
12286
12287    str_obj = PyUnicode_FromObject(str_in);
12288    if (!str_obj)
12289        return NULL;
12290    sep_obj = PyUnicode_FromObject(sep_in);
12291    if (!sep_obj) {
12292        Py_DECREF(str_obj);
12293        return NULL;
12294    }
12295
12296    kind1 = PyUnicode_KIND(str_in);
12297    kind2 = PyUnicode_KIND(sep_obj);
12298    kind = Py_MAX(kind1, kind2);
12299    buf1 = PyUnicode_DATA(str_in);
12300    if (kind1 != kind)
12301        buf1 = _PyUnicode_AsKind(str_in, kind);
12302    if (!buf1)
12303        goto onError;
12304    buf2 = PyUnicode_DATA(sep_obj);
12305    if (kind2 != kind)
12306        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12307    if (!buf2)
12308        goto onError;
12309    len1 = PyUnicode_GET_LENGTH(str_obj);
12310    len2 = PyUnicode_GET_LENGTH(sep_obj);
12311
12312    switch (PyUnicode_KIND(str_in)) {
12313    case PyUnicode_1BYTE_KIND:
12314        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12315            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12316        else
12317            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12318        break;
12319    case PyUnicode_2BYTE_KIND:
12320        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12321        break;
12322    case PyUnicode_4BYTE_KIND:
12323        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12324        break;
12325    default:
12326        assert(0);
12327        out = 0;
12328    }
12329
12330    Py_DECREF(sep_obj);
12331    Py_DECREF(str_obj);
12332    if (kind1 != kind)
12333        PyMem_Free(buf1);
12334    if (kind2 != kind)
12335        PyMem_Free(buf2);
12336
12337    return out;
12338  onError:
12339    Py_DECREF(sep_obj);
12340    Py_DECREF(str_obj);
12341    if (kind1 != kind && buf1)
12342        PyMem_Free(buf1);
12343    if (kind2 != kind && buf2)
12344        PyMem_Free(buf2);
12345    return NULL;
12346}
12347
12348PyDoc_STRVAR(partition__doc__,
12349             "S.partition(sep) -> (head, sep, tail)\n\
12350\n\
12351Search for the separator sep in S, and return the part before it,\n\
12352the separator itself, and the part after it.  If the separator is not\n\
12353found, return S and two empty strings.");
12354
12355static PyObject*
12356unicode_partition(PyObject *self, PyObject *separator)
12357{
12358    return PyUnicode_Partition(self, separator);
12359}
12360
12361PyDoc_STRVAR(rpartition__doc__,
12362             "S.rpartition(sep) -> (head, sep, tail)\n\
12363\n\
12364Search for the separator sep in S, starting at the end of S, and return\n\
12365the part before it, the separator itself, and the part after it.  If the\n\
12366separator is not found, return two empty strings and S.");
12367
12368static PyObject*
12369unicode_rpartition(PyObject *self, PyObject *separator)
12370{
12371    return PyUnicode_RPartition(self, separator);
12372}
12373
12374PyObject *
12375PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12376{
12377    PyObject *result;
12378
12379    s = PyUnicode_FromObject(s);
12380    if (s == NULL)
12381        return NULL;
12382    if (sep != NULL) {
12383        sep = PyUnicode_FromObject(sep);
12384        if (sep == NULL) {
12385            Py_DECREF(s);
12386            return NULL;
12387        }
12388    }
12389
12390    result = rsplit(s, sep, maxsplit);
12391
12392    Py_DECREF(s);
12393    Py_XDECREF(sep);
12394    return result;
12395}
12396
12397PyDoc_STRVAR(rsplit__doc__,
12398             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12399\n\
12400Return a list of the words in S, using sep as the\n\
12401delimiter string, starting at the end of the string and\n\
12402working to the front.  If maxsplit is given, at most maxsplit\n\
12403splits are done. If sep is not specified, any whitespace string\n\
12404is a separator.");
12405
12406static PyObject*
12407unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12408{
12409    static char *kwlist[] = {"sep", "maxsplit", 0};
12410    PyObject *substring = Py_None;
12411    Py_ssize_t maxcount = -1;
12412
12413    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12414                                     kwlist, &substring, &maxcount))
12415        return NULL;
12416
12417    if (substring == Py_None)
12418        return rsplit(self, NULL, maxcount);
12419    else if (PyUnicode_Check(substring))
12420        return rsplit(self, substring, maxcount);
12421    else
12422        return PyUnicode_RSplit(self, substring, maxcount);
12423}
12424
12425PyDoc_STRVAR(splitlines__doc__,
12426             "S.splitlines([keepends]) -> list of strings\n\
12427\n\
12428Return a list of the lines in S, breaking at line boundaries.\n\
12429Line breaks are not included in the resulting list unless keepends\n\
12430is given and true.");
12431
12432static PyObject*
12433unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12434{
12435    static char *kwlist[] = {"keepends", 0};
12436    int keepends = 0;
12437
12438    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12439                                     kwlist, &keepends))
12440        return NULL;
12441
12442    return PyUnicode_Splitlines(self, keepends);
12443}
12444
12445static
12446PyObject *unicode_str(PyObject *self)
12447{
12448    return unicode_result_unchanged(self);
12449}
12450
12451PyDoc_STRVAR(swapcase__doc__,
12452             "S.swapcase() -> str\n\
12453\n\
12454Return a copy of S with uppercase characters converted to lowercase\n\
12455and vice versa.");
12456
12457static PyObject*
12458unicode_swapcase(PyObject *self)
12459{
12460    if (PyUnicode_READY(self) == -1)
12461        return NULL;
12462    return case_operation(self, do_swapcase);
12463}
12464
12465PyDoc_STRVAR(maketrans__doc__,
12466             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12467\n\
12468Return a translation table usable for str.translate().\n\
12469If there is only one argument, it must be a dictionary mapping Unicode\n\
12470ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12471Character keys will be then converted to ordinals.\n\
12472If there are two arguments, they must be strings of equal length, and\n\
12473in the resulting dictionary, each character in x will be mapped to the\n\
12474character at the same position in y. If there is a third argument, it\n\
12475must be a string, whose characters will be mapped to None in the result.");
12476
12477static PyObject*
12478unicode_maketrans(PyObject *null, PyObject *args)
12479{
12480    PyObject *x, *y = NULL, *z = NULL;
12481    PyObject *new = NULL, *key, *value;
12482    Py_ssize_t i = 0;
12483    int res;
12484
12485    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12486        return NULL;
12487    new = PyDict_New();
12488    if (!new)
12489        return NULL;
12490    if (y != NULL) {
12491        int x_kind, y_kind, z_kind;
12492        void *x_data, *y_data, *z_data;
12493
12494        /* x must be a string too, of equal length */
12495        if (!PyUnicode_Check(x)) {
12496            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12497                            "be a string if there is a second argument");
12498            goto err;
12499        }
12500        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12501            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12502                            "arguments must have equal length");
12503            goto err;
12504        }
12505        /* create entries for translating chars in x to those in y */
12506        x_kind = PyUnicode_KIND(x);
12507        y_kind = PyUnicode_KIND(y);
12508        x_data = PyUnicode_DATA(x);
12509        y_data = PyUnicode_DATA(y);
12510        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12511            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12512            if (!key)
12513                goto err;
12514            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12515            if (!value) {
12516                Py_DECREF(key);
12517                goto err;
12518            }
12519            res = PyDict_SetItem(new, key, value);
12520            Py_DECREF(key);
12521            Py_DECREF(value);
12522            if (res < 0)
12523                goto err;
12524        }
12525        /* create entries for deleting chars in z */
12526        if (z != NULL) {
12527            z_kind = PyUnicode_KIND(z);
12528            z_data = PyUnicode_DATA(z);
12529            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12530                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12531                if (!key)
12532                    goto err;
12533                res = PyDict_SetItem(new, key, Py_None);
12534                Py_DECREF(key);
12535                if (res < 0)
12536                    goto err;
12537            }
12538        }
12539    } else {
12540        int kind;
12541        void *data;
12542
12543        /* x must be a dict */
12544        if (!PyDict_CheckExact(x)) {
12545            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12546                            "to maketrans it must be a dict");
12547            goto err;
12548        }
12549        /* copy entries into the new dict, converting string keys to int keys */
12550        while (PyDict_Next(x, &i, &key, &value)) {
12551            if (PyUnicode_Check(key)) {
12552                /* convert string keys to integer keys */
12553                PyObject *newkey;
12554                if (PyUnicode_GET_LENGTH(key) != 1) {
12555                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12556                                    "table must be of length 1");
12557                    goto err;
12558                }
12559                kind = PyUnicode_KIND(key);
12560                data = PyUnicode_DATA(key);
12561                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12562                if (!newkey)
12563                    goto err;
12564                res = PyDict_SetItem(new, newkey, value);
12565                Py_DECREF(newkey);
12566                if (res < 0)
12567                    goto err;
12568            } else if (PyLong_Check(key)) {
12569                /* just keep integer keys */
12570                if (PyDict_SetItem(new, key, value) < 0)
12571                    goto err;
12572            } else {
12573                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12574                                "be strings or integers");
12575                goto err;
12576            }
12577        }
12578    }
12579    return new;
12580  err:
12581    Py_DECREF(new);
12582    return NULL;
12583}
12584
12585PyDoc_STRVAR(translate__doc__,
12586             "S.translate(table) -> str\n\
12587\n\
12588Return a copy of the string S, where all characters have been mapped\n\
12589through the given translation table, which must be a mapping of\n\
12590Unicode ordinals to Unicode ordinals, strings, or None.\n\
12591Unmapped characters are left untouched. Characters mapped to None\n\
12592are deleted.");
12593
12594static PyObject*
12595unicode_translate(PyObject *self, PyObject *table)
12596{
12597    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12598}
12599
12600PyDoc_STRVAR(upper__doc__,
12601             "S.upper() -> str\n\
12602\n\
12603Return a copy of S converted to uppercase.");
12604
12605static PyObject*
12606unicode_upper(PyObject *self)
12607{
12608    if (PyUnicode_READY(self) == -1)
12609        return NULL;
12610    if (PyUnicode_IS_ASCII(self))
12611        return ascii_upper_or_lower(self, 0);
12612    return case_operation(self, do_upper);
12613}
12614
12615PyDoc_STRVAR(zfill__doc__,
12616             "S.zfill(width) -> str\n\
12617\n\
12618Pad a numeric string S with zeros on the left, to fill a field\n\
12619of the specified width. The string S is never truncated.");
12620
12621static PyObject *
12622unicode_zfill(PyObject *self, PyObject *args)
12623{
12624    Py_ssize_t fill;
12625    PyObject *u;
12626    Py_ssize_t width;
12627    int kind;
12628    void *data;
12629    Py_UCS4 chr;
12630
12631    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12632        return NULL;
12633
12634    if (PyUnicode_READY(self) == -1)
12635        return NULL;
12636
12637    if (PyUnicode_GET_LENGTH(self) >= width)
12638        return unicode_result_unchanged(self);
12639
12640    fill = width - PyUnicode_GET_LENGTH(self);
12641
12642    u = pad(self, fill, 0, '0');
12643
12644    if (u == NULL)
12645        return NULL;
12646
12647    kind = PyUnicode_KIND(u);
12648    data = PyUnicode_DATA(u);
12649    chr = PyUnicode_READ(kind, data, fill);
12650
12651    if (chr == '+' || chr == '-') {
12652        /* move sign to beginning of string */
12653        PyUnicode_WRITE(kind, data, 0, chr);
12654        PyUnicode_WRITE(kind, data, fill, '0');
12655    }
12656
12657    assert(_PyUnicode_CheckConsistency(u, 1));
12658    return u;
12659}
12660
#if 0
/* Compiled out.  Debug-only wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12668
12669PyDoc_STRVAR(startswith__doc__,
12670             "S.startswith(prefix[, start[, end]]) -> bool\n\
12671\n\
12672Return True if S starts with the specified prefix, False otherwise.\n\
12673With optional start, test S beginning at that position.\n\
12674With optional end, stop comparing S at that position.\n\
12675prefix can also be a tuple of strings to try.");
12676
12677static PyObject *
12678unicode_startswith(PyObject *self,
12679                   PyObject *args)
12680{
12681    PyObject *subobj;
12682    PyObject *substring;
12683    Py_ssize_t start = 0;
12684    Py_ssize_t end = PY_SSIZE_T_MAX;
12685    int result;
12686
12687    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12688        return NULL;
12689    if (PyTuple_Check(subobj)) {
12690        Py_ssize_t i;
12691        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12692            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12693            if (substring == NULL)
12694                return NULL;
12695            result = tailmatch(self, substring, start, end, -1);
12696            Py_DECREF(substring);
12697            if (result) {
12698                Py_RETURN_TRUE;
12699            }
12700        }
12701        /* nothing matched */
12702        Py_RETURN_FALSE;
12703    }
12704    substring = PyUnicode_FromObject(subobj);
12705    if (substring == NULL) {
12706        if (PyErr_ExceptionMatches(PyExc_TypeError))
12707            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12708                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12709        return NULL;
12710    }
12711    result = tailmatch(self, substring, start, end, -1);
12712    Py_DECREF(substring);
12713    return PyBool_FromLong(result);
12714}
12715
12716
12717PyDoc_STRVAR(endswith__doc__,
12718             "S.endswith(suffix[, start[, end]]) -> bool\n\
12719\n\
12720Return True if S ends with the specified suffix, False otherwise.\n\
12721With optional start, test S beginning at that position.\n\
12722With optional end, stop comparing S at that position.\n\
12723suffix can also be a tuple of strings to try.");
12724
12725static PyObject *
12726unicode_endswith(PyObject *self,
12727                 PyObject *args)
12728{
12729    PyObject *subobj;
12730    PyObject *substring;
12731    Py_ssize_t start = 0;
12732    Py_ssize_t end = PY_SSIZE_T_MAX;
12733    int result;
12734
12735    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12736        return NULL;
12737    if (PyTuple_Check(subobj)) {
12738        Py_ssize_t i;
12739        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12740            substring = PyUnicode_FromObject(
12741                PyTuple_GET_ITEM(subobj, i));
12742            if (substring == NULL)
12743                return NULL;
12744            result = tailmatch(self, substring, start, end, +1);
12745            Py_DECREF(substring);
12746            if (result) {
12747                Py_RETURN_TRUE;
12748            }
12749        }
12750        Py_RETURN_FALSE;
12751    }
12752    substring = PyUnicode_FromObject(subobj);
12753    if (substring == NULL) {
12754        if (PyErr_ExceptionMatches(PyExc_TypeError))
12755            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12756                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12757        return NULL;
12758    }
12759    result = tailmatch(self, substring, start, end, +1);
12760    Py_DECREF(substring);
12761    return PyBool_FromLong(result);
12762}
12763
12764Py_LOCAL_INLINE(void)
12765_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
12766{
12767    writer->size = PyUnicode_GET_LENGTH(writer->buffer);
12768    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12769    writer->data = PyUnicode_DATA(writer->buffer);
12770    writer->kind = PyUnicode_KIND(writer->buffer);
12771}
12772
12773void
12774_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
12775{
12776    memset(writer, 0, sizeof(*writer));
12777#ifdef Py_DEBUG
12778    writer->kind = 5;    /* invalid kind */
12779#endif
12780    writer->min_length = Py_MAX(min_length, 100);
12781    writer->overallocate = (min_length > 0);
12782}
12783
12784int
12785_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12786                                 Py_ssize_t length, Py_UCS4 maxchar)
12787{
12788    Py_ssize_t newlen;
12789    PyObject *newbuffer;
12790
12791    assert(length > 0);
12792
12793    if (length > PY_SSIZE_T_MAX - writer->pos) {
12794        PyErr_NoMemory();
12795        return -1;
12796    }
12797    newlen = writer->pos + length;
12798
12799    if (writer->buffer == NULL) {
12800        if (writer->overallocate) {
12801            /* overallocate 25% to limit the number of resize */
12802            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12803                newlen += newlen / 4;
12804            if (newlen < writer->min_length)
12805                newlen = writer->min_length;
12806        }
12807        writer->buffer = PyUnicode_New(newlen, maxchar);
12808        if (writer->buffer == NULL)
12809            return -1;
12810        _PyUnicodeWriter_Update(writer);
12811        return 0;
12812    }
12813
12814    if (newlen > writer->size) {
12815        if (writer->overallocate) {
12816            /* overallocate 25% to limit the number of resize */
12817            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12818                newlen += newlen / 4;
12819            if (newlen < writer->min_length)
12820                newlen = writer->min_length;
12821        }
12822
12823        if (maxchar > writer->maxchar || writer->readonly) {
12824            /* resize + widen */
12825            newbuffer = PyUnicode_New(newlen, maxchar);
12826            if (newbuffer == NULL)
12827                return -1;
12828            _PyUnicode_FastCopyCharacters(newbuffer, 0,
12829                                          writer->buffer, 0, writer->pos);
12830            Py_DECREF(writer->buffer);
12831            writer->readonly = 0;
12832        }
12833        else {
12834            newbuffer = resize_compact(writer->buffer, newlen);
12835            if (newbuffer == NULL)
12836                return -1;
12837        }
12838        writer->buffer = newbuffer;
12839        _PyUnicodeWriter_Update(writer);
12840    }
12841    else if (maxchar > writer->maxchar) {
12842        assert(!writer->readonly);
12843        newbuffer = PyUnicode_New(writer->size, maxchar);
12844        if (newbuffer == NULL)
12845            return -1;
12846        _PyUnicode_FastCopyCharacters(newbuffer, 0,
12847                                      writer->buffer, 0, writer->pos);
12848        Py_DECREF(writer->buffer);
12849        writer->buffer = newbuffer;
12850        _PyUnicodeWriter_Update(writer);
12851    }
12852    return 0;
12853}
12854
12855int
12856_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12857{
12858    Py_UCS4 maxchar;
12859    Py_ssize_t len;
12860
12861    if (PyUnicode_READY(str) == -1)
12862        return -1;
12863    len = PyUnicode_GET_LENGTH(str);
12864    if (len == 0)
12865        return 0;
12866    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12867    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
12868        if (writer->buffer == NULL && !writer->overallocate) {
12869            Py_INCREF(str);
12870            writer->buffer = str;
12871            _PyUnicodeWriter_Update(writer);
12872            writer->readonly = 1;
12873            writer->size = 0;
12874            writer->pos += len;
12875            return 0;
12876        }
12877        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12878            return -1;
12879    }
12880    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12881                                  str, 0, len);
12882    writer->pos += len;
12883    return 0;
12884}
12885
12886PyObject *
12887_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
12888{
12889    if (writer->pos == 0) {
12890        Py_XDECREF(writer->buffer);
12891        Py_INCREF(unicode_empty);
12892        return unicode_empty;
12893    }
12894    if (writer->readonly) {
12895        assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12896        return writer->buffer;
12897    }
12898    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12899        PyObject *newbuffer;
12900        newbuffer = resize_compact(writer->buffer, writer->pos);
12901        if (newbuffer == NULL) {
12902            Py_DECREF(writer->buffer);
12903            return NULL;
12904        }
12905        writer->buffer = newbuffer;
12906    }
12907    assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
12908    return writer->buffer;
12909}
12910
12911void
12912_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
12913{
12914    Py_CLEAR(writer->buffer);
12915}
12916
12917#include "stringlib/unicode_format.h"
12918
12919PyDoc_STRVAR(format__doc__,
12920             "S.format(*args, **kwargs) -> str\n\
12921\n\
12922Return a formatted version of S, using substitutions from args and kwargs.\n\
12923The substitutions are identified by braces ('{' and '}').");
12924
12925PyDoc_STRVAR(format_map__doc__,
12926             "S.format_map(mapping) -> str\n\
12927\n\
12928Return a formatted version of S, using substitutions from mapping.\n\
12929The substitutions are identified by braces ('{' and '}').");
12930
12931static PyObject *
12932unicode__format__(PyObject* self, PyObject* args)
12933{
12934    PyObject *format_spec;
12935    _PyUnicodeWriter writer;
12936    int ret;
12937
12938    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12939        return NULL;
12940
12941    if (PyUnicode_READY(self) == -1)
12942        return NULL;
12943    _PyUnicodeWriter_Init(&writer, 0);
12944    ret = _PyUnicode_FormatAdvancedWriter(&writer,
12945                                          self, format_spec, 0,
12946                                          PyUnicode_GET_LENGTH(format_spec));
12947    if (ret == -1) {
12948        _PyUnicodeWriter_Dealloc(&writer);
12949        return NULL;
12950    }
12951    return _PyUnicodeWriter_Finish(&writer);
12952}
12953
12954PyDoc_STRVAR(p_format__doc__,
12955             "S.__format__(format_spec) -> str\n\
12956\n\
12957Return a formatted version of S as described by format_spec.");
12958
12959static PyObject *
12960unicode__sizeof__(PyObject *v)
12961{
12962    Py_ssize_t size;
12963
12964    /* If it's a compact object, account for base structure +
12965       character data. */
12966    if (PyUnicode_IS_COMPACT_ASCII(v))
12967        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12968    else if (PyUnicode_IS_COMPACT(v))
12969        size = sizeof(PyCompactUnicodeObject) +
12970            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
12971    else {
12972        /* If it is a two-block object, account for base object, and
12973           for character block if present. */
12974        size = sizeof(PyUnicodeObject);
12975        if (_PyUnicode_DATA_ANY(v))
12976            size += (PyUnicode_GET_LENGTH(v) + 1) *
12977                PyUnicode_KIND(v);
12978    }
12979    /* If the wstr pointer is present, account for it unless it is shared
12980       with the data pointer. Check if the data is not shared. */
12981    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12982        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12983    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12984        size += PyUnicode_UTF8_LENGTH(v) + 1;
12985
12986    return PyLong_FromSsize_t(size);
12987}
12988
12989PyDoc_STRVAR(sizeof__doc__,
12990             "S.__sizeof__() -> size of S in memory, in bytes");
12991
12992static PyObject *
12993unicode_getnewargs(PyObject *v)
12994{
12995    PyObject *copy = _PyUnicode_Copy(v);
12996    if (!copy)
12997        return NULL;
12998    return Py_BuildValue("(N)", copy);
12999}
13000
/* Method table: one entry per Python-level method name, giving the C
   function that implements it, its calling convention flags and its
   docstring.  The table ends with a {NULL, NULL} sentinel.
   NOTE(review): presumably installed as the tp_methods of the str type;
   the type object is outside this view. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* format and format_map are implemented by functions that are not
       defined in this part of the file. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry flagged METH_STATIC. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
13057
13058static PyObject *
13059unicode_mod(PyObject *v, PyObject *w)
13060{
13061    if (!PyUnicode_Check(v))
13062        Py_RETURN_NOTIMPLEMENTED;
13063    return PyUnicode_Format(v, w);
13064}
13065
/* Number protocol table for str.  Only nb_remainder is populated, so the
   sole numeric operator handled here is `%`, which is routed to
   unicode_mod().  The slots listed before it are explicitly empty and the
   slots after it are left to C's implicit zero-initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13072
/* Sequence protocol table for str: length, concatenation, repetition,
   integer indexing and containment are provided.  The slice and
   item-assignment slots are left empty (0). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13083
13084static PyObject*
13085unicode_subscript(PyObject* self, PyObject* item)
13086{
13087    if (PyUnicode_READY(self) == -1)
13088        return NULL;
13089
13090    if (PyIndex_Check(item)) {
13091        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13092        if (i == -1 && PyErr_Occurred())
13093            return NULL;
13094        if (i < 0)
13095            i += PyUnicode_GET_LENGTH(self);
13096        return unicode_getitem(self, i);
13097    } else if (PySlice_Check(item)) {
13098        Py_ssize_t start, stop, step, slicelength, cur, i;
13099        PyObject *result;
13100        void *src_data, *dest_data;
13101        int src_kind, dest_kind;
13102        Py_UCS4 ch, max_char, kind_limit;
13103
13104        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13105                                 &start, &stop, &step, &slicelength) < 0) {
13106            return NULL;
13107        }
13108
13109        if (slicelength <= 0) {
13110            Py_INCREF(unicode_empty);
13111            return unicode_empty;
13112        } else if (start == 0 && step == 1 &&
13113                   slicelength == PyUnicode_GET_LENGTH(self)) {
13114            return unicode_result_unchanged(self);
13115        } else if (step == 1) {
13116            return PyUnicode_Substring(self,
13117                                       start, start + slicelength);
13118        }
13119        /* General case */
13120        src_kind = PyUnicode_KIND(self);
13121        src_data = PyUnicode_DATA(self);
13122        if (!PyUnicode_IS_ASCII(self)) {
13123            kind_limit = kind_maxchar_limit(src_kind);
13124            max_char = 0;
13125            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13126                ch = PyUnicode_READ(src_kind, src_data, cur);
13127                if (ch > max_char) {
13128                    max_char = ch;
13129                    if (max_char >= kind_limit)
13130                        break;
13131                }
13132            }
13133        }
13134        else
13135            max_char = 127;
13136        result = PyUnicode_New(slicelength, max_char);
13137        if (result == NULL)
13138            return NULL;
13139        dest_kind = PyUnicode_KIND(result);
13140        dest_data = PyUnicode_DATA(result);
13141
13142        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13143            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13144            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13145        }
13146        assert(_PyUnicode_CheckConsistency(result, 1));
13147        return result;
13148    } else {
13149        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13150        return NULL;
13151    }
13152}
13153
/* Mapping protocol table for str.  mp_subscript points at
   unicode_subscript(), which accepts both index-like objects and slices;
   mp_ass_subscript is empty, so subscript assignment is not provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13159
13160
13161/* Helpers for PyUnicode_Format() */
13162
13163static PyObject *
13164getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
13165{
13166    Py_ssize_t argidx = *p_argidx;
13167    if (argidx < arglen) {
13168        (*p_argidx)++;
13169        if (arglen < 0)
13170            return args;
13171        else
13172            return PyTuple_GetItem(args, argidx);
13173    }
13174    PyErr_SetString(PyExc_TypeError,
13175                    "not enough arguments for format string");
13176    return NULL;
13177}
13178
13179/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13180
13181static int
13182formatfloat(PyObject *v, int flags, int prec, int type,
13183            PyObject **p_output, _PyUnicodeWriter *writer)
13184{
13185    char *p;
13186    double x;
13187    Py_ssize_t len;
13188
13189    x = PyFloat_AsDouble(v);
13190    if (x == -1.0 && PyErr_Occurred())
13191        return -1;
13192
13193    if (prec < 0)
13194        prec = 6;
13195
13196    p = PyOS_double_to_string(x, type, prec,
13197                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
13198    if (p == NULL)
13199        return -1;
13200    len = strlen(p);
13201    if (writer) {
13202        if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13203            return -1;
13204        memcpy((char*)writer->data + writer->pos * writer->kind,
13205               p,
13206               len);
13207        writer->pos += len;
13208    }
13209    else
13210        *p_output = _PyUnicode_FromASCII(p, len);
13211    PyMem_Free(p);
13212    return 0;
13213}
13214
13215/* formatlong() emulates the format codes d, u, o, x and X, and
13216 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13217 * Python's regular ints.
13218 * Return value:  a new PyUnicodeObject*, or NULL if error.
13219 *     The output string is of the form
13220 *         "-"? ("0x" | "0X")? digit+
13221 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13222 *         set in flags.  The case of hex digits will be correct,
13223 *     There will be at least prec digits, zero-filled on the left if
13224 *         necessary to get that many.
13225 * val          object to be converted
13226 * flags        bitmask of format flags; only F_ALT is looked at
13227 * prec         minimum number of digits; 0-fill on left if needed
13228 * type         a character in [duoxX]; u acts the same as d
13229 *
13230 * CAUTION:  o, x and X conversions on regular ints can never
13231 * produce a '-' sign, but can for Python's unbounded ints.
13232 */
13233static PyObject*
13234formatlong(PyObject *val, int flags, int prec, int type)
13235{
13236    PyObject *result = NULL;
13237    char *buf;
13238    Py_ssize_t i;
13239    int sign;           /* 1 if '-', else 0 */
13240    int len;            /* number of characters */
13241    Py_ssize_t llen;
13242    int numdigits;      /* len == numnondigits + numdigits */
13243    int numnondigits = 0;
13244
13245    /* Avoid exceeding SSIZE_T_MAX */
13246    if (prec > INT_MAX-3) {
13247        PyErr_SetString(PyExc_OverflowError,
13248                        "precision too large");
13249        return NULL;
13250    }
13251
13252    assert(PyLong_Check(val));
13253
13254    switch (type) {
13255    case 'd':
13256    case 'u':
13257        /* Special-case boolean: we want 0/1 */
13258        if (PyBool_Check(val))
13259            result = PyNumber_ToBase(val, 10);
13260        else
13261            result = Py_TYPE(val)->tp_str(val);
13262        break;
13263    case 'o':
13264        numnondigits = 2;
13265        result = PyNumber_ToBase(val, 8);
13266        break;
13267    case 'x':
13268    case 'X':
13269        numnondigits = 2;
13270        result = PyNumber_ToBase(val, 16);
13271        break;
13272    default:
13273        assert(!"'type' not in [duoxX]");
13274    }
13275    if (!result)
13276        return NULL;
13277
13278    assert(unicode_modifiable(result));
13279    assert(PyUnicode_IS_READY(result));
13280    assert(PyUnicode_IS_ASCII(result));
13281
13282    /* To modify the string in-place, there can only be one reference. */
13283    if (Py_REFCNT(result) != 1) {
13284        PyErr_BadInternalCall();
13285        return NULL;
13286    }
13287    buf = PyUnicode_DATA(result);
13288    llen = PyUnicode_GET_LENGTH(result);
13289    if (llen > INT_MAX) {
13290        PyErr_SetString(PyExc_ValueError,
13291                        "string too large in _PyBytes_FormatLong");
13292        return NULL;
13293    }
13294    len = (int)llen;
13295    sign = buf[0] == '-';
13296    numnondigits += sign;
13297    numdigits = len - numnondigits;
13298    assert(numdigits > 0);
13299
13300    /* Get rid of base marker unless F_ALT */
13301    if (((flags & F_ALT) == 0 &&
13302        (type == 'o' || type == 'x' || type == 'X'))) {
13303        assert(buf[sign] == '0');
13304        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13305               buf[sign+1] == 'o');
13306        numnondigits -= 2;
13307        buf += 2;
13308        len -= 2;
13309        if (sign)
13310            buf[0] = '-';
13311        assert(len == numnondigits + numdigits);
13312        assert(numdigits > 0);
13313    }
13314
13315    /* Fill with leading zeroes to meet minimum width. */
13316    if (prec > numdigits) {
13317        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13318                                numnondigits + prec);
13319        char *b1;
13320        if (!r1) {
13321            Py_DECREF(result);
13322            return NULL;
13323        }
13324        b1 = PyBytes_AS_STRING(r1);
13325        for (i = 0; i < numnondigits; ++i)
13326            *b1++ = *buf++;
13327        for (i = 0; i < prec - numdigits; i++)
13328            *b1++ = '0';
13329        for (i = 0; i < numdigits; i++)
13330            *b1++ = *buf++;
13331        *b1 = '\0';
13332        Py_DECREF(result);
13333        result = r1;
13334        buf = PyBytes_AS_STRING(result);
13335        len = numnondigits + prec;
13336    }
13337
13338    /* Fix up case for hex conversions. */
13339    if (type == 'X') {
13340        /* Need to convert all lower case letters to upper case.
13341           and need to convert 0x to 0X (and -0x to -0X). */
13342        for (i = 0; i < len; i++)
13343            if (buf[i] >= 'a' && buf[i] <= 'x')
13344                buf[i] -= 'a'-'A';
13345    }
13346    if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13347        PyObject *unicode;
13348        unicode = _PyUnicode_FromASCII(buf, len);
13349        Py_DECREF(result);
13350        result = unicode;
13351    }
13352    return result;
13353}
13354
13355static Py_UCS4
13356formatchar(PyObject *v)
13357{
13358    /* presume that the buffer is at least 3 characters long */
13359    if (PyUnicode_Check(v)) {
13360        if (PyUnicode_GET_LENGTH(v) == 1) {
13361            return PyUnicode_READ_CHAR(v, 0);
13362        }
13363        goto onError;
13364    }
13365    else {
13366        /* Integer input truncated to a character */
13367        long x;
13368        x = PyLong_AsLong(v);
13369        if (x == -1 && PyErr_Occurred())
13370            goto onError;
13371
13372        if (x < 0 || x > MAX_UNICODE) {
13373            PyErr_SetString(PyExc_OverflowError,
13374                            "%c arg not in range(0x110000)");
13375            return (Py_UCS4) -1;
13376        }
13377
13378        return (Py_UCS4) x;
13379    }
13380
13381  onError:
13382    PyErr_SetString(PyExc_TypeError,
13383                    "%c requires int or char");
13384    return (Py_UCS4) -1;
13385}
13386
13387PyObject *
13388PyUnicode_Format(PyObject *format, PyObject *args)
13389{
13390    Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
13391    int args_owned = 0;
13392    PyObject *dict = NULL;
13393    PyObject *temp = NULL;
13394    PyObject *second = NULL;
13395    PyObject *uformat;
13396    void *fmt;
13397    enum PyUnicode_Kind kind, fmtkind;
13398    _PyUnicodeWriter writer;
13399    Py_ssize_t sublen;
13400    Py_UCS4 maxchar;
13401
13402    if (format == NULL || args == NULL) {
13403        PyErr_BadInternalCall();
13404        return NULL;
13405    }
13406    uformat = PyUnicode_FromObject(format);
13407    if (uformat == NULL)
13408        return NULL;
13409    if (PyUnicode_READY(uformat) == -1)
13410        Py_DECREF(uformat);
13411
13412    fmt = PyUnicode_DATA(uformat);
13413    fmtkind = PyUnicode_KIND(uformat);
13414    fmtcnt = PyUnicode_GET_LENGTH(uformat);
13415    fmtpos = 0;
13416
13417    _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
13418
13419    if (PyTuple_Check(args)) {
13420        arglen = PyTuple_Size(args);
13421        argidx = 0;
13422    }
13423    else {
13424        arglen = -1;
13425        argidx = -2;
13426    }
13427    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
13428        !PyUnicode_Check(args))
13429        dict = args;
13430
13431    while (--fmtcnt >= 0) {
13432        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13433            Py_ssize_t nonfmtpos;
13434            nonfmtpos = fmtpos++;
13435            while (fmtcnt >= 0 &&
13436                   PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13437                fmtpos++;
13438                fmtcnt--;
13439            }
13440            if (fmtcnt < 0)
13441                fmtpos--;
13442            sublen = fmtpos - nonfmtpos;
13443            maxchar = _PyUnicode_FindMaxChar(uformat,
13444                                             nonfmtpos, nonfmtpos + sublen);
13445            if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
13446                goto onError;
13447
13448            _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13449                                          uformat, nonfmtpos, sublen);
13450            writer.pos += sublen;
13451        }
13452        else {
13453            /* Got a format specifier */
13454            int flags = 0;
13455            Py_ssize_t width = -1;
13456            int prec = -1;
13457            Py_UCS4 c = '\0';
13458            Py_UCS4 fill;
13459            int sign;
13460            Py_UCS4 signchar;
13461            int isnumok;
13462            PyObject *v = NULL;
13463            void *pbuf = NULL;
13464            Py_ssize_t pindex, len;
13465            Py_UCS4 bufmaxchar;
13466            Py_ssize_t buflen;
13467
13468            fmtpos++;
13469            c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13470            if (c == '(') {
13471                Py_ssize_t keystart;
13472                Py_ssize_t keylen;
13473                PyObject *key;
13474                int pcount = 1;
13475
13476                if (dict == NULL) {
13477                    PyErr_SetString(PyExc_TypeError,
13478                                    "format requires a mapping");
13479                    goto onError;
13480                }
13481                ++fmtpos;
13482                --fmtcnt;
13483                keystart = fmtpos;
13484                /* Skip over balanced parentheses */
13485                while (pcount > 0 && --fmtcnt >= 0) {
13486                    c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13487                    if (c == ')')
13488                        --pcount;
13489                    else if (c == '(')
13490                        ++pcount;
13491                    fmtpos++;
13492                }
13493                keylen = fmtpos - keystart - 1;
13494                if (fmtcnt < 0 || pcount > 0) {
13495                    PyErr_SetString(PyExc_ValueError,
13496                                    "incomplete format key");
13497                    goto onError;
13498                }
13499                key = PyUnicode_Substring(uformat,
13500                                          keystart, keystart + keylen);
13501                if (key == NULL)
13502                    goto onError;
13503                if (args_owned) {
13504                    Py_DECREF(args);
13505                    args_owned = 0;
13506                }
13507                args = PyObject_GetItem(dict, key);
13508                Py_DECREF(key);
13509                if (args == NULL) {
13510                    goto onError;
13511                }
13512                args_owned = 1;
13513                arglen = -1;
13514                argidx = -2;
13515            }
13516            while (--fmtcnt >= 0) {
13517                c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13518                switch (c) {
13519                case '-': flags |= F_LJUST; continue;
13520                case '+': flags |= F_SIGN; continue;
13521                case ' ': flags |= F_BLANK; continue;
13522                case '#': flags |= F_ALT; continue;
13523                case '0': flags |= F_ZERO; continue;
13524                }
13525                break;
13526            }
13527            if (c == '*') {
13528                v = getnextarg(args, arglen, &argidx);
13529                if (v == NULL)
13530                    goto onError;
13531                if (!PyLong_Check(v)) {
13532                    PyErr_SetString(PyExc_TypeError,
13533                                    "* wants int");
13534                    goto onError;
13535                }
13536                width = PyLong_AsLong(v);
13537                if (width == -1 && PyErr_Occurred())
13538                    goto onError;
13539                if (width < 0) {
13540                    flags |= F_LJUST;
13541                    width = -width;
13542                }
13543                if (--fmtcnt >= 0)
13544                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13545            }
13546            else if (c >= '0' && c <= '9') {
13547                width = c - '0';
13548                while (--fmtcnt >= 0) {
13549                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13550                    if (c < '0' || c > '9')
13551                        break;
13552                    /* Since c is unsigned, the RHS would end up as unsigned,
13553                       mixing signed and unsigned comparison. Since c is between
13554                       '0' and '9', casting to int is safe. */
13555                    if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
13556                        PyErr_SetString(PyExc_ValueError,
13557                                        "width too big");
13558                        goto onError;
13559                    }
13560                    width = width*10 + (c - '0');
13561                }
13562            }
13563            if (c == '.') {
13564                prec = 0;
13565                if (--fmtcnt >= 0)
13566                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13567                if (c == '*') {
13568                    v = getnextarg(args, arglen, &argidx);
13569                    if (v == NULL)
13570                        goto onError;
13571                    if (!PyLong_Check(v)) {
13572                        PyErr_SetString(PyExc_TypeError,
13573                                        "* wants int");
13574                        goto onError;
13575                    }
13576                    prec = PyLong_AsLong(v);
13577                    if (prec == -1 && PyErr_Occurred())
13578                        goto onError;
13579                    if (prec < 0)
13580                        prec = 0;
13581                    if (--fmtcnt >= 0)
13582                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13583                }
13584                else if (c >= '0' && c <= '9') {
13585                    prec = c - '0';
13586                    while (--fmtcnt >= 0) {
13587                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13588                        if (c < '0' || c > '9')
13589                            break;
13590                        if (prec > (INT_MAX - ((int)c - '0')) / 10) {
13591                            PyErr_SetString(PyExc_ValueError,
13592                                            "prec too big");
13593                            goto onError;
13594                        }
13595                        prec = prec*10 + (c - '0');
13596                    }
13597                }
13598            } /* prec */
13599            if (fmtcnt >= 0) {
13600                if (c == 'h' || c == 'l' || c == 'L') {
13601                    if (--fmtcnt >= 0)
13602                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13603                }
13604            }
13605            if (fmtcnt < 0) {
13606                PyErr_SetString(PyExc_ValueError,
13607                                "incomplete format");
13608                goto onError;
13609            }
13610            if (fmtcnt == 0)
13611                writer.overallocate = 0;
13612
13613            if (c == '%') {
13614                if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
13615                    goto onError;
13616                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13617                writer.pos += 1;
13618                continue;
13619            }
13620
13621            v = getnextarg(args, arglen, &argidx);
13622            if (v == NULL)
13623                goto onError;
13624
13625            sign = 0;
13626            signchar = '\0';
13627            fill = ' ';
13628            switch (c) {
13629
13630            case 's':
13631            case 'r':
13632            case 'a':
13633                if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13634                    /* Fast path */
13635                    if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13636                        goto onError;
13637                    goto nextarg;
13638                }
13639
13640                if (PyUnicode_CheckExact(v) && c == 's') {
13641                    temp = v;
13642                    Py_INCREF(temp);
13643                }
13644                else {
13645                    if (c == 's')
13646                        temp = PyObject_Str(v);
13647                    else if (c == 'r')
13648                        temp = PyObject_Repr(v);
13649                    else
13650                        temp = PyObject_ASCII(v);
13651                }
13652                break;
13653
13654            case 'i':
13655            case 'd':
13656            case 'u':
13657            case 'o':
13658            case 'x':
13659            case 'X':
13660                if (PyLong_CheckExact(v)
13661                    && width == -1 && prec == -1
13662                    && !(flags & (F_SIGN | F_BLANK)))
13663                {
13664                    /* Fast path */
13665                    switch(c)
13666                    {
13667                    case 'd':
13668                    case 'i':
13669                    case 'u':
13670                        if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13671                            goto onError;
13672                        goto nextarg;
13673                    case 'x':
13674                        if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13675                            goto onError;
13676                        goto nextarg;
13677                    case 'o':
13678                        if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13679                            goto onError;
13680                        goto nextarg;
13681                    default:
13682                        break;
13683                    }
13684                }
13685
13686                isnumok = 0;
13687                if (PyNumber_Check(v)) {
13688                    PyObject *iobj=NULL;
13689
13690                    if (PyLong_Check(v)) {
13691                        iobj = v;
13692                        Py_INCREF(iobj);
13693                    }
13694                    else {
13695                        iobj = PyNumber_Long(v);
13696                    }
13697                    if (iobj!=NULL) {
13698                        if (PyLong_Check(iobj)) {
13699                            isnumok = 1;
13700                            sign = 1;
13701                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
13702                            Py_DECREF(iobj);
13703                        }
13704                        else {
13705                            Py_DECREF(iobj);
13706                        }
13707                    }
13708                }
13709                if (!isnumok) {
13710                    PyErr_Format(PyExc_TypeError,
13711                                 "%%%c format: a number is required, "
13712                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13713                    goto onError;
13714                }
13715                if (flags & F_ZERO)
13716                    fill = '0';
13717                break;
13718
13719            case 'e':
13720            case 'E':
13721            case 'f':
13722            case 'F':
13723            case 'g':
13724            case 'G':
13725                if (width == -1 && prec == -1
13726                    && !(flags & (F_SIGN | F_BLANK)))
13727                {
13728                    /* Fast path */
13729                    if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13730                        goto onError;
13731                    goto nextarg;
13732                }
13733
13734                sign = 1;
13735                if (flags & F_ZERO)
13736                    fill = '0';
13737                if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13738                    temp = NULL;
13739                break;
13740
13741            case 'c':
13742            {
13743                Py_UCS4 ch = formatchar(v);
13744                if (ch == (Py_UCS4) -1)
13745                    goto onError;
13746                if (width == -1 && prec == -1) {
13747                    /* Fast path */
13748                    if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13749                        goto onError;
13750                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13751                    writer.pos += 1;
13752                    goto nextarg;
13753                }
13754                temp = PyUnicode_FromOrdinal(ch);
13755                break;
13756            }
13757
13758            default:
13759                PyErr_Format(PyExc_ValueError,
13760                             "unsupported format character '%c' (0x%x) "
13761                             "at index %zd",
13762                             (31<=c && c<=126) ? (char)c : '?',
13763                             (int)c,
13764                             fmtpos - 1);
13765                goto onError;
13766            }
13767            if (temp == NULL)
13768                goto onError;
13769            assert (PyUnicode_Check(temp));
13770
13771            if (width == -1 && prec == -1
13772                && !(flags & (F_SIGN | F_BLANK)))
13773            {
13774                /* Fast path */
13775                if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13776                    goto onError;
13777                goto nextarg;
13778            }
13779
13780            if (PyUnicode_READY(temp) == -1) {
13781                Py_CLEAR(temp);
13782                goto onError;
13783            }
13784            kind = PyUnicode_KIND(temp);
13785            pbuf = PyUnicode_DATA(temp);
13786            len = PyUnicode_GET_LENGTH(temp);
13787
13788            if (c == 's' || c == 'r' || c == 'a') {
13789                if (prec >= 0 && len > prec)
13790                    len = prec;
13791            }
13792
13793            /* pbuf is initialized here. */
13794            pindex = 0;
13795            if (sign) {
13796                Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13797                if (ch == '-' || ch == '+') {
13798                    signchar = ch;
13799                    len--;
13800                    pindex++;
13801                }
13802                else if (flags & F_SIGN)
13803                    signchar = '+';
13804                else if (flags & F_BLANK)
13805                    signchar = ' ';
13806                else
13807                    sign = 0;
13808            }
13809            if (width < len)
13810                width = len;
13811
13812            /* Compute the length and maximum character of the
13813               written characters */
13814            bufmaxchar = 127;
13815            if (!(flags & F_LJUST)) {
13816                if (sign) {
13817                    if ((width-1) > len)
13818                        bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13819                }
13820                else {
13821                    if (width > len)
13822                        bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13823                }
13824            }
13825            maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
13826            bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13827
13828            buflen = width;
13829            if (sign && len == width)
13830                buflen++;
13831
13832            if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
13833                goto onError;
13834
13835            /* Write characters */
13836            if (sign) {
13837                if (fill != ' ') {
13838                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13839                    writer.pos += 1;
13840                }
13841                if (width > len)
13842                    width--;
13843            }
13844            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13845                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13846                assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
13847                if (fill != ' ') {
13848                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13849                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13850                    writer.pos += 2;
13851                    pindex += 2;
13852                }
13853                width -= 2;
13854                if (width < 0)
13855                    width = 0;
13856                len -= 2;
13857            }
13858            if (width > len && !(flags & F_LJUST)) {
13859                sublen = width - len;
13860                FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13861                writer.pos += sublen;
13862                width = len;
13863            }
13864            if (fill == ' ') {
13865                if (sign) {
13866                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13867                    writer.pos += 1;
13868                }
13869                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13870                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13871                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13872                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13873                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13874                    writer.pos += 2;
13875                    pindex += 2;
13876                }
13877            }
13878
13879            _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13880                                          temp, pindex, len);
13881            writer.pos += len;
13882            if (width > len) {
13883                sublen = width - len;
13884                FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13885                writer.pos += sublen;
13886            }
13887
13888nextarg:
13889            if (dict && (argidx < arglen) && c != '%') {
13890                PyErr_SetString(PyExc_TypeError,
13891                                "not all arguments converted during string formatting");
13892                goto onError;
13893            }
13894            Py_CLEAR(temp);
13895        } /* '%' */
13896    } /* until end */
13897    if (argidx < arglen && !dict) {
13898        PyErr_SetString(PyExc_TypeError,
13899                        "not all arguments converted during string formatting");
13900        goto onError;
13901    }
13902
13903    if (args_owned) {
13904        Py_DECREF(args);
13905    }
13906    Py_DECREF(uformat);
13907    Py_XDECREF(temp);
13908    Py_XDECREF(second);
13909    return _PyUnicodeWriter_Finish(&writer);
13910
13911  onError:
13912    Py_DECREF(uformat);
13913    Py_XDECREF(temp);
13914    Py_XDECREF(second);
13915    _PyUnicodeWriter_Dealloc(&writer);
13916    if (args_owned) {
13917        Py_DECREF(args);
13918    }
13919    return NULL;
13920}
13921
13922static PyObject *
13923unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13924
13925static PyObject *
13926unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13927{
13928    PyObject *x = NULL;
13929    static char *kwlist[] = {"object", "encoding", "errors", 0};
13930    char *encoding = NULL;
13931    char *errors = NULL;
13932
13933    if (type != &PyUnicode_Type)
13934        return unicode_subtype_new(type, args, kwds);
13935    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
13936                                     kwlist, &x, &encoding, &errors))
13937        return NULL;
13938    if (x == NULL) {
13939        Py_INCREF(unicode_empty);
13940        return unicode_empty;
13941    }
13942    if (encoding == NULL && errors == NULL)
13943        return PyObject_Str(x);
13944    else
13945        return PyUnicode_FromEncodedObject(x, encoding, errors);
13946}
13947
13948static PyObject *
13949unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13950{
13951    PyObject *unicode, *self;
13952    Py_ssize_t length, char_size;
13953    int share_wstr, share_utf8;
13954    unsigned int kind;
13955    void *data;
13956
13957    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13958
13959    unicode = unicode_new(&PyUnicode_Type, args, kwds);
13960    if (unicode == NULL)
13961        return NULL;
13962    assert(_PyUnicode_CHECK(unicode));
13963    if (PyUnicode_READY(unicode) == -1) {
13964        Py_DECREF(unicode);
13965        return NULL;
13966    }
13967
13968    self = type->tp_alloc(type, 0);
13969    if (self == NULL) {
13970        Py_DECREF(unicode);
13971        return NULL;
13972    }
13973    kind = PyUnicode_KIND(unicode);
13974    length = PyUnicode_GET_LENGTH(unicode);
13975
13976    _PyUnicode_LENGTH(self) = length;
13977#ifdef Py_DEBUG
13978    _PyUnicode_HASH(self) = -1;
13979#else
13980    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13981#endif
13982    _PyUnicode_STATE(self).interned = 0;
13983    _PyUnicode_STATE(self).kind = kind;
13984    _PyUnicode_STATE(self).compact = 0;
13985    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13986    _PyUnicode_STATE(self).ready = 1;
13987    _PyUnicode_WSTR(self) = NULL;
13988    _PyUnicode_UTF8_LENGTH(self) = 0;
13989    _PyUnicode_UTF8(self) = NULL;
13990    _PyUnicode_WSTR_LENGTH(self) = 0;
13991    _PyUnicode_DATA_ANY(self) = NULL;
13992
13993    share_utf8 = 0;
13994    share_wstr = 0;
13995    if (kind == PyUnicode_1BYTE_KIND) {
13996        char_size = 1;
13997        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13998            share_utf8 = 1;
13999    }
14000    else if (kind == PyUnicode_2BYTE_KIND) {
14001        char_size = 2;
14002        if (sizeof(wchar_t) == 2)
14003            share_wstr = 1;
14004    }
14005    else {
14006        assert(kind == PyUnicode_4BYTE_KIND);
14007        char_size = 4;
14008        if (sizeof(wchar_t) == 4)
14009            share_wstr = 1;
14010    }
14011
14012    /* Ensure we won't overflow the length. */
14013    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14014        PyErr_NoMemory();
14015        goto onError;
14016    }
14017    data = PyObject_MALLOC((length + 1) * char_size);
14018    if (data == NULL) {
14019        PyErr_NoMemory();
14020        goto onError;
14021    }
14022
14023    _PyUnicode_DATA_ANY(self) = data;
14024    if (share_utf8) {
14025        _PyUnicode_UTF8_LENGTH(self) = length;
14026        _PyUnicode_UTF8(self) = data;
14027    }
14028    if (share_wstr) {
14029        _PyUnicode_WSTR_LENGTH(self) = length;
14030        _PyUnicode_WSTR(self) = (wchar_t *)data;
14031    }
14032
14033    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14034              kind * (length + 1));
14035    assert(_PyUnicode_CheckConsistency(self, 1));
14036#ifdef Py_DEBUG
14037    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14038#endif
14039    Py_DECREF(unicode);
14040    return self;
14041
14042onError:
14043    Py_DECREF(unicode);
14044    Py_DECREF(self);
14045    return NULL;
14046}
14047
14048PyDoc_STRVAR(unicode_doc,
14049             "str(string[, encoding[, errors]]) -> str\n\
14050\n\
14051Create a new string object from the given encoded string.\n\
14052encoding defaults to the current default string encoding.\n\
14053errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
14054
14055static PyObject *unicode_iter(PyObject *seq);
14056
14057PyTypeObject PyUnicode_Type = {
14058    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14059    "str",              /* tp_name */
14060    sizeof(PyUnicodeObject),        /* tp_size */
14061    0,                  /* tp_itemsize */
14062    /* Slots */
14063    (destructor)unicode_dealloc,    /* tp_dealloc */
14064    0,                  /* tp_print */
14065    0,                  /* tp_getattr */
14066    0,                  /* tp_setattr */
14067    0,                  /* tp_reserved */
14068    unicode_repr,           /* tp_repr */
14069    &unicode_as_number,         /* tp_as_number */
14070    &unicode_as_sequence,       /* tp_as_sequence */
14071    &unicode_as_mapping,        /* tp_as_mapping */
14072    (hashfunc) unicode_hash,        /* tp_hash*/
14073    0,                  /* tp_call*/
14074    (reprfunc) unicode_str,     /* tp_str */
14075    PyObject_GenericGetAttr,        /* tp_getattro */
14076    0,                  /* tp_setattro */
14077    0,                  /* tp_as_buffer */
14078    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14079    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
14080    unicode_doc,            /* tp_doc */
14081    0,                  /* tp_traverse */
14082    0,                  /* tp_clear */
14083    PyUnicode_RichCompare,      /* tp_richcompare */
14084    0,                  /* tp_weaklistoffset */
14085    unicode_iter,           /* tp_iter */
14086    0,                  /* tp_iternext */
14087    unicode_methods,            /* tp_methods */
14088    0,                  /* tp_members */
14089    0,                  /* tp_getset */
14090    &PyBaseObject_Type,         /* tp_base */
14091    0,                  /* tp_dict */
14092    0,                  /* tp_descr_get */
14093    0,                  /* tp_descr_set */
14094    0,                  /* tp_dictoffset */
14095    0,                  /* tp_init */
14096    0,                  /* tp_alloc */
14097    unicode_new,            /* tp_new */
14098    PyObject_Del,           /* tp_free */
14099};
14100
14101/* Initialize the Unicode implementation */
14102
14103int _PyUnicode_Init(void)
14104{
14105    int i;
14106
14107    /* XXX - move this array to unicodectype.c ? */
14108    Py_UCS2 linebreak[] = {
14109        0x000A, /* LINE FEED */
14110        0x000D, /* CARRIAGE RETURN */
14111        0x001C, /* FILE SEPARATOR */
14112        0x001D, /* GROUP SEPARATOR */
14113        0x001E, /* RECORD SEPARATOR */
14114        0x0085, /* NEXT LINE */
14115        0x2028, /* LINE SEPARATOR */
14116        0x2029, /* PARAGRAPH SEPARATOR */
14117    };
14118
14119    /* Init the implementation */
14120    unicode_empty = PyUnicode_New(0, 0);
14121    if (!unicode_empty)
14122        Py_FatalError("Can't create empty string");
14123    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
14124
14125    for (i = 0; i < 256; i++)
14126        unicode_latin1[i] = NULL;
14127    if (PyType_Ready(&PyUnicode_Type) < 0)
14128        Py_FatalError("Can't initialize 'unicode'");
14129
14130    /* initialize the linebreak bloom filter */
14131    bloom_linebreak = make_bloom_mask(
14132        PyUnicode_2BYTE_KIND, linebreak,
14133        Py_ARRAY_LENGTH(linebreak));
14134
14135    PyType_Ready(&EncodingMapType);
14136
14137#ifdef HAVE_MBCS
14138    winver.dwOSVersionInfoSize = sizeof(winver);
14139    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14140        PyErr_SetFromWindowsErr(0);
14141        return -1;
14142    }
14143#endif
14144    return 0;
14145}
14146
14147/* Finalize the Unicode implementation */
14148
/* Always reports 0; the function performs no other work. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14154
14155void
14156_PyUnicode_Fini(void)
14157{
14158    int i;
14159
14160    Py_XDECREF(unicode_empty);
14161    unicode_empty = NULL;
14162
14163    for (i = 0; i < 256; i++) {
14164        if (unicode_latin1[i]) {
14165            Py_DECREF(unicode_latin1[i]);
14166            unicode_latin1[i] = NULL;
14167        }
14168    }
14169    _PyUnicode_ClearStaticStrings();
14170    (void)PyUnicode_ClearFreeList();
14171}
14172
14173void
14174PyUnicode_InternInPlace(PyObject **p)
14175{
14176    register PyObject *s = *p;
14177    PyObject *t;
14178#ifdef Py_DEBUG
14179    assert(s != NULL);
14180    assert(_PyUnicode_CHECK(s));
14181#else
14182    if (s == NULL || !PyUnicode_Check(s))
14183        return;
14184#endif
14185    /* If it's a subclass, we don't really know what putting
14186       it in the interned dict might do. */
14187    if (!PyUnicode_CheckExact(s))
14188        return;
14189    if (PyUnicode_CHECK_INTERNED(s))
14190        return;
14191    if (interned == NULL) {
14192        interned = PyDict_New();
14193        if (interned == NULL) {
14194            PyErr_Clear(); /* Don't leave an exception */
14195            return;
14196        }
14197    }
14198    /* It might be that the GetItem call fails even
14199       though the key is present in the dictionary,
14200       namely when this happens during a stack overflow. */
14201    Py_ALLOW_RECURSION
14202    t = PyDict_GetItem(interned, s);
14203    Py_END_ALLOW_RECURSION
14204
14205        if (t) {
14206            Py_INCREF(t);
14207            Py_DECREF(*p);
14208            *p = t;
14209            return;
14210        }
14211
14212    PyThreadState_GET()->recursion_critical = 1;
14213    if (PyDict_SetItem(interned, s, s) < 0) {
14214        PyErr_Clear();
14215        PyThreadState_GET()->recursion_critical = 0;
14216        return;
14217    }
14218    PyThreadState_GET()->recursion_critical = 0;
14219    /* The two references in interned are not counted by refcnt.
14220       The deallocator will take care of this */
14221    Py_REFCNT(s) -= 2;
14222    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
14223}
14224
14225void
14226PyUnicode_InternImmortal(PyObject **p)
14227{
14228    PyUnicode_InternInPlace(p);
14229    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
14230        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
14231        Py_INCREF(*p);
14232    }
14233}
14234
14235PyObject *
14236PyUnicode_InternFromString(const char *cp)
14237{
14238    PyObject *s = PyUnicode_FromString(cp);
14239    if (s == NULL)
14240        return NULL;
14241    PyUnicode_InternInPlace(&s);
14242    return s;
14243}
14244
14245void
14246_Py_ReleaseInternedUnicodeStrings(void)
14247{
14248    PyObject *keys;
14249    PyObject *s;
14250    Py_ssize_t i, n;
14251    Py_ssize_t immortal_size = 0, mortal_size = 0;
14252
14253    if (interned == NULL || !PyDict_Check(interned))
14254        return;
14255    keys = PyDict_Keys(interned);
14256    if (keys == NULL || !PyList_Check(keys)) {
14257        PyErr_Clear();
14258        return;
14259    }
14260
14261    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14262       detector, interned unicode strings are not forcibly deallocated;
14263       rather, we give them their stolen references back, and then clear
14264       and DECREF the interned dict. */
14265
14266    n = PyList_GET_SIZE(keys);
14267    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
14268            n);
14269    for (i = 0; i < n; i++) {
14270        s = PyList_GET_ITEM(keys, i);
14271        if (PyUnicode_READY(s) == -1) {
14272            assert(0 && "could not ready string");
14273            fprintf(stderr, "could not ready string\n");
14274        }
14275        switch (PyUnicode_CHECK_INTERNED(s)) {
14276        case SSTATE_NOT_INTERNED:
14277            /* XXX Shouldn't happen */
14278            break;
14279        case SSTATE_INTERNED_IMMORTAL:
14280            Py_REFCNT(s) += 1;
14281            immortal_size += PyUnicode_GET_LENGTH(s);
14282            break;
14283        case SSTATE_INTERNED_MORTAL:
14284            Py_REFCNT(s) += 2;
14285            mortal_size += PyUnicode_GET_LENGTH(s);
14286            break;
14287        default:
14288            Py_FatalError("Inconsistent interned string state.");
14289        }
14290        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
14291    }
14292    fprintf(stderr, "total size of all interned strings: "
14293            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14294            "mortal/immortal\n", mortal_size, immortal_size);
14295    Py_DECREF(keys);
14296    PyDict_Clear(interned);
14297    Py_DECREF(interned);
14298    interned = NULL;
14299}
14300
14301
14302/********************* Unicode Iterator **************************/
14303
14304typedef struct {
14305    PyObject_HEAD
14306    Py_ssize_t it_index;
14307    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14308} unicodeiterobject;
14309
14310static void
14311unicodeiter_dealloc(unicodeiterobject *it)
14312{
14313    _PyObject_GC_UNTRACK(it);
14314    Py_XDECREF(it->it_seq);
14315    PyObject_GC_Del(it);
14316}
14317
14318static int
14319unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14320{
14321    Py_VISIT(it->it_seq);
14322    return 0;
14323}
14324
14325static PyObject *
14326unicodeiter_next(unicodeiterobject *it)
14327{
14328    PyObject *seq, *item;
14329
14330    assert(it != NULL);
14331    seq = it->it_seq;
14332    if (seq == NULL)
14333        return NULL;
14334    assert(_PyUnicode_CHECK(seq));
14335
14336    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14337        int kind = PyUnicode_KIND(seq);
14338        void *data = PyUnicode_DATA(seq);
14339        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14340        item = PyUnicode_FromOrdinal(chr);
14341        if (item != NULL)
14342            ++it->it_index;
14343        return item;
14344    }
14345
14346    Py_DECREF(seq);
14347    it->it_seq = NULL;
14348    return NULL;
14349}
14350
14351static PyObject *
14352unicodeiter_len(unicodeiterobject *it)
14353{
14354    Py_ssize_t len = 0;
14355    if (it->it_seq)
14356        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14357    return PyLong_FromSsize_t(len);
14358}
14359
14360PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14361
14362static PyObject *
14363unicodeiter_reduce(unicodeiterobject *it)
14364{
14365    if (it->it_seq != NULL) {
14366        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
14367                             it->it_seq, it->it_index);
14368    } else {
14369        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14370        if (u == NULL)
14371            return NULL;
14372        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
14373    }
14374}
14375
14376PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14377
14378static PyObject *
14379unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14380{
14381    Py_ssize_t index = PyLong_AsSsize_t(state);
14382    if (index == -1 && PyErr_Occurred())
14383        return NULL;
14384    if (index < 0)
14385        index = 0;
14386    it->it_index = index;
14387    Py_RETURN_NONE;
14388}
14389
14390PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14391
14392static PyMethodDef unicodeiter_methods[] = {
14393    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
14394     length_hint_doc},
14395    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14396     reduce_doc},
14397    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
14398     setstate_doc},
14399    {NULL,      NULL}       /* sentinel */
14400};
14401
14402PyTypeObject PyUnicodeIter_Type = {
14403    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14404    "str_iterator",         /* tp_name */
14405    sizeof(unicodeiterobject),      /* tp_basicsize */
14406    0,                  /* tp_itemsize */
14407    /* methods */
14408    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
14409    0,                  /* tp_print */
14410    0,                  /* tp_getattr */
14411    0,                  /* tp_setattr */
14412    0,                  /* tp_reserved */
14413    0,                  /* tp_repr */
14414    0,                  /* tp_as_number */
14415    0,                  /* tp_as_sequence */
14416    0,                  /* tp_as_mapping */
14417    0,                  /* tp_hash */
14418    0,                  /* tp_call */
14419    0,                  /* tp_str */
14420    PyObject_GenericGetAttr,        /* tp_getattro */
14421    0,                  /* tp_setattro */
14422    0,                  /* tp_as_buffer */
14423    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14424    0,                  /* tp_doc */
14425    (traverseproc)unicodeiter_traverse, /* tp_traverse */
14426    0,                  /* tp_clear */
14427    0,                  /* tp_richcompare */
14428    0,                  /* tp_weaklistoffset */
14429    PyObject_SelfIter,          /* tp_iter */
14430    (iternextfunc)unicodeiter_next,     /* tp_iternext */
14431    unicodeiter_methods,            /* tp_methods */
14432    0,
14433};
14434
14435static PyObject *
14436unicode_iter(PyObject *seq)
14437{
14438    unicodeiterobject *it;
14439
14440    if (!PyUnicode_Check(seq)) {
14441        PyErr_BadInternalCall();
14442        return NULL;
14443    }
14444    if (PyUnicode_READY(seq) == -1)
14445        return NULL;
14446    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14447    if (it == NULL)
14448        return NULL;
14449    it->it_index = 0;
14450    Py_INCREF(seq);
14451    it->it_seq = seq;
14452    _PyObject_GC_TRACK(it);
14453    return (PyObject *)it;
14454}
14455
14456
14457size_t
14458Py_UNICODE_strlen(const Py_UNICODE *u)
14459{
14460    int res = 0;
14461    while(*u++)
14462        res++;
14463    return res;
14464}
14465
14466Py_UNICODE*
14467Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14468{
14469    Py_UNICODE *u = s1;
14470    while ((*u++ = *s2++));
14471    return s1;
14472}
14473
14474Py_UNICODE*
14475Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14476{
14477    Py_UNICODE *u = s1;
14478    while ((*u++ = *s2++))
14479        if (n-- == 0)
14480            break;
14481    return s1;
14482}
14483
14484Py_UNICODE*
14485Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14486{
14487    Py_UNICODE *u1 = s1;
14488    u1 += Py_UNICODE_strlen(u1);
14489    Py_UNICODE_strcpy(u1, s2);
14490    return s1;
14491}
14492
14493int
14494Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14495{
14496    while (*s1 && *s2 && *s1 == *s2)
14497        s1++, s2++;
14498    if (*s1 && *s2)
14499        return (*s1 < *s2) ? -1 : +1;
14500    if (*s1)
14501        return 1;
14502    if (*s2)
14503        return -1;
14504    return 0;
14505}
14506
14507int
14508Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14509{
14510    register Py_UNICODE u1, u2;
14511    for (; n != 0; n--) {
14512        u1 = *s1;
14513        u2 = *s2;
14514        if (u1 != u2)
14515            return (u1 < u2) ? -1 : +1;
14516        if (u1 == '\0')
14517            return 0;
14518        s1++;
14519        s2++;
14520    }
14521    return 0;
14522}
14523
14524Py_UNICODE*
14525Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14526{
14527    const Py_UNICODE *p;
14528    for (p = s; *p; p++)
14529        if (*p == c)
14530            return (Py_UNICODE*)p;
14531    return NULL;
14532}
14533
14534Py_UNICODE*
14535Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14536{
14537    const Py_UNICODE *p;
14538    p = s + Py_UNICODE_strlen(s);
14539    while (p != s) {
14540        p--;
14541        if (*p == c)
14542            return (Py_UNICODE*)p;
14543    }
14544    return NULL;
14545}
14546
14547Py_UNICODE*
14548PyUnicode_AsUnicodeCopy(PyObject *unicode)
14549{
14550    Py_UNICODE *u, *copy;
14551    Py_ssize_t len, size;
14552
14553    if (!PyUnicode_Check(unicode)) {
14554        PyErr_BadArgument();
14555        return NULL;
14556    }
14557    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14558    if (u == NULL)
14559        return NULL;
14560    /* Ensure we won't overflow the size. */
14561    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14562        PyErr_NoMemory();
14563        return NULL;
14564    }
14565    size = len + 1; /* copy the null character */
14566    size *= sizeof(Py_UNICODE);
14567    copy = PyMem_Malloc(size);
14568    if (copy == NULL) {
14569        PyErr_NoMemory();
14570        return NULL;
14571    }
14572    memcpy(copy, u, size);
14573    return copy;
14574}
14575
14576/* A _string module, to export formatter_parser and formatter_field_name_split
14577   to the string.Formatter class implemented in Python. */
14578
14579static PyMethodDef _string_methods[] = {
14580    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14581     METH_O, PyDoc_STR("split the argument as a field name")},
14582    {"formatter_parser", (PyCFunction) formatter_parser,
14583     METH_O, PyDoc_STR("parse the argument as a format string")},
14584    {NULL, NULL}
14585};
14586
14587static struct PyModuleDef _string_module = {
14588    PyModuleDef_HEAD_INIT,
14589    "_string",
14590    PyDoc_STR("string helper module"),
14591    0,
14592    _string_methods,
14593    NULL,
14594    NULL,
14595    NULL,
14596    NULL
14597};
14598
14599PyMODINIT_FUNC
14600PyInit__string(void)
14601{
14602    return PyModule_Create(&_string_module);
14603}
14604
14605
14606#ifdef __cplusplus
14607}
14608#endif
14609