unicodeobject.c revision 1929407406966f9f2093a9e6b421cad39361dbb4
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
58/* --- Globals ------------------------------------------------------------
59
60   The globals are initialized by the _PyUnicode_Init() API and should
61   not be used before calling that API.
62
63*/
64
65
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside the assert()s of this file.  Debug builds
   (Py_DEBUG) run _PyUnicode_CheckConsistency() with check_content=0;
   all other builds only perform the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
78
/* Field accessors.  The macros with a leading underscore are raw lvalue
   accessors: they cast op and touch the field without any check.  The
   macros that start with assert() validate op first. */

/* Raw access to the utf8 pointer of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer of a ready string: for a compact ASCII string it
   is the memory directly following the PyASCIIObject header, otherwise
   the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: for a compact ASCII string it
   is the string length, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr pointer (stored in the PyASCIIObject part). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw access to wstr_length (stored in the PyCompactUnicodeObject part). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw access to the length, state and hash fields of the PyASCIIObject. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Kind and length, guarded only by _PyUnicode_CHECK(); unlike the
   accessors above that assert PyUnicode_IS_READY(), these do not. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
113
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The bitwise OR is >= both operands, so the result is an upper bound of
   the true maximum, not necessarily the exact value. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
118
/* File-local replacement for the PyUnicode_READY macro: the previous
   definition is dropped with #undef.  Evaluates to 0 when op is already
   ready, otherwise to the result of _PyUnicode_Ready(op).  op is checked
   with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
125
/* true if the UTF-8 pointer of the Unicode object is the character data
   itself (no separate UTF-8 buffer); op must not be a compact ASCII
   string */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the Unicode object is the character data
   itself (no separate wchar_t buffer).  The macro only refers to its own
   argument, so it can be used on any expression, not just on a variable
   that happens to be called "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data); a string that is not ready yet always
   owns its wstr block */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each character is converted with a plain cast, so the caller has to
   make sure that every source value fits into to_type.  The "to" argument
   is parenthesized before it is cast, so an expression such as
   "buffer + offset" is converted as a whole.  The bulk of the input is
   copied four characters per iteration; the remainder one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point; an entry stays
   NULL until a string for that code point has been cached. */
static PyObject *unicode_latin1[256];
193
/* Fast detection of the most frequent whitespace characters: one flag per
   ASCII code point (0x00-0x7F), 1 for whitespace and 0 otherwise.
   Sixteen code points per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
224
225/* forward */
226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
227static PyObject* get_latin1_char(unsigned char ch);
228static int unicode_modifiable(PyObject *unicode);
229
230
231static PyObject *
232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
239unicode_encode_call_errorhandler(const char *errors,
240       PyObject **errorHandler,const char *encoding, const char *reason,
241       PyObject *unicode, PyObject **exceptionObject,
242       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
244static void
245raise_encode_exception(PyObject **exceptionObject,
246                       const char *encoding,
247                       PyObject *unicode,
248                       Py_ssize_t startpos, Py_ssize_t endpos,
249                       const char *reason);
250
/* Fast detection of the ASCII line break characters: one flag per ASCII
   code point (0x00-0x7F), 1 for a line break and 0 otherwise.  The table
   is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
278
279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280   This function is kept for backward compatibility with the old API. */
281Py_UNICODE
282PyUnicode_GetMax(void)
283{
284#ifdef Py_UNICODE_WIDE
285    return 0x10FFFF;
286#else
287    /* This is actually an illegal character, so it should
288       not be passed to unichr. */
289    return 0xFFFF;
290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-build self check of a Unicode object.

   Every invariant is verified with assert(), so a violation aborts the
   process; when all checks pass the function returns 1, which lets callers
   write assert(_PyUnicode_CheckConsistency(op, ...)).

   op:            object to inspect; must satisfy PyUnicode_Check().
   check_content: if non-zero (and the string is not of the wchar kind),
                  also scan the characters to verify that the narrowest
                  possible kind is used and that the data is terminated
                  by a null character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII: only the PyASCIIObject fields exist */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is populated */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact: characters live in a separate block */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character data exactly when the kind has
               the same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character data must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
408
409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413    Py_ssize_t len;
414
415    assert(Py_REFCNT(unicode) == 1);
416
417    len = _PyUnicode_WSTR_LENGTH(unicode);
418    if (len == 0) {
419        Py_INCREF(unicode_empty);
420        Py_DECREF(unicode);
421        return unicode_empty;
422    }
423
424    if (len == 1) {
425        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
426        if (ch < 256) {
427            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
428            Py_DECREF(unicode);
429            return latin1_char;
430        }
431    }
432
433    if (_PyUnicode_Ready(unicode) < 0) {
434        Py_XDECREF(unicode);
435        return NULL;
436    }
437#else
438    /* don't make the result ready in debug mode to ensure that the caller
439       makes the string ready before using it */
440    assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442    return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448    Py_ssize_t length;
449
450    length = PyUnicode_GET_LENGTH(unicode);
451    if (length == 0) {
452        if (unicode != unicode_empty) {
453            Py_INCREF(unicode_empty);
454            Py_DECREF(unicode);
455        }
456        return unicode_empty;
457    }
458
459    if (length == 1) {
460        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461        if (ch < 256) {
462            PyObject *latin1_char = unicode_latin1[ch];
463            if (latin1_char != NULL) {
464                if (unicode != latin1_char) {
465                    Py_INCREF(latin1_char);
466                    Py_DECREF(unicode);
467                }
468                return latin1_char;
469            }
470            else {
471                assert(_PyUnicode_CheckConsistency(unicode, 1));
472                Py_INCREF(unicode);
473                unicode_latin1[ch] = unicode;
474                return unicode;
475            }
476        }
477    }
478
479    assert(_PyUnicode_CheckConsistency(unicode, 1));
480    return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486    assert(_PyUnicode_CHECK(unicode));
487    if (PyUnicode_IS_READY(unicode))
488        return unicode_result_ready(unicode);
489    else
490        return unicode_result_wchar(unicode);
491}
492
493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496    if (PyUnicode_CheckExact(unicode)) {
497        if (PyUnicode_READY(unicode) == -1)
498            return NULL;
499        Py_INCREF(unicode);
500        return unicode;
501    }
502    else
503        /* Subtype -- return genuine unicode string with the same value. */
504        return _PyUnicode_Copy(unicode);
505}
506
#ifdef HAVE_MBCS
/* Windows version information, only compiled in when HAVE_MBCS is defined.
   NOTE(review): it is neither filled in nor read in this part of the file;
   presumably used by the MBCS codec -- confirm against the users. */
static OSVERSIONINFOEX winver;
#endif
510
511/* --- Bloom Filters ----------------------------------------------------- */
512
513/* stuff to implement simple "bloom filters" for Unicode characters.
514   to keep things simple, we use a single bitmask, using the least 5
515   bits from each unicode characters as the bit index. */
516
517/* the linebreak mask is set up by Unicode_Init below */
518
519#if LONG_BIT >= 128
520#define BLOOM_WIDTH 128
521#elif LONG_BIT >= 64
522#define BLOOM_WIDTH 64
523#elif LONG_BIT >= 32
524#define BLOOM_WIDTH 32
525#else
526#error "LONG_BIT is smaller than 32"
527#endif
528
529#define BLOOM_MASK unsigned long
530
531static BLOOM_MASK bloom_linebreak;
532
533#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
534#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
535
536#define BLOOM_LINEBREAK(ch)                                             \
537    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
538     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
539
540Py_LOCAL_INLINE(BLOOM_MASK)
541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
542{
543    /* calculate simple bloom-style bitmask for a given unicode string */
544
545    BLOOM_MASK mask;
546    Py_ssize_t i;
547
548    mask = 0;
549    for (i = 0; i < len; i++)
550        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
551
552    return mask;
553}
554
/* true if chr occurs in the string object str.  mask is the bloom mask
   consulted first; the exact PyUnicode_FindChar() search only runs when
   the mask reports a possible hit.  chr and str are evaluated more than
   once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
558
559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
605#include "stringlib/undef.h"
606
607/* --- Unicode Object ----------------------------------------------------- */
608
609static PyObject *
610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
611
612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613                                     Py_ssize_t size, Py_UCS4 ch,
614                                     int direction)
615{
616    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618    switch (kind) {
619    case PyUnicode_1BYTE_KIND:
620        {
621            Py_UCS1 ch1 = (Py_UCS1) ch;
622            if (ch1 == ch)
623                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624            else
625                return -1;
626        }
627    case PyUnicode_2BYTE_KIND:
628        {
629            Py_UCS2 ch2 = (Py_UCS2) ch;
630            if (ch2 == ch)
631                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632            else
633                return -1;
634        }
635    case PyUnicode_4BYTE_KIND:
636        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637    default:
638        assert(0);
639        return -1;
640    }
641}
642
643static PyObject*
644resize_compact(PyObject *unicode, Py_ssize_t length)
645{
646    Py_ssize_t char_size;
647    Py_ssize_t struct_size;
648    Py_ssize_t new_size;
649    int share_wstr;
650    PyObject *new_unicode;
651    assert(unicode_modifiable(unicode));
652    assert(PyUnicode_IS_READY(unicode));
653    assert(PyUnicode_IS_COMPACT(unicode));
654
655    char_size = PyUnicode_KIND(unicode);
656    if (PyUnicode_IS_ASCII(unicode))
657        struct_size = sizeof(PyASCIIObject);
658    else
659        struct_size = sizeof(PyCompactUnicodeObject);
660    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
661
662    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
663        PyErr_NoMemory();
664        return NULL;
665    }
666    new_size = (struct_size + (length + 1) * char_size);
667
668    _Py_DEC_REFTOTAL;
669    _Py_ForgetReference(unicode);
670
671    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
672    if (new_unicode == NULL) {
673        _Py_NewReference(unicode);
674        PyErr_NoMemory();
675        return NULL;
676    }
677    unicode = new_unicode;
678    _Py_NewReference(unicode);
679
680    _PyUnicode_LENGTH(unicode) = length;
681    if (share_wstr) {
682        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
683        if (!PyUnicode_IS_ASCII(unicode))
684            _PyUnicode_WSTR_LENGTH(unicode) = length;
685    }
686    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687                    length, 0);
688    assert(_PyUnicode_CheckConsistency(unicode, 0));
689    return unicode;
690}
691
/* Resize a non-compact string to length characters without moving the
   object itself; only its separately allocated buffers are reallocated.

   unicode must be non-compact and have a reference count of 1 (asserted).
   Returns 0 on success.  On failure (size overflow or out of memory) a
   MemoryError is set and -1 is returned. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;

        /* remember which representations alias the character data before
           the data block is reallocated */
        data = _PyUnicode_DATA_ANY(unicode);
        char_size = PyUnicode_KIND(unicode);
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* check for overflow of (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* a separately allocated UTF-8 copy is simply discarded */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* the result goes to the local variable first, so the object keeps
           its old data pointer if the reallocation fails */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        /* re-point the representations that alias the character data */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        /* done unless a separate wstr buffer still has to be resized */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    /* reached for strings that are not ready and for ready strings that
       own a separate wstr buffer */
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
764
765static PyObject*
766resize_copy(PyObject *unicode, Py_ssize_t length)
767{
768    Py_ssize_t copy_length;
769    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
770        PyObject *copy;
771
772        if (PyUnicode_READY(unicode) == -1)
773            return NULL;
774
775        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
776        if (copy == NULL)
777            return NULL;
778
779        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
780        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
781        return copy;
782    }
783    else {
784        PyObject *w;
785
786        w = (PyObject*)_PyUnicode_New(length);
787        if (w == NULL)
788            return NULL;
789        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
790        copy_length = Py_MIN(copy_length, length);
791        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
792                        copy_length);
793        return w;
794    }
795}
796
797/* We allocate one more byte to make sure the string is
798   Ux0000 terminated; some code (e.g. new_identifier)
799   relies on that.
800
801   XXX This allocator could further be enhanced by assuring that the
802   free list never reduces its size below 1.
803
804*/
805
806static PyUnicodeObject *
807_PyUnicode_New(Py_ssize_t length)
808{
809    register PyUnicodeObject *unicode;
810    size_t new_size;
811
812    /* Optimization for empty strings */
813    if (length == 0 && unicode_empty != NULL) {
814        Py_INCREF(unicode_empty);
815        return (PyUnicodeObject*)unicode_empty;
816    }
817
818    /* Ensure we won't overflow the size. */
819    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
820        return (PyUnicodeObject *)PyErr_NoMemory();
821    }
822    if (length < 0) {
823        PyErr_SetString(PyExc_SystemError,
824                        "Negative size passed to _PyUnicode_New");
825        return NULL;
826    }
827
828    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
829    if (unicode == NULL)
830        return NULL;
831    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
832    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
833    if (!_PyUnicode_WSTR(unicode)) {
834        Py_DECREF(unicode);
835        PyErr_NoMemory();
836        return NULL;
837    }
838
839    /* Initialize the first element to guard against cases where
840     * the caller fails before initializing str -- unicode_resize()
841     * reads str[0], and the Keep-Alive optimization can keep memory
842     * allocated for str alive across a call to unicode_dealloc(unicode).
843     * We don't want unicode_resize to read uninitialized memory in
844     * that case.
845     */
846    _PyUnicode_WSTR(unicode)[0] = 0;
847    _PyUnicode_WSTR(unicode)[length] = 0;
848    _PyUnicode_WSTR_LENGTH(unicode) = length;
849    _PyUnicode_HASH(unicode) = -1;
850    _PyUnicode_STATE(unicode).interned = 0;
851    _PyUnicode_STATE(unicode).kind = 0;
852    _PyUnicode_STATE(unicode).compact = 0;
853    _PyUnicode_STATE(unicode).ready = 0;
854    _PyUnicode_STATE(unicode).ascii = 0;
855    _PyUnicode_DATA_ANY(unicode) = NULL;
856    _PyUnicode_LENGTH(unicode) = 0;
857    _PyUnicode_UTF8(unicode) = NULL;
858    _PyUnicode_UTF8_LENGTH(unicode) = 0;
859    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
860    return unicode;
861}
862
863static const char*
864unicode_kind_name(PyObject *unicode)
865{
866    /* don't check consistency: unicode_kind_name() is called from
867       _PyUnicode_Dump() */
868    if (!PyUnicode_IS_COMPACT(unicode))
869    {
870        if (!PyUnicode_IS_READY(unicode))
871            return "wstr";
872        switch (PyUnicode_KIND(unicode))
873        {
874        case PyUnicode_1BYTE_KIND:
875            if (PyUnicode_IS_ASCII(unicode))
876                return "legacy ascii";
877            else
878                return "legacy latin1";
879        case PyUnicode_2BYTE_KIND:
880            return "legacy UCS2";
881        case PyUnicode_4BYTE_KIND:
882            return "legacy UCS4";
883        default:
884            return "<legacy invalid kind>";
885        }
886    }
887    assert(PyUnicode_IS_READY(unicode));
888    switch (PyUnicode_KIND(unicode)) {
889    case PyUnicode_1BYTE_KIND:
890        if (PyUnicode_IS_ASCII(unicode))
891            return "ascii";
892        else
893            return "latin1";
894    case PyUnicode_2BYTE_KIND:
895        return "UCS2";
896    case PyUnicode_4BYTE_KIND:
897        return "UCS4";
898    default:
899        return "<invalid compact kind>";
900    }
901}
902
903#ifdef Py_DEBUG
904/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
908
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
912void *_PyUnicode_data(void *unicode){
913    printf("obj %p\n", unicode);
914    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
915    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
916    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
917    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
918    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
919    return PyUnicode_DATA(unicode);
920}
921
922void
923_PyUnicode_Dump(PyObject *op)
924{
925    PyASCIIObject *ascii = (PyASCIIObject *)op;
926    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
927    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
928    void *data;
929
930    if (ascii->state.compact)
931    {
932        if (ascii->state.ascii)
933            data = (ascii + 1);
934        else
935            data = (compact + 1);
936    }
937    else
938        data = unicode->data.any;
939    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
940
941    if (ascii->wstr == data)
942        printf("shared ");
943    printf("wstr=%p", ascii->wstr);
944
945    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
946        printf(" (%zu), ", compact->wstr_length);
947        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
948            printf("shared ");
949        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
950    }
951    printf(", data=%p\n", data);
952}
953#endif
954
955PyObject *
956PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
957{
958    PyObject *obj;
959    PyCompactUnicodeObject *unicode;
960    void *data;
961    enum PyUnicode_Kind kind;
962    int is_sharing, is_ascii;
963    Py_ssize_t char_size;
964    Py_ssize_t struct_size;
965
966    /* Optimization for empty strings */
967    if (size == 0 && unicode_empty != NULL) {
968        Py_INCREF(unicode_empty);
969        return unicode_empty;
970    }
971
972    is_ascii = 0;
973    is_sharing = 0;
974    struct_size = sizeof(PyCompactUnicodeObject);
975    if (maxchar < 128) {
976        kind = PyUnicode_1BYTE_KIND;
977        char_size = 1;
978        is_ascii = 1;
979        struct_size = sizeof(PyASCIIObject);
980    }
981    else if (maxchar < 256) {
982        kind = PyUnicode_1BYTE_KIND;
983        char_size = 1;
984    }
985    else if (maxchar < 65536) {
986        kind = PyUnicode_2BYTE_KIND;
987        char_size = 2;
988        if (sizeof(wchar_t) == 2)
989            is_sharing = 1;
990    }
991    else {
992        if (maxchar > MAX_UNICODE) {
993            PyErr_SetString(PyExc_SystemError,
994                            "invalid maximum character passed to PyUnicode_New");
995            return NULL;
996        }
997        kind = PyUnicode_4BYTE_KIND;
998        char_size = 4;
999        if (sizeof(wchar_t) == 4)
1000            is_sharing = 1;
1001    }
1002
1003    /* Ensure we won't overflow the size. */
1004    if (size < 0) {
1005        PyErr_SetString(PyExc_SystemError,
1006                        "Negative size passed to PyUnicode_New");
1007        return NULL;
1008    }
1009    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1010        return PyErr_NoMemory();
1011
1012    /* Duplicated allocation code from _PyObject_New() instead of a call to
1013     * PyObject_New() so we are able to allocate space for the object and
1014     * it's data buffer.
1015     */
1016    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1017    if (obj == NULL)
1018        return PyErr_NoMemory();
1019    obj = PyObject_INIT(obj, &PyUnicode_Type);
1020    if (obj == NULL)
1021        return NULL;
1022
1023    unicode = (PyCompactUnicodeObject *)obj;
1024    if (is_ascii)
1025        data = ((PyASCIIObject*)obj) + 1;
1026    else
1027        data = unicode + 1;
1028    _PyUnicode_LENGTH(unicode) = size;
1029    _PyUnicode_HASH(unicode) = -1;
1030    _PyUnicode_STATE(unicode).interned = 0;
1031    _PyUnicode_STATE(unicode).kind = kind;
1032    _PyUnicode_STATE(unicode).compact = 1;
1033    _PyUnicode_STATE(unicode).ready = 1;
1034    _PyUnicode_STATE(unicode).ascii = is_ascii;
1035    if (is_ascii) {
1036        ((char*)data)[size] = 0;
1037        _PyUnicode_WSTR(unicode) = NULL;
1038    }
1039    else if (kind == PyUnicode_1BYTE_KIND) {
1040        ((char*)data)[size] = 0;
1041        _PyUnicode_WSTR(unicode) = NULL;
1042        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1043        unicode->utf8 = NULL;
1044        unicode->utf8_length = 0;
1045    }
1046    else {
1047        unicode->utf8 = NULL;
1048        unicode->utf8_length = 0;
1049        if (kind == PyUnicode_2BYTE_KIND)
1050            ((Py_UCS2*)data)[size] = 0;
1051        else /* kind == PyUnicode_4BYTE_KIND */
1052            ((Py_UCS4*)data)[size] = 0;
1053        if (is_sharing) {
1054            _PyUnicode_WSTR_LENGTH(unicode) = size;
1055            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1056        }
1057        else {
1058            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1059            _PyUnicode_WSTR(unicode) = NULL;
1060        }
1061    }
1062#ifdef Py_DEBUG
1063    /* Fill the data with invalid characters to detect bugs earlier.
1064       _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1065       at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1066       and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1067    memset(data, 0xff, size * kind);
1068#endif
1069    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1070    return obj;
1071}
1072
1073#if SIZEOF_WCHAR_T == 2
1074/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1075   will decode surrogate pairs, the other conversions are implemented as macros
1076   for efficiency.
1077
1078   This function assumes that unicode can hold one more code point than wstr
1079   characters for a terminating null character. */
1080static void
1081unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1082                              PyObject *unicode)
1083{
1084    const wchar_t *iter;
1085    Py_UCS4 *ucs4_out;
1086
1087    assert(unicode != NULL);
1088    assert(_PyUnicode_CHECK(unicode));
1089    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1090    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1091
1092    for (iter = begin; iter < end; ) {
1093        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1094                           _PyUnicode_GET_LENGTH(unicode)));
1095        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1096            && (iter+1) < end
1097            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1098        {
1099            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1100            iter += 2;
1101        }
1102        else {
1103            *ucs4_out++ = *iter;
1104            iter++;
1105        }
1106    }
1107    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1108                        _PyUnicode_GET_LENGTH(unicode)));
1109
1110}
1111#endif
1112
1113static int
1114unicode_check_modifiable(PyObject *unicode)
1115{
1116    if (!unicode_modifiable(unicode)) {
1117        PyErr_SetString(PyExc_SystemError,
1118                        "Cannot modify a string currently used");
1119        return -1;
1120    }
1121    return 0;
1122}
1123
1124static int
1125_copy_characters(PyObject *to, Py_ssize_t to_start,
1126                 PyObject *from, Py_ssize_t from_start,
1127                 Py_ssize_t how_many, int check_maxchar)
1128{
1129    unsigned int from_kind, to_kind;
1130    void *from_data, *to_data;
1131
1132    assert(0 <= how_many);
1133    assert(0 <= from_start);
1134    assert(0 <= to_start);
1135    assert(PyUnicode_Check(from));
1136    assert(PyUnicode_IS_READY(from));
1137    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1138
1139    assert(PyUnicode_Check(to));
1140    assert(PyUnicode_IS_READY(to));
1141    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1142
1143    if (how_many == 0)
1144        return 0;
1145
1146    from_kind = PyUnicode_KIND(from);
1147    from_data = PyUnicode_DATA(from);
1148    to_kind = PyUnicode_KIND(to);
1149    to_data = PyUnicode_DATA(to);
1150
1151#ifdef Py_DEBUG
1152    if (!check_maxchar
1153        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1154    {
1155        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1156        Py_UCS4 ch;
1157        Py_ssize_t i;
1158        for (i=0; i < how_many; i++) {
1159            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1160            assert(ch <= to_maxchar);
1161        }
1162    }
1163#endif
1164
1165    if (from_kind == to_kind) {
1166        if (check_maxchar
1167            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1168        {
1169            /* Writing Latin-1 characters into an ASCII string requires to
1170               check that all written characters are pure ASCII */
1171            Py_UCS4 max_char;
1172            max_char = ucs1lib_find_max_char(from_data,
1173                                             (Py_UCS1*)from_data + how_many);
1174            if (max_char >= 128)
1175                return -1;
1176        }
1177        Py_MEMCPY((char*)to_data + to_kind * to_start,
1178                  (char*)from_data + from_kind * from_start,
1179                  to_kind * how_many);
1180    }
1181    else if (from_kind == PyUnicode_1BYTE_KIND
1182             && to_kind == PyUnicode_2BYTE_KIND)
1183    {
1184        _PyUnicode_CONVERT_BYTES(
1185            Py_UCS1, Py_UCS2,
1186            PyUnicode_1BYTE_DATA(from) + from_start,
1187            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1188            PyUnicode_2BYTE_DATA(to) + to_start
1189            );
1190    }
1191    else if (from_kind == PyUnicode_1BYTE_KIND
1192             && to_kind == PyUnicode_4BYTE_KIND)
1193    {
1194        _PyUnicode_CONVERT_BYTES(
1195            Py_UCS1, Py_UCS4,
1196            PyUnicode_1BYTE_DATA(from) + from_start,
1197            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1198            PyUnicode_4BYTE_DATA(to) + to_start
1199            );
1200    }
1201    else if (from_kind == PyUnicode_2BYTE_KIND
1202             && to_kind == PyUnicode_4BYTE_KIND)
1203    {
1204        _PyUnicode_CONVERT_BYTES(
1205            Py_UCS2, Py_UCS4,
1206            PyUnicode_2BYTE_DATA(from) + from_start,
1207            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1208            PyUnicode_4BYTE_DATA(to) + to_start
1209            );
1210    }
1211    else {
1212        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1213
1214        if (!check_maxchar) {
1215            if (from_kind == PyUnicode_2BYTE_KIND
1216                && to_kind == PyUnicode_1BYTE_KIND)
1217            {
1218                _PyUnicode_CONVERT_BYTES(
1219                    Py_UCS2, Py_UCS1,
1220                    PyUnicode_2BYTE_DATA(from) + from_start,
1221                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1222                    PyUnicode_1BYTE_DATA(to) + to_start
1223                    );
1224            }
1225            else if (from_kind == PyUnicode_4BYTE_KIND
1226                     && to_kind == PyUnicode_1BYTE_KIND)
1227            {
1228                _PyUnicode_CONVERT_BYTES(
1229                    Py_UCS4, Py_UCS1,
1230                    PyUnicode_4BYTE_DATA(from) + from_start,
1231                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1232                    PyUnicode_1BYTE_DATA(to) + to_start
1233                    );
1234            }
1235            else if (from_kind == PyUnicode_4BYTE_KIND
1236                     && to_kind == PyUnicode_2BYTE_KIND)
1237            {
1238                _PyUnicode_CONVERT_BYTES(
1239                    Py_UCS4, Py_UCS2,
1240                    PyUnicode_4BYTE_DATA(from) + from_start,
1241                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1242                    PyUnicode_2BYTE_DATA(to) + to_start
1243                    );
1244            }
1245            else {
1246                assert(0);
1247                return -1;
1248            }
1249        }
1250        else {
1251            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1252            Py_UCS4 ch;
1253            Py_ssize_t i;
1254
1255            for (i=0; i < how_many; i++) {
1256                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1257                if (ch > to_maxchar)
1258                    return -1;
1259                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1260            }
1261        }
1262    }
1263    return 0;
1264}
1265
1266void
1267_PyUnicode_FastCopyCharacters(
1268    PyObject *to, Py_ssize_t to_start,
1269    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1270{
1271    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1272}
1273
1274Py_ssize_t
1275PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1276                         PyObject *from, Py_ssize_t from_start,
1277                         Py_ssize_t how_many)
1278{
1279    int err;
1280
1281    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1282        PyErr_BadInternalCall();
1283        return -1;
1284    }
1285
1286    if (PyUnicode_READY(from) == -1)
1287        return -1;
1288    if (PyUnicode_READY(to) == -1)
1289        return -1;
1290
1291    if (from_start < 0) {
1292        PyErr_SetString(PyExc_IndexError, "string index out of range");
1293        return -1;
1294    }
1295    if (to_start < 0) {
1296        PyErr_SetString(PyExc_IndexError, "string index out of range");
1297        return -1;
1298    }
1299    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1300    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1301        PyErr_Format(PyExc_SystemError,
1302                     "Cannot write %zi characters at %zi "
1303                     "in a string of %zi characters",
1304                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1305        return -1;
1306    }
1307
1308    if (how_many == 0)
1309        return 0;
1310
1311    if (unicode_check_modifiable(to))
1312        return -1;
1313
1314    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1315    if (err) {
1316        PyErr_Format(PyExc_SystemError,
1317                     "Cannot copy %s characters "
1318                     "into a string of %s characters",
1319                     unicode_kind_name(from),
1320                     unicode_kind_name(to));
1321        return -1;
1322    }
1323    return how_many;
1324}
1325
1326/* Find the maximum code point and count the number of surrogate pairs so a
1327   correct string length can be computed before converting a string to UCS4.
1328   This function counts single surrogates as a character and not as a pair.
1329
1330   Return 0 on success, or -1 on error. */
1331static int
1332find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1333                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1334{
1335    const wchar_t *iter;
1336    Py_UCS4 ch;
1337
1338    assert(num_surrogates != NULL && maxchar != NULL);
1339    *num_surrogates = 0;
1340    *maxchar = 0;
1341
1342    for (iter = begin; iter < end; ) {
1343#if SIZEOF_WCHAR_T == 2
1344        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1345            && (iter+1) < end
1346            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1347        {
1348            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1349            ++(*num_surrogates);
1350            iter += 2;
1351        }
1352        else
1353#endif
1354        {
1355            ch = *iter;
1356            iter++;
1357        }
1358        if (ch > *maxchar) {
1359            *maxchar = ch;
1360            if (*maxchar > MAX_UNICODE) {
1361                PyErr_Format(PyExc_ValueError,
1362                             "character U+%x is not in range [U+0000; U+10ffff]",
1363                             ch);
1364                return -1;
1365            }
1366        }
1367    }
1368    return 0;
1369}
1370
1371int
1372_PyUnicode_Ready(PyObject *unicode)
1373{
1374    wchar_t *end;
1375    Py_UCS4 maxchar = 0;
1376    Py_ssize_t num_surrogates;
1377#if SIZEOF_WCHAR_T == 2
1378    Py_ssize_t length_wo_surrogates;
1379#endif
1380
1381    /* _PyUnicode_Ready() is only intended for old-style API usage where
1382       strings were created using _PyObject_New() and where no canonical
1383       representation (the str field) has been set yet aka strings
1384       which are not yet ready. */
1385    assert(_PyUnicode_CHECK(unicode));
1386    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1387    assert(_PyUnicode_WSTR(unicode) != NULL);
1388    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1389    assert(_PyUnicode_UTF8(unicode) == NULL);
1390    /* Actually, it should neither be interned nor be anything else: */
1391    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1392
1393    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1394    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1395                                &maxchar, &num_surrogates) == -1)
1396        return -1;
1397
1398    if (maxchar < 256) {
1399        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1400        if (!_PyUnicode_DATA_ANY(unicode)) {
1401            PyErr_NoMemory();
1402            return -1;
1403        }
1404        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1405                                _PyUnicode_WSTR(unicode), end,
1406                                PyUnicode_1BYTE_DATA(unicode));
1407        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1408        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1409        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1410        if (maxchar < 128) {
1411            _PyUnicode_STATE(unicode).ascii = 1;
1412            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1413            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1414        }
1415        else {
1416            _PyUnicode_STATE(unicode).ascii = 0;
1417            _PyUnicode_UTF8(unicode) = NULL;
1418            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1419        }
1420        PyObject_FREE(_PyUnicode_WSTR(unicode));
1421        _PyUnicode_WSTR(unicode) = NULL;
1422        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1423    }
1424    /* In this case we might have to convert down from 4-byte native
1425       wchar_t to 2-byte unicode. */
1426    else if (maxchar < 65536) {
1427        assert(num_surrogates == 0 &&
1428               "FindMaxCharAndNumSurrogatePairs() messed up");
1429
1430#if SIZEOF_WCHAR_T == 2
1431        /* We can share representations and are done. */
1432        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1433        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1434        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1435        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1436        _PyUnicode_UTF8(unicode) = NULL;
1437        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1438#else
1439        /* sizeof(wchar_t) == 4 */
1440        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1441            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1442        if (!_PyUnicode_DATA_ANY(unicode)) {
1443            PyErr_NoMemory();
1444            return -1;
1445        }
1446        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1447                                _PyUnicode_WSTR(unicode), end,
1448                                PyUnicode_2BYTE_DATA(unicode));
1449        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1450        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1451        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1452        _PyUnicode_UTF8(unicode) = NULL;
1453        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1454        PyObject_FREE(_PyUnicode_WSTR(unicode));
1455        _PyUnicode_WSTR(unicode) = NULL;
1456        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1457#endif
1458    }
1459    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1460    else {
1461#if SIZEOF_WCHAR_T == 2
1462        /* in case the native representation is 2-bytes, we need to allocate a
1463           new normalized 4-byte version. */
1464        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1465        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1466        if (!_PyUnicode_DATA_ANY(unicode)) {
1467            PyErr_NoMemory();
1468            return -1;
1469        }
1470        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1471        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1472        _PyUnicode_UTF8(unicode) = NULL;
1473        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1474        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1475        _PyUnicode_STATE(unicode).ready = 1;
1476        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1477        PyObject_FREE(_PyUnicode_WSTR(unicode));
1478        _PyUnicode_WSTR(unicode) = NULL;
1479        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1480#else
1481        assert(num_surrogates == 0);
1482
1483        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1484        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1485        _PyUnicode_UTF8(unicode) = NULL;
1486        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1487        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1488#endif
1489        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1490    }
1491    _PyUnicode_STATE(unicode).ready = 1;
1492    assert(_PyUnicode_CheckConsistency(unicode, 1));
1493    return 0;
1494}
1495
1496static void
1497unicode_dealloc(register PyObject *unicode)
1498{
1499    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1500    case SSTATE_NOT_INTERNED:
1501        break;
1502
1503    case SSTATE_INTERNED_MORTAL:
1504        /* revive dead object temporarily for DelItem */
1505        Py_REFCNT(unicode) = 3;
1506        if (PyDict_DelItem(interned, unicode) != 0)
1507            Py_FatalError(
1508                "deletion of interned string failed");
1509        break;
1510
1511    case SSTATE_INTERNED_IMMORTAL:
1512        Py_FatalError("Immortal interned string died.");
1513
1514    default:
1515        Py_FatalError("Inconsistent interned string state.");
1516    }
1517
1518    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1519        PyObject_DEL(_PyUnicode_WSTR(unicode));
1520    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1521        PyObject_DEL(_PyUnicode_UTF8(unicode));
1522    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1523        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1524
1525    Py_TYPE(unicode)->tp_free(unicode);
1526}
1527
1528#ifdef Py_DEBUG
1529static int
1530unicode_is_singleton(PyObject *unicode)
1531{
1532    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1533    if (unicode == unicode_empty)
1534        return 1;
1535    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1536    {
1537        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1538        if (ch < 256 && unicode_latin1[ch] == unicode)
1539            return 1;
1540    }
1541    return 0;
1542}
1543#endif
1544
1545static int
1546unicode_modifiable(PyObject *unicode)
1547{
1548    assert(_PyUnicode_CHECK(unicode));
1549    if (Py_REFCNT(unicode) != 1)
1550        return 0;
1551    if (_PyUnicode_HASH(unicode) != -1)
1552        return 0;
1553    if (PyUnicode_CHECK_INTERNED(unicode))
1554        return 0;
1555    if (!PyUnicode_CheckExact(unicode))
1556        return 0;
1557#ifdef Py_DEBUG
1558    /* singleton refcount is greater than 1 */
1559    assert(!unicode_is_singleton(unicode));
1560#endif
1561    return 1;
1562}
1563
1564static int
1565unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1566{
1567    PyObject *unicode;
1568    Py_ssize_t old_length;
1569
1570    assert(p_unicode != NULL);
1571    unicode = *p_unicode;
1572
1573    assert(unicode != NULL);
1574    assert(PyUnicode_Check(unicode));
1575    assert(0 <= length);
1576
1577    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1578        old_length = PyUnicode_WSTR_LENGTH(unicode);
1579    else
1580        old_length = PyUnicode_GET_LENGTH(unicode);
1581    if (old_length == length)
1582        return 0;
1583
1584    if (length == 0) {
1585        Py_DECREF(*p_unicode);
1586        *p_unicode = unicode_empty;
1587        Py_INCREF(*p_unicode);
1588        return 0;
1589    }
1590
1591    if (!unicode_modifiable(unicode)) {
1592        PyObject *copy = resize_copy(unicode, length);
1593        if (copy == NULL)
1594            return -1;
1595        Py_DECREF(*p_unicode);
1596        *p_unicode = copy;
1597        return 0;
1598    }
1599
1600    if (PyUnicode_IS_COMPACT(unicode)) {
1601        PyObject *new_unicode = resize_compact(unicode, length);
1602        if (new_unicode == NULL)
1603            return -1;
1604        *p_unicode = new_unicode;
1605        return 0;
1606    }
1607    return resize_inplace(unicode, length);
1608}
1609
1610int
1611PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1612{
1613    PyObject *unicode;
1614    if (p_unicode == NULL) {
1615        PyErr_BadInternalCall();
1616        return -1;
1617    }
1618    unicode = *p_unicode;
1619    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1620    {
1621        PyErr_BadInternalCall();
1622        return -1;
1623    }
1624    return unicode_resize(p_unicode, length);
1625}
1626
1627static int
1628unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1629              unsigned int maxchar)
1630{
1631    PyObject *result;
1632    assert(PyUnicode_IS_READY(*p_unicode));
1633    assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
1634    if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1635        return 0;
1636    result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1637                           maxchar);
1638    if (result == NULL)
1639        return -1;
1640    _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
1641    Py_DECREF(*p_unicode);
1642    *p_unicode = result;
1643    return 0;
1644}
1645
1646static int
1647unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1648                Py_UCS4 ch)
1649{
1650    assert(ch <= MAX_UNICODE);
1651    if (unicode_widen(p_unicode, *pos, ch) < 0)
1652        return -1;
1653    PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1654                    PyUnicode_DATA(*p_unicode),
1655                    (*pos)++, ch);
1656    return 0;
1657}
1658
1659/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1660
1661   WARNING: The function doesn't copy the terminating null character and
1662   doesn't check the maximum character (may write a latin1 character in an
1663   ASCII string). */
1664static void
1665unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1666                   const char *str, Py_ssize_t len)
1667{
1668    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1669    void *data = PyUnicode_DATA(unicode);
1670    const char *end = str + len;
1671
1672    switch (kind) {
1673    case PyUnicode_1BYTE_KIND: {
1674        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1675        memcpy((char *) data + index, str, len);
1676        break;
1677    }
1678    case PyUnicode_2BYTE_KIND: {
1679        Py_UCS2 *start = (Py_UCS2 *)data + index;
1680        Py_UCS2 *ucs2 = start;
1681        assert(index <= PyUnicode_GET_LENGTH(unicode));
1682
1683        for (; str < end; ++ucs2, ++str)
1684            *ucs2 = (Py_UCS2)*str;
1685
1686        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1687        break;
1688    }
1689    default: {
1690        Py_UCS4 *start = (Py_UCS4 *)data + index;
1691        Py_UCS4 *ucs4 = start;
1692        assert(kind == PyUnicode_4BYTE_KIND);
1693        assert(index <= PyUnicode_GET_LENGTH(unicode));
1694
1695        for (; str < end; ++ucs4, ++str)
1696            *ucs4 = (Py_UCS4)*str;
1697
1698        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1699    }
1700    }
1701}
1702
1703
1704static PyObject*
1705get_latin1_char(unsigned char ch)
1706{
1707    PyObject *unicode = unicode_latin1[ch];
1708    if (!unicode) {
1709        unicode = PyUnicode_New(1, ch);
1710        if (!unicode)
1711            return NULL;
1712        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1713        assert(_PyUnicode_CheckConsistency(unicode, 1));
1714        unicode_latin1[ch] = unicode;
1715    }
1716    Py_INCREF(unicode);
1717    return unicode;
1718}
1719
1720PyObject *
1721PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1722{
1723    PyObject *unicode;
1724    Py_UCS4 maxchar = 0;
1725    Py_ssize_t num_surrogates;
1726
1727    if (u == NULL)
1728        return (PyObject*)_PyUnicode_New(size);
1729
1730    /* If the Unicode data is known at construction time, we can apply
1731       some optimizations which share commonly used objects. */
1732
1733    /* Optimization for empty strings */
1734    if (size == 0 && unicode_empty != NULL) {
1735        Py_INCREF(unicode_empty);
1736        return unicode_empty;
1737    }
1738
1739    /* Single character Unicode objects in the Latin-1 range are
1740       shared when using this constructor */
1741    if (size == 1 && *u < 256)
1742        return get_latin1_char((unsigned char)*u);
1743
1744    /* If not empty and not single character, copy the Unicode data
1745       into the new object */
1746    if (find_maxchar_surrogates(u, u + size,
1747                                &maxchar, &num_surrogates) == -1)
1748        return NULL;
1749
1750    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1751    if (!unicode)
1752        return NULL;
1753
1754    switch (PyUnicode_KIND(unicode)) {
1755    case PyUnicode_1BYTE_KIND:
1756        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1757                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1758        break;
1759    case PyUnicode_2BYTE_KIND:
1760#if Py_UNICODE_SIZE == 2
1761        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1762#else
1763        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1764                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1765#endif
1766        break;
1767    case PyUnicode_4BYTE_KIND:
1768#if SIZEOF_WCHAR_T == 2
1769        /* This is the only case which has to process surrogates, thus
1770           a simple copy loop is not enough and we need a function. */
1771        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1772#else
1773        assert(num_surrogates == 0);
1774        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1775#endif
1776        break;
1777    default:
1778        assert(0 && "Impossible state");
1779    }
1780
1781    return unicode_result(unicode);
1782}
1783
1784PyObject *
1785PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1786{
1787    if (size < 0) {
1788        PyErr_SetString(PyExc_SystemError,
1789                        "Negative size passed to PyUnicode_FromStringAndSize");
1790        return NULL;
1791    }
1792    if (u != NULL)
1793        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1794    else
1795        return (PyObject *)_PyUnicode_New(size);
1796}
1797
1798PyObject *
1799PyUnicode_FromString(const char *u)
1800{
1801    size_t size = strlen(u);
1802    if (size > PY_SSIZE_T_MAX) {
1803        PyErr_SetString(PyExc_OverflowError, "input too long");
1804        return NULL;
1805    }
1806    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1807}
1808
1809PyObject *
1810_PyUnicode_FromId(_Py_Identifier *id)
1811{
1812    if (!id->object) {
1813        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1814                                                  strlen(id->string),
1815                                                  NULL, NULL);
1816        if (!id->object)
1817            return NULL;
1818        PyUnicode_InternInPlace(&id->object);
1819        assert(!id->next);
1820        id->next = static_strings;
1821        static_strings = id;
1822    }
1823    return id->object;
1824}
1825
1826void
1827_PyUnicode_ClearStaticStrings()
1828{
1829    _Py_Identifier *i;
1830    for (i = static_strings; i; i = i->next) {
1831        Py_DECREF(i->object);
1832        i->object = NULL;
1833        i->next = NULL;
1834    }
1835}
1836
1837/* Internal function, doesn't check maximum character */
1838
1839PyObject*
1840_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1841{
1842    const unsigned char *s = (const unsigned char *)buffer;
1843    PyObject *unicode;
1844    if (size == 1) {
1845#ifdef Py_DEBUG
1846        assert(s[0] < 128);
1847#endif
1848        return get_latin1_char(s[0]);
1849    }
1850    unicode = PyUnicode_New(size, 127);
1851    if (!unicode)
1852        return NULL;
1853    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1854    assert(_PyUnicode_CheckConsistency(unicode, 1));
1855    return unicode;
1856}
1857
1858static Py_UCS4
1859kind_maxchar_limit(unsigned int kind)
1860{
1861    switch (kind) {
1862    case PyUnicode_1BYTE_KIND:
1863        return 0x80;
1864    case PyUnicode_2BYTE_KIND:
1865        return 0x100;
1866    case PyUnicode_4BYTE_KIND:
1867        return 0x10000;
1868    default:
1869        assert(0 && "invalid kind");
1870        return MAX_UNICODE;
1871    }
1872}
1873
1874Py_LOCAL_INLINE(Py_UCS4)
1875align_maxchar(Py_UCS4 maxchar)
1876{
1877    if (maxchar <= 127)
1878        return 127;
1879    else if (maxchar <= 255)
1880        return 255;
1881    else if (maxchar <= 65535)
1882        return 65535;
1883    else
1884        return MAX_UNICODE;
1885}
1886
1887static PyObject*
1888_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1889{
1890    PyObject *res;
1891    unsigned char max_char;
1892
1893    if (size == 0) {
1894        Py_INCREF(unicode_empty);
1895        return unicode_empty;
1896    }
1897    assert(size > 0);
1898    if (size == 1)
1899        return get_latin1_char(u[0]);
1900
1901    max_char = ucs1lib_find_max_char(u, u + size);
1902    res = PyUnicode_New(size, max_char);
1903    if (!res)
1904        return NULL;
1905    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1906    assert(_PyUnicode_CheckConsistency(res, 1));
1907    return res;
1908}
1909
1910static PyObject*
1911_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1912{
1913    PyObject *res;
1914    Py_UCS2 max_char;
1915
1916    if (size == 0) {
1917        Py_INCREF(unicode_empty);
1918        return unicode_empty;
1919    }
1920    assert(size > 0);
1921    if (size == 1) {
1922        Py_UCS4 ch = u[0];
1923        if (ch < 256)
1924            return get_latin1_char((unsigned char)ch);
1925
1926        res = PyUnicode_New(1, ch);
1927        if (res == NULL)
1928            return NULL;
1929        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1930        assert(_PyUnicode_CheckConsistency(res, 1));
1931        return res;
1932    }
1933
1934    max_char = ucs2lib_find_max_char(u, u + size);
1935    res = PyUnicode_New(size, max_char);
1936    if (!res)
1937        return NULL;
1938    if (max_char >= 256)
1939        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1940    else {
1941        _PyUnicode_CONVERT_BYTES(
1942            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1943    }
1944    assert(_PyUnicode_CheckConsistency(res, 1));
1945    return res;
1946}
1947
1948static PyObject*
1949_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1950{
1951    PyObject *res;
1952    Py_UCS4 max_char;
1953
1954    if (size == 0) {
1955        Py_INCREF(unicode_empty);
1956        return unicode_empty;
1957    }
1958    assert(size > 0);
1959    if (size == 1) {
1960        Py_UCS4 ch = u[0];
1961        if (ch < 256)
1962            return get_latin1_char((unsigned char)ch);
1963
1964        res = PyUnicode_New(1, ch);
1965        if (res == NULL)
1966            return NULL;
1967        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1968        assert(_PyUnicode_CheckConsistency(res, 1));
1969        return res;
1970    }
1971
1972    max_char = ucs4lib_find_max_char(u, u + size);
1973    res = PyUnicode_New(size, max_char);
1974    if (!res)
1975        return NULL;
1976    if (max_char < 256)
1977        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1978                                 PyUnicode_1BYTE_DATA(res));
1979    else if (max_char < 0x10000)
1980        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1981                                 PyUnicode_2BYTE_DATA(res));
1982    else
1983        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1984    assert(_PyUnicode_CheckConsistency(res, 1));
1985    return res;
1986}
1987
1988PyObject*
1989PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1990{
1991    if (size < 0) {
1992        PyErr_SetString(PyExc_ValueError, "size must be positive");
1993        return NULL;
1994    }
1995    switch (kind) {
1996    case PyUnicode_1BYTE_KIND:
1997        return _PyUnicode_FromUCS1(buffer, size);
1998    case PyUnicode_2BYTE_KIND:
1999        return _PyUnicode_FromUCS2(buffer, size);
2000    case PyUnicode_4BYTE_KIND:
2001        return _PyUnicode_FromUCS4(buffer, size);
2002    default:
2003        PyErr_SetString(PyExc_SystemError, "invalid kind");
2004        return NULL;
2005    }
2006}
2007
2008Py_UCS4
2009_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2010{
2011    enum PyUnicode_Kind kind;
2012    void *startptr, *endptr;
2013
2014    assert(PyUnicode_IS_READY(unicode));
2015    assert(0 <= start);
2016    assert(end <= PyUnicode_GET_LENGTH(unicode));
2017    assert(start <= end);
2018
2019    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2020        return PyUnicode_MAX_CHAR_VALUE(unicode);
2021
2022    if (start == end)
2023        return 127;
2024
2025    if (PyUnicode_IS_ASCII(unicode))
2026        return 127;
2027
2028    kind = PyUnicode_KIND(unicode);
2029    startptr = PyUnicode_DATA(unicode);
2030    endptr = (char *)startptr + end * kind;
2031    startptr = (char *)startptr + start * kind;
2032    switch(kind) {
2033    case PyUnicode_1BYTE_KIND:
2034        return ucs1lib_find_max_char(startptr, endptr);
2035    case PyUnicode_2BYTE_KIND:
2036        return ucs2lib_find_max_char(startptr, endptr);
2037    case PyUnicode_4BYTE_KIND:
2038        return ucs4lib_find_max_char(startptr, endptr);
2039    default:
2040        assert(0);
2041        return 0;
2042    }
2043}
2044
2045/* Ensure that a string uses the most efficient storage, if it is not the
2046   case: create a new string with of the right kind. Write NULL into *p_unicode
2047   on error. */
2048static void
2049unicode_adjust_maxchar(PyObject **p_unicode)
2050{
2051    PyObject *unicode, *copy;
2052    Py_UCS4 max_char;
2053    Py_ssize_t len;
2054    unsigned int kind;
2055
2056    assert(p_unicode != NULL);
2057    unicode = *p_unicode;
2058    assert(PyUnicode_IS_READY(unicode));
2059    if (PyUnicode_IS_ASCII(unicode))
2060        return;
2061
2062    len = PyUnicode_GET_LENGTH(unicode);
2063    kind = PyUnicode_KIND(unicode);
2064    if (kind == PyUnicode_1BYTE_KIND) {
2065        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2066        max_char = ucs1lib_find_max_char(u, u + len);
2067        if (max_char >= 128)
2068            return;
2069    }
2070    else if (kind == PyUnicode_2BYTE_KIND) {
2071        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2072        max_char = ucs2lib_find_max_char(u, u + len);
2073        if (max_char >= 256)
2074            return;
2075    }
2076    else {
2077        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2078        assert(kind == PyUnicode_4BYTE_KIND);
2079        max_char = ucs4lib_find_max_char(u, u + len);
2080        if (max_char >= 0x10000)
2081            return;
2082    }
2083    copy = PyUnicode_New(len, max_char);
2084    if (copy != NULL)
2085        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2086    Py_DECREF(unicode);
2087    *p_unicode = copy;
2088}
2089
2090PyObject*
2091_PyUnicode_Copy(PyObject *unicode)
2092{
2093    Py_ssize_t length;
2094    PyObject *copy;
2095
2096    if (!PyUnicode_Check(unicode)) {
2097        PyErr_BadInternalCall();
2098        return NULL;
2099    }
2100    if (PyUnicode_READY(unicode) == -1)
2101        return NULL;
2102
2103    length = PyUnicode_GET_LENGTH(unicode);
2104    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2105    if (!copy)
2106        return NULL;
2107    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2108
2109    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2110              length * PyUnicode_KIND(unicode));
2111    assert(_PyUnicode_CheckConsistency(copy, 1));
2112    return copy;
2113}
2114
2115
2116/* Widen Unicode objects to larger buffers. Don't write terminating null
2117   character. Return NULL on error. */
2118
2119void*
2120_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2121{
2122    Py_ssize_t len;
2123    void *result;
2124    unsigned int skind;
2125
2126    if (PyUnicode_READY(s) == -1)
2127        return NULL;
2128
2129    len = PyUnicode_GET_LENGTH(s);
2130    skind = PyUnicode_KIND(s);
2131    if (skind >= kind) {
2132        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2133        return NULL;
2134    }
2135    switch (kind) {
2136    case PyUnicode_2BYTE_KIND:
2137        result = PyMem_Malloc(len * sizeof(Py_UCS2));
2138        if (!result)
2139            return PyErr_NoMemory();
2140        assert(skind == PyUnicode_1BYTE_KIND);
2141        _PyUnicode_CONVERT_BYTES(
2142            Py_UCS1, Py_UCS2,
2143            PyUnicode_1BYTE_DATA(s),
2144            PyUnicode_1BYTE_DATA(s) + len,
2145            result);
2146        return result;
2147    case PyUnicode_4BYTE_KIND:
2148        result = PyMem_Malloc(len * sizeof(Py_UCS4));
2149        if (!result)
2150            return PyErr_NoMemory();
2151        if (skind == PyUnicode_2BYTE_KIND) {
2152            _PyUnicode_CONVERT_BYTES(
2153                Py_UCS2, Py_UCS4,
2154                PyUnicode_2BYTE_DATA(s),
2155                PyUnicode_2BYTE_DATA(s) + len,
2156                result);
2157        }
2158        else {
2159            assert(skind == PyUnicode_1BYTE_KIND);
2160            _PyUnicode_CONVERT_BYTES(
2161                Py_UCS1, Py_UCS4,
2162                PyUnicode_1BYTE_DATA(s),
2163                PyUnicode_1BYTE_DATA(s) + len,
2164                result);
2165        }
2166        return result;
2167    default:
2168        break;
2169    }
2170    PyErr_SetString(PyExc_SystemError, "invalid kind");
2171    return NULL;
2172}
2173
2174static Py_UCS4*
2175as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2176        int copy_null)
2177{
2178    int kind;
2179    void *data;
2180    Py_ssize_t len, targetlen;
2181    if (PyUnicode_READY(string) == -1)
2182        return NULL;
2183    kind = PyUnicode_KIND(string);
2184    data = PyUnicode_DATA(string);
2185    len = PyUnicode_GET_LENGTH(string);
2186    targetlen = len;
2187    if (copy_null)
2188        targetlen++;
2189    if (!target) {
2190        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2191            PyErr_NoMemory();
2192            return NULL;
2193        }
2194        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2195        if (!target) {
2196            PyErr_NoMemory();
2197            return NULL;
2198        }
2199    }
2200    else {
2201        if (targetsize < targetlen) {
2202            PyErr_Format(PyExc_SystemError,
2203                         "string is longer than the buffer");
2204            if (copy_null && 0 < targetsize)
2205                target[0] = 0;
2206            return NULL;
2207        }
2208    }
2209    if (kind == PyUnicode_1BYTE_KIND) {
2210        Py_UCS1 *start = (Py_UCS1 *) data;
2211        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2212    }
2213    else if (kind == PyUnicode_2BYTE_KIND) {
2214        Py_UCS2 *start = (Py_UCS2 *) data;
2215        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2216    }
2217    else {
2218        assert(kind == PyUnicode_4BYTE_KIND);
2219        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2220    }
2221    if (copy_null)
2222        target[len] = 0;
2223    return target;
2224}
2225
2226Py_UCS4*
2227PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228                 int copy_null)
2229{
2230    if (target == NULL || targetsize < 0) {
2231        PyErr_BadInternalCall();
2232        return NULL;
2233    }
2234    return as_ucs4(string, target, targetsize, copy_null);
2235}
2236
2237Py_UCS4*
2238PyUnicode_AsUCS4Copy(PyObject *string)
2239{
2240    return as_ucs4(string, NULL, 0, 1);
2241}
2242
2243#ifdef HAVE_WCHAR_H
2244
2245PyObject *
2246PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2247{
2248    if (w == NULL) {
2249        if (size == 0) {
2250            Py_INCREF(unicode_empty);
2251            return unicode_empty;
2252        }
2253        PyErr_BadInternalCall();
2254        return NULL;
2255    }
2256
2257    if (size == -1) {
2258        size = wcslen(w);
2259    }
2260
2261    return PyUnicode_FromUnicode(w, size);
2262}
2263
2264#endif /* HAVE_WCHAR_H */
2265
2266static void
2267makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2268        int zeropad, int width, int precision, char c)
2269{
2270    *fmt++ = '%';
2271    if (width) {
2272        if (zeropad)
2273            *fmt++ = '0';
2274        fmt += sprintf(fmt, "%d", width);
2275    }
2276    if (precision)
2277        fmt += sprintf(fmt, ".%d", precision);
2278    if (longflag)
2279        *fmt++ = 'l';
2280    else if (longlongflag) {
2281        /* longlongflag should only ever be nonzero on machines with
2282           HAVE_LONG_LONG defined */
2283#ifdef HAVE_LONG_LONG
2284        char *f = PY_FORMAT_LONG_LONG;
2285        while (*f)
2286            *fmt++ = *f++;
2287#else
2288        /* we shouldn't ever get here */
2289        assert(0);
2290        *fmt++ = 'l';
2291#endif
2292    }
2293    else if (size_tflag) {
2294        char *f = PY_FORMAT_SIZE_T;
2295        while (*f)
2296            *fmt++ = *f++;
2297    }
2298    *fmt++ = c;
2299    *fmt = '\0';
2300}
2301
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' that starts a conversion.  The optional width,
   ".precision" and length modifier ("l", "ll" when HAVE_LONG_LONG is
   defined, or "z"; each only when followed by d, u or i) are parsed, and
   the parsed values are stored through whichever output pointers are not
   NULL.  The returned pointer addresses the conversion character; for a
   truncated or bogus specification it is moved back one character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++) {
        width = (width*10) + *f - '0';
    }
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++) {
            precision = (precision*10) + *f - '0';
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    /* report only the values the caller asked for */
    if (p_width != NULL) {
        *p_width = width;
    }
    if (p_precision != NULL) {
        *p_precision = precision;
    }
    if (p_longflag != NULL) {
        *p_longflag = longflag;
    }
    if (p_longlongflag != NULL) {
        *p_longlongflag = longlongflag;
    }
    if (p_size_tflag != NULL) {
        *p_size_tflag = size_tflag;
    }
    return f;
}
2366
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign.
   PyUnicode_FromFormatV() uses this as the minimum field width reserved
   per integer conversion and adds one more byte for the separating '\0'. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   Used in the same way as MAX_LONG_CHARS, for conversions carrying the
   "ll" length modifier. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2374
2375PyObject *
2376PyUnicode_FromFormatV(const char *format, va_list vargs)
2377{
2378    va_list count;
2379    Py_ssize_t callcount = 0;
2380    PyObject **callresults = NULL;
2381    PyObject **callresult = NULL;
2382    Py_ssize_t n = 0;
2383    int width = 0;
2384    int precision = 0;
2385    int zeropad;
2386    const char* f;
2387    PyObject *string;
2388    /* used by sprintf */
2389    char fmt[61]; /* should be enough for %0width.precisionlld */
2390    Py_UCS4 maxchar = 127; /* result is ASCII by default */
2391    Py_UCS4 argmaxchar;
2392    Py_ssize_t numbersize = 0;
2393    char *numberresults = NULL;
2394    char *numberresult = NULL;
2395    Py_ssize_t i;
2396    int kind;
2397    void *data;
2398
2399    Py_VA_COPY(count, vargs);
2400    /* step 1: count the number of %S/%R/%A/%s format specifications
2401     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2402     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2403     * result in an array)
2404     * also estimate a upper bound for all the number formats in the string,
2405     * numbers will be formatted in step 3 and be kept in a '\0'-separated
2406     * buffer before putting everything together. */
2407    for (f = format; *f; f++) {
2408        if (*f == '%') {
2409            int longlongflag;
2410            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2411            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2412            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2413                ++callcount;
2414
2415            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
2416#ifdef HAVE_LONG_LONG
2417                if (longlongflag) {
2418                    if (width < MAX_LONG_LONG_CHARS)
2419                        width = MAX_LONG_LONG_CHARS;
2420                }
2421                else
2422#endif
2423                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2424                       including sign.  Decimal takes the most space.  This
2425                       isn't enough for octal.  If a width is specified we
2426                       need more (which we allocate later). */
2427                    if (width < MAX_LONG_CHARS)
2428                        width = MAX_LONG_CHARS;
2429
2430                /* account for the size + '\0' to separate numbers
2431                   inside of the numberresults buffer */
2432                numbersize += (width + 1);
2433            }
2434        }
2435        else if ((unsigned char)*f > 127) {
2436            PyErr_Format(PyExc_ValueError,
2437                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2438                "string, got a non-ASCII byte: 0x%02x",
2439                (unsigned char)*f);
2440            return NULL;
2441        }
2442    }
2443    /* step 2: allocate memory for the results of
2444     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2445    if (callcount) {
2446        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2447        if (!callresults) {
2448            PyErr_NoMemory();
2449            return NULL;
2450        }
2451        callresult = callresults;
2452    }
2453    /* step 2.5: allocate memory for the results of formating numbers */
2454    if (numbersize) {
2455        numberresults = PyObject_Malloc(numbersize);
2456        if (!numberresults) {
2457            PyErr_NoMemory();
2458            goto fail;
2459        }
2460        numberresult = numberresults;
2461    }
2462
2463    /* step 3: format numbers and figure out how large a buffer we need */
2464    for (f = format; *f; f++) {
2465        if (*f == '%') {
2466            const char* p;
2467            int longflag;
2468            int longlongflag;
2469            int size_tflag;
2470            int numprinted;
2471
2472            p = f;
2473            zeropad = (f[1] == '0');
2474            f = parse_format_flags(f, &width, &precision,
2475                                   &longflag, &longlongflag, &size_tflag);
2476            switch (*f) {
2477            case 'c':
2478            {
2479                Py_UCS4 ordinal = va_arg(count, int);
2480                maxchar = MAX_MAXCHAR(maxchar, ordinal);
2481                n++;
2482                break;
2483            }
2484            case '%':
2485                n++;
2486                break;
2487            case 'i':
2488            case 'd':
2489                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2490                        width, precision, *f);
2491                if (longflag)
2492                    numprinted = sprintf(numberresult, fmt,
2493                                         va_arg(count, long));
2494#ifdef HAVE_LONG_LONG
2495                else if (longlongflag)
2496                    numprinted = sprintf(numberresult, fmt,
2497                                         va_arg(count, PY_LONG_LONG));
2498#endif
2499                else if (size_tflag)
2500                    numprinted = sprintf(numberresult, fmt,
2501                                         va_arg(count, Py_ssize_t));
2502                else
2503                    numprinted = sprintf(numberresult, fmt,
2504                                         va_arg(count, int));
2505                n += numprinted;
2506                /* advance by +1 to skip over the '\0' */
2507                numberresult += (numprinted + 1);
2508                assert(*(numberresult - 1) == '\0');
2509                assert(*(numberresult - 2) != '\0');
2510                assert(numprinted >= 0);
2511                assert(numberresult <= numberresults + numbersize);
2512                break;
2513            case 'u':
2514                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2515                        width, precision, 'u');
2516                if (longflag)
2517                    numprinted = sprintf(numberresult, fmt,
2518                                         va_arg(count, unsigned long));
2519#ifdef HAVE_LONG_LONG
2520                else if (longlongflag)
2521                    numprinted = sprintf(numberresult, fmt,
2522                                         va_arg(count, unsigned PY_LONG_LONG));
2523#endif
2524                else if (size_tflag)
2525                    numprinted = sprintf(numberresult, fmt,
2526                                         va_arg(count, size_t));
2527                else
2528                    numprinted = sprintf(numberresult, fmt,
2529                                         va_arg(count, unsigned int));
2530                n += numprinted;
2531                numberresult += (numprinted + 1);
2532                assert(*(numberresult - 1) == '\0');
2533                assert(*(numberresult - 2) != '\0');
2534                assert(numprinted >= 0);
2535                assert(numberresult <= numberresults + numbersize);
2536                break;
2537            case 'x':
2538                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2539                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2540                n += numprinted;
2541                numberresult += (numprinted + 1);
2542                assert(*(numberresult - 1) == '\0');
2543                assert(*(numberresult - 2) != '\0');
2544                assert(numprinted >= 0);
2545                assert(numberresult <= numberresults + numbersize);
2546                break;
2547            case 'p':
2548                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2549                /* %p is ill-defined:  ensure leading 0x. */
2550                if (numberresult[1] == 'X')
2551                    numberresult[1] = 'x';
2552                else if (numberresult[1] != 'x') {
2553                    memmove(numberresult + 2, numberresult,
2554                            strlen(numberresult) + 1);
2555                    numberresult[0] = '0';
2556                    numberresult[1] = 'x';
2557                    numprinted += 2;
2558                }
2559                n += numprinted;
2560                numberresult += (numprinted + 1);
2561                assert(*(numberresult - 1) == '\0');
2562                assert(*(numberresult - 2) != '\0');
2563                assert(numprinted >= 0);
2564                assert(numberresult <= numberresults + numbersize);
2565                break;
2566            case 's':
2567            {
2568                /* UTF-8 */
2569                const char *s = va_arg(count, const char*);
2570                PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2571                if (!str)
2572                    goto fail;
2573                /* since PyUnicode_DecodeUTF8 returns already flexible
2574                   unicode objects, there is no need to call ready on them */
2575                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2576                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2577                n += PyUnicode_GET_LENGTH(str);
2578                /* Remember the str and switch to the next slot */
2579                *callresult++ = str;
2580                break;
2581            }
2582            case 'U':
2583            {
2584                PyObject *obj = va_arg(count, PyObject *);
2585                assert(obj && _PyUnicode_CHECK(obj));
2586                if (PyUnicode_READY(obj) == -1)
2587                    goto fail;
2588                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2589                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2590                n += PyUnicode_GET_LENGTH(obj);
2591                break;
2592            }
2593            case 'V':
2594            {
2595                PyObject *obj = va_arg(count, PyObject *);
2596                const char *str = va_arg(count, const char *);
2597                PyObject *str_obj;
2598                assert(obj || str);
2599                assert(!obj || _PyUnicode_CHECK(obj));
2600                if (obj) {
2601                    if (PyUnicode_READY(obj) == -1)
2602                        goto fail;
2603                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2604                    maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2605                    n += PyUnicode_GET_LENGTH(obj);
2606                    *callresult++ = NULL;
2607                }
2608                else {
2609                    str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2610                    if (!str_obj)
2611                        goto fail;
2612                    if (PyUnicode_READY(str_obj) == -1) {
2613                        Py_DECREF(str_obj);
2614                        goto fail;
2615                    }
2616                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2617                    maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2618                    n += PyUnicode_GET_LENGTH(str_obj);
2619                    *callresult++ = str_obj;
2620                }
2621                break;
2622            }
2623            case 'S':
2624            {
2625                PyObject *obj = va_arg(count, PyObject *);
2626                PyObject *str;
2627                assert(obj);
2628                str = PyObject_Str(obj);
2629                if (!str)
2630                    goto fail;
2631                if (PyUnicode_READY(str) == -1) {
2632                    Py_DECREF(str);
2633                    goto fail;
2634                }
2635                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2636                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2637                n += PyUnicode_GET_LENGTH(str);
2638                /* Remember the str and switch to the next slot */
2639                *callresult++ = str;
2640                break;
2641            }
2642            case 'R':
2643            {
2644                PyObject *obj = va_arg(count, PyObject *);
2645                PyObject *repr;
2646                assert(obj);
2647                repr = PyObject_Repr(obj);
2648                if (!repr)
2649                    goto fail;
2650                if (PyUnicode_READY(repr) == -1) {
2651                    Py_DECREF(repr);
2652                    goto fail;
2653                }
2654                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2655                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2656                n += PyUnicode_GET_LENGTH(repr);
2657                /* Remember the repr and switch to the next slot */
2658                *callresult++ = repr;
2659                break;
2660            }
2661            case 'A':
2662            {
2663                PyObject *obj = va_arg(count, PyObject *);
2664                PyObject *ascii;
2665                assert(obj);
2666                ascii = PyObject_ASCII(obj);
2667                if (!ascii)
2668                    goto fail;
2669                if (PyUnicode_READY(ascii) == -1) {
2670                    Py_DECREF(ascii);
2671                    goto fail;
2672                }
2673                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2674                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2675                n += PyUnicode_GET_LENGTH(ascii);
2676                /* Remember the repr and switch to the next slot */
2677                *callresult++ = ascii;
2678                break;
2679            }
2680            default:
2681                /* if we stumble upon an unknown
2682                   formatting code, copy the rest of
2683                   the format string to the output
2684                   string. (we cannot just skip the
2685                   code, since there's no way to know
2686                   what's in the argument list) */
2687                n += strlen(p);
2688                goto expand;
2689            }
2690        } else
2691            n++;
2692    }
2693  expand:
2694    /* step 4: fill the buffer */
2695    /* Since we've analyzed how much space we need,
2696       we don't have to resize the string.
2697       There can be no errors beyond this point. */
2698    string = PyUnicode_New(n, maxchar);
2699    if (!string)
2700        goto fail;
2701    kind = PyUnicode_KIND(string);
2702    data = PyUnicode_DATA(string);
2703    callresult = callresults;
2704    numberresult = numberresults;
2705
2706    for (i = 0, f = format; *f; f++) {
2707        if (*f == '%') {
2708            const char* p;
2709
2710            p = f;
2711            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2712            /* checking for == because the last argument could be a empty
2713               string, which causes i to point to end, the assert at the end of
2714               the loop */
2715            assert(i <= PyUnicode_GET_LENGTH(string));
2716
2717            switch (*f) {
2718            case 'c':
2719            {
2720                const int ordinal = va_arg(vargs, int);
2721                PyUnicode_WRITE(kind, data, i++, ordinal);
2722                break;
2723            }
2724            case 'i':
2725            case 'd':
2726            case 'u':
2727            case 'x':
2728            case 'p':
2729            {
2730                Py_ssize_t len;
2731                /* unused, since we already have the result */
2732                if (*f == 'p')
2733                    (void) va_arg(vargs, void *);
2734                else
2735                    (void) va_arg(vargs, int);
2736                /* extract the result from numberresults and append. */
2737                len = strlen(numberresult);
2738                unicode_write_cstr(string, i, numberresult, len);
2739                /* skip over the separating '\0' */
2740                i += len;
2741                numberresult += len;
2742                assert(*numberresult == '\0');
2743                numberresult++;
2744                assert(numberresult <= numberresults + numbersize);
2745                break;
2746            }
2747            case 's':
2748            {
2749                /* unused, since we already have the result */
2750                Py_ssize_t size;
2751                (void) va_arg(vargs, char *);
2752                size = PyUnicode_GET_LENGTH(*callresult);
2753                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2754                _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2755                i += size;
2756                /* We're done with the unicode()/repr() => forget it */
2757                Py_DECREF(*callresult);
2758                /* switch to next unicode()/repr() result */
2759                ++callresult;
2760                break;
2761            }
2762            case 'U':
2763            {
2764                PyObject *obj = va_arg(vargs, PyObject *);
2765                Py_ssize_t size;
2766                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2767                size = PyUnicode_GET_LENGTH(obj);
2768                _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
2769                i += size;
2770                break;
2771            }
2772            case 'V':
2773            {
2774                Py_ssize_t size;
2775                PyObject *obj = va_arg(vargs, PyObject *);
2776                va_arg(vargs, const char *);
2777                if (obj) {
2778                    size = PyUnicode_GET_LENGTH(obj);
2779                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2780                    _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
2781                    i += size;
2782                } else {
2783                    size = PyUnicode_GET_LENGTH(*callresult);
2784                    assert(PyUnicode_KIND(*callresult) <=
2785                           PyUnicode_KIND(string));
2786                    _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2787                    i += size;
2788                    Py_DECREF(*callresult);
2789                }
2790                ++callresult;
2791                break;
2792            }
2793            case 'S':
2794            case 'R':
2795            case 'A':
2796            {
2797                Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2798                /* unused, since we already have the result */
2799                (void) va_arg(vargs, PyObject *);
2800                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2801                _PyUnicode_FastCopyCharacters(string, i, *callresult, 0,  size);
2802                i += size;
2803                /* We're done with the unicode()/repr() => forget it */
2804                Py_DECREF(*callresult);
2805                /* switch to next unicode()/repr() result */
2806                ++callresult;
2807                break;
2808            }
2809            case '%':
2810                PyUnicode_WRITE(kind, data, i++, '%');
2811                break;
2812            default:
2813            {
2814                Py_ssize_t len = strlen(p);
2815                unicode_write_cstr(string, i, p, len);
2816                i += len;
2817                assert(i == PyUnicode_GET_LENGTH(string));
2818                goto end;
2819            }
2820            }
2821        }
2822        else {
2823            assert(i < PyUnicode_GET_LENGTH(string));
2824            PyUnicode_WRITE(kind, data, i++, *f);
2825        }
2826    }
2827    assert(i == PyUnicode_GET_LENGTH(string));
2828
2829  end:
2830    if (callresults)
2831        PyObject_Free(callresults);
2832    if (numberresults)
2833        PyObject_Free(numberresults);
2834    return unicode_result(string);
2835  fail:
2836    if (callresults) {
2837        PyObject **callresult2 = callresults;
2838        while (callresult2 < callresult) {
2839            Py_XDECREF(*callresult2);
2840            ++callresult2;
2841        }
2842        PyObject_Free(callresults);
2843    }
2844    if (numberresults)
2845        PyObject_Free(numberresults);
2846    return NULL;
2847}
2848
2849PyObject *
2850PyUnicode_FromFormat(const char *format, ...)
2851{
2852    PyObject* ret;
2853    va_list vargs;
2854
2855#ifdef HAVE_STDARG_PROTOTYPES
2856    va_start(vargs, format);
2857#else
2858    va_start(vargs);
2859#endif
2860    ret = PyUnicode_FromFormatV(format, vargs);
2861    va_end(vargs);
2862    return ret;
2863}
2864
2865#ifdef HAVE_WCHAR_H
2866
2867/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2868   convert a Unicode object to a wide character string.
2869
2870   - If w is NULL: return the number of wide characters (including the null
2871     character) required to convert the unicode object. Ignore size argument.
2872
2873   - Otherwise: return the number of wide characters (excluding the null
2874     character) written into w. Write at most size wide characters (including
2875     the null character). */
2876static Py_ssize_t
2877unicode_aswidechar(PyObject *unicode,
2878                   wchar_t *w,
2879                   Py_ssize_t size)
2880{
2881    Py_ssize_t res;
2882    const wchar_t *wstr;
2883
2884    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2885    if (wstr == NULL)
2886        return -1;
2887
2888    if (w != NULL) {
2889        if (size > res)
2890            size = res + 1;
2891        else
2892            res = size;
2893        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2894        return res;
2895    }
2896    else
2897        return res + 1;
2898}
2899
2900Py_ssize_t
2901PyUnicode_AsWideChar(PyObject *unicode,
2902                     wchar_t *w,
2903                     Py_ssize_t size)
2904{
2905    if (unicode == NULL) {
2906        PyErr_BadInternalCall();
2907        return -1;
2908    }
2909    return unicode_aswidechar(unicode, w, size);
2910}
2911
2912wchar_t*
2913PyUnicode_AsWideCharString(PyObject *unicode,
2914                           Py_ssize_t *size)
2915{
2916    wchar_t* buffer;
2917    Py_ssize_t buflen;
2918
2919    if (unicode == NULL) {
2920        PyErr_BadInternalCall();
2921        return NULL;
2922    }
2923
2924    buflen = unicode_aswidechar(unicode, NULL, 0);
2925    if (buflen == -1)
2926        return NULL;
2927    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2928        PyErr_NoMemory();
2929        return NULL;
2930    }
2931
2932    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2933    if (buffer == NULL) {
2934        PyErr_NoMemory();
2935        return NULL;
2936    }
2937    buflen = unicode_aswidechar(unicode, buffer, buflen);
2938    if (buflen == -1) {
2939        PyMem_FREE(buffer);
2940        return NULL;
2941    }
2942    if (size != NULL)
2943        *size = buflen;
2944    return buffer;
2945}
2946
2947#endif /* HAVE_WCHAR_H */
2948
2949PyObject *
2950PyUnicode_FromOrdinal(int ordinal)
2951{
2952    PyObject *v;
2953    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2954        PyErr_SetString(PyExc_ValueError,
2955                        "chr() arg not in range(0x110000)");
2956        return NULL;
2957    }
2958
2959    if (ordinal < 256)
2960        return get_latin1_char(ordinal);
2961
2962    v = PyUnicode_New(1, ordinal);
2963    if (v == NULL)
2964        return NULL;
2965    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2966    assert(_PyUnicode_CheckConsistency(v, 1));
2967    return v;
2968}
2969
2970PyObject *
2971PyUnicode_FromObject(register PyObject *obj)
2972{
2973    /* XXX Perhaps we should make this API an alias of
2974       PyObject_Str() instead ?! */
2975    if (PyUnicode_CheckExact(obj)) {
2976        if (PyUnicode_READY(obj) == -1)
2977            return NULL;
2978        Py_INCREF(obj);
2979        return obj;
2980    }
2981    if (PyUnicode_Check(obj)) {
2982        /* For a Unicode subtype that's not a Unicode object,
2983           return a true Unicode object with the same data. */
2984        return _PyUnicode_Copy(obj);
2985    }
2986    PyErr_Format(PyExc_TypeError,
2987                 "Can't convert '%.100s' object to str implicitly",
2988                 Py_TYPE(obj)->tp_name);
2989    return NULL;
2990}
2991
2992PyObject *
2993PyUnicode_FromEncodedObject(register PyObject *obj,
2994                            const char *encoding,
2995                            const char *errors)
2996{
2997    Py_buffer buffer;
2998    PyObject *v;
2999
3000    if (obj == NULL) {
3001        PyErr_BadInternalCall();
3002        return NULL;
3003    }
3004
3005    /* Decoding bytes objects is the most common case and should be fast */
3006    if (PyBytes_Check(obj)) {
3007        if (PyBytes_GET_SIZE(obj) == 0) {
3008            Py_INCREF(unicode_empty);
3009            v = unicode_empty;
3010        }
3011        else {
3012            v = PyUnicode_Decode(
3013                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3014                    encoding, errors);
3015        }
3016        return v;
3017    }
3018
3019    if (PyUnicode_Check(obj)) {
3020        PyErr_SetString(PyExc_TypeError,
3021                        "decoding str is not supported");
3022        return NULL;
3023    }
3024
3025    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3026    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3027        PyErr_Format(PyExc_TypeError,
3028                     "coercing to str: need bytes, bytearray "
3029                     "or buffer-like object, %.80s found",
3030                     Py_TYPE(obj)->tp_name);
3031        return NULL;
3032    }
3033
3034    if (buffer.len == 0) {
3035        Py_INCREF(unicode_empty);
3036        v = unicode_empty;
3037    }
3038    else
3039        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3040
3041    PyBuffer_Release(&buffer);
3042    return v;
3043}
3044
/* Normalize an encoding name into the caller-supplied buffer `lower`,
   whose capacity is lower_len bytes including the terminating NUL.
   ASCII upper case letters are converted to lower case and '_' is replaced
   with '-', so that e.g. "UTF_8" is recognised as "utf-8".  A NULL encoding
   selects the default name "utf-8".

   Return 1 on success, 0 if the normalized name (plus its terminating NUL)
   does not fit in lower_len bytes.  On failure the content of `lower` is
   unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminator nothing can be stored; this also
       keeps lower_len - 1 below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating NUL, which must fit as well. */
        if (sizeof(default_encoding) > lower_len)
            return 0;
        memcpy(lower, default_encoding, sizeof(default_encoding));
        return 1;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the NUL */
    for (e = encoding; *e != '\0'; e++) {
        char c = *e;

        if (l == l_end)
            return 0;
        /* Plain ASCII range test: independent of the current locale. */
        if (c >= 'A' && c <= 'Z')
            c = (char)(c - 'A' + 'a');
        else if (c == '_')
            c = '-';
        *l++ = c;
    }
    *l = '\0';
    return 1;
}
3081
3082PyObject *
3083PyUnicode_Decode(const char *s,
3084                 Py_ssize_t size,
3085                 const char *encoding,
3086                 const char *errors)
3087{
3088    PyObject *buffer = NULL, *unicode;
3089    Py_buffer info;
3090    char lower[11];  /* Enough for any encoding shortcut */
3091
3092    /* Shortcuts for common default encodings */
3093    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3094        if ((strcmp(lower, "utf-8") == 0) ||
3095            (strcmp(lower, "utf8") == 0))
3096            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3097        else if ((strcmp(lower, "latin-1") == 0) ||
3098                 (strcmp(lower, "latin1") == 0) ||
3099                 (strcmp(lower, "iso-8859-1") == 0))
3100            return PyUnicode_DecodeLatin1(s, size, errors);
3101#ifdef HAVE_MBCS
3102        else if (strcmp(lower, "mbcs") == 0)
3103            return PyUnicode_DecodeMBCS(s, size, errors);
3104#endif
3105        else if (strcmp(lower, "ascii") == 0)
3106            return PyUnicode_DecodeASCII(s, size, errors);
3107        else if (strcmp(lower, "utf-16") == 0)
3108            return PyUnicode_DecodeUTF16(s, size, errors, 0);
3109        else if (strcmp(lower, "utf-32") == 0)
3110            return PyUnicode_DecodeUTF32(s, size, errors, 0);
3111    }
3112
3113    /* Decode via the codec registry */
3114    buffer = NULL;
3115    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3116        goto onError;
3117    buffer = PyMemoryView_FromBuffer(&info);
3118    if (buffer == NULL)
3119        goto onError;
3120    unicode = PyCodec_Decode(buffer, encoding, errors);
3121    if (unicode == NULL)
3122        goto onError;
3123    if (!PyUnicode_Check(unicode)) {
3124        PyErr_Format(PyExc_TypeError,
3125                     "decoder did not return a str object (type=%.400s)",
3126                     Py_TYPE(unicode)->tp_name);
3127        Py_DECREF(unicode);
3128        goto onError;
3129    }
3130    Py_DECREF(buffer);
3131    return unicode_result(unicode);
3132
3133  onError:
3134    Py_XDECREF(buffer);
3135    return NULL;
3136}
3137
3138PyObject *
3139PyUnicode_AsDecodedObject(PyObject *unicode,
3140                          const char *encoding,
3141                          const char *errors)
3142{
3143    PyObject *v;
3144
3145    if (!PyUnicode_Check(unicode)) {
3146        PyErr_BadArgument();
3147        goto onError;
3148    }
3149
3150    if (encoding == NULL)
3151        encoding = PyUnicode_GetDefaultEncoding();
3152
3153    /* Decode via the codec registry */
3154    v = PyCodec_Decode(unicode, encoding, errors);
3155    if (v == NULL)
3156        goto onError;
3157    return unicode_result(v);
3158
3159  onError:
3160    return NULL;
3161}
3162
3163PyObject *
3164PyUnicode_AsDecodedUnicode(PyObject *unicode,
3165                           const char *encoding,
3166                           const char *errors)
3167{
3168    PyObject *v;
3169
3170    if (!PyUnicode_Check(unicode)) {
3171        PyErr_BadArgument();
3172        goto onError;
3173    }
3174
3175    if (encoding == NULL)
3176        encoding = PyUnicode_GetDefaultEncoding();
3177
3178    /* Decode via the codec registry */
3179    v = PyCodec_Decode(unicode, encoding, errors);
3180    if (v == NULL)
3181        goto onError;
3182    if (!PyUnicode_Check(v)) {
3183        PyErr_Format(PyExc_TypeError,
3184                     "decoder did not return a str object (type=%.400s)",
3185                     Py_TYPE(v)->tp_name);
3186        Py_DECREF(v);
3187        goto onError;
3188    }
3189    return unicode_result(v);
3190
3191  onError:
3192    return NULL;
3193}
3194
3195PyObject *
3196PyUnicode_Encode(const Py_UNICODE *s,
3197                 Py_ssize_t size,
3198                 const char *encoding,
3199                 const char *errors)
3200{
3201    PyObject *v, *unicode;
3202
3203    unicode = PyUnicode_FromUnicode(s, size);
3204    if (unicode == NULL)
3205        return NULL;
3206    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3207    Py_DECREF(unicode);
3208    return v;
3209}
3210
3211PyObject *
3212PyUnicode_AsEncodedObject(PyObject *unicode,
3213                          const char *encoding,
3214                          const char *errors)
3215{
3216    PyObject *v;
3217
3218    if (!PyUnicode_Check(unicode)) {
3219        PyErr_BadArgument();
3220        goto onError;
3221    }
3222
3223    if (encoding == NULL)
3224        encoding = PyUnicode_GetDefaultEncoding();
3225
3226    /* Encode via the codec registry */
3227    v = PyCodec_Encode(unicode, encoding, errors);
3228    if (v == NULL)
3229        goto onError;
3230    return v;
3231
3232  onError:
3233    return NULL;
3234}
3235
/* Locate the first wide character of the null-terminated string wstr that
   wcstombs() refuses to convert.  The string is fed to wcstombs() one
   character at a time (one surrogate pair at a time when wchar_t is 16 bits
   wide).

   Return the index of the offending character, or 0 if every character
   converted successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];    /* surrogate pair + null */
#else
    wchar_t unit[2];    /* single character + null */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            unit[2] = 0;
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
        unit[1] = 0;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3282
3283static int
3284locale_error_handler(const char *errors, int *surrogateescape)
3285{
3286    if (errors == NULL) {
3287        *surrogateescape = 0;
3288        return 0;
3289    }
3290
3291    if (strcmp(errors, "strict") == 0) {
3292        *surrogateescape = 0;
3293        return 0;
3294    }
3295    if (strcmp(errors, "surrogateescape") == 0) {
3296        *surrogateescape = 1;
3297        return 0;
3298    }
3299    PyErr_Format(PyExc_ValueError,
3300                 "only 'strict' and 'surrogateescape' error handlers "
3301                 "are supported, not '%s'",
3302                 errors);
3303    return -1;
3304}
3305
3306PyObject *
3307PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3308{
3309    Py_ssize_t wlen, wlen2;
3310    wchar_t *wstr;
3311    PyObject *bytes = NULL;
3312    char *errmsg;
3313    PyObject *reason;
3314    PyObject *exc;
3315    size_t error_pos;
3316    int surrogateescape;
3317
3318    if (locale_error_handler(errors, &surrogateescape) < 0)
3319        return NULL;
3320
3321    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3322    if (wstr == NULL)
3323        return NULL;
3324
3325    wlen2 = wcslen(wstr);
3326    if (wlen2 != wlen) {
3327        PyMem_Free(wstr);
3328        PyErr_SetString(PyExc_TypeError, "embedded null character");
3329        return NULL;
3330    }
3331
3332    if (surrogateescape) {
3333        /* locale encoding with surrogateescape */
3334        char *str;
3335
3336        str = _Py_wchar2char(wstr, &error_pos);
3337        if (str == NULL) {
3338            if (error_pos == (size_t)-1) {
3339                PyErr_NoMemory();
3340                PyMem_Free(wstr);
3341                return NULL;
3342            }
3343            else {
3344                goto encode_error;
3345            }
3346        }
3347        PyMem_Free(wstr);
3348
3349        bytes = PyBytes_FromString(str);
3350        PyMem_Free(str);
3351    }
3352    else {
3353        size_t len, len2;
3354
3355        len = wcstombs(NULL, wstr, 0);
3356        if (len == (size_t)-1) {
3357            error_pos = (size_t)-1;
3358            goto encode_error;
3359        }
3360
3361        bytes = PyBytes_FromStringAndSize(NULL, len);
3362        if (bytes == NULL) {
3363            PyMem_Free(wstr);
3364            return NULL;
3365        }
3366
3367        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3368        if (len2 == (size_t)-1 || len2 > len) {
3369            error_pos = (size_t)-1;
3370            goto encode_error;
3371        }
3372        PyMem_Free(wstr);
3373    }
3374    return bytes;
3375
3376encode_error:
3377    errmsg = strerror(errno);
3378    assert(errmsg != NULL);
3379
3380    if (error_pos == (size_t)-1)
3381        error_pos = wcstombs_errorpos(wstr);
3382
3383    PyMem_Free(wstr);
3384    Py_XDECREF(bytes);
3385
3386    if (errmsg != NULL) {
3387        size_t errlen;
3388        wstr = _Py_char2wchar(errmsg, &errlen);
3389        if (wstr != NULL) {
3390            reason = PyUnicode_FromWideChar(wstr, errlen);
3391            PyMem_Free(wstr);
3392        } else
3393            errmsg = NULL;
3394    }
3395    if (errmsg == NULL)
3396        reason = PyUnicode_FromString(
3397            "wcstombs() encountered an unencodable "
3398            "wide character");
3399    if (reason == NULL)
3400        return NULL;
3401
3402    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3403                                "locale", unicode,
3404                                (Py_ssize_t)error_pos,
3405                                (Py_ssize_t)(error_pos+1),
3406                                reason);
3407    Py_DECREF(reason);
3408    if (exc != NULL) {
3409        PyCodec_StrictErrors(exc);
3410        Py_XDECREF(exc);
3411    }
3412    return NULL;
3413}
3414
3415PyObject *
3416PyUnicode_EncodeFSDefault(PyObject *unicode)
3417{
3418#ifdef HAVE_MBCS
3419    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3420#elif defined(__APPLE__)
3421    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3422#else
3423    PyInterpreterState *interp = PyThreadState_GET()->interp;
3424    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3425       cannot use it to encode and decode filenames before it is loaded. Load
3426       the Python codec requires to encode at least its own filename. Use the C
3427       version of the locale codec until the codec registry is initialized and
3428       the Python codec is loaded.
3429
3430       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3431       cannot only rely on it: check also interp->fscodec_initialized for
3432       subinterpreters. */
3433    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3434        return PyUnicode_AsEncodedString(unicode,
3435                                         Py_FileSystemDefaultEncoding,
3436                                         "surrogateescape");
3437    }
3438    else {
3439        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3440    }
3441#endif
3442}
3443
3444PyObject *
3445PyUnicode_AsEncodedString(PyObject *unicode,
3446                          const char *encoding,
3447                          const char *errors)
3448{
3449    PyObject *v;
3450    char lower[11];  /* Enough for any encoding shortcut */
3451
3452    if (!PyUnicode_Check(unicode)) {
3453        PyErr_BadArgument();
3454        return NULL;
3455    }
3456
3457    /* Shortcuts for common default encodings */
3458    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3459        if ((strcmp(lower, "utf-8") == 0) ||
3460            (strcmp(lower, "utf8") == 0))
3461        {
3462            if (errors == NULL || strcmp(errors, "strict") == 0)
3463                return _PyUnicode_AsUTF8String(unicode, NULL);
3464            else
3465                return _PyUnicode_AsUTF8String(unicode, errors);
3466        }
3467        else if ((strcmp(lower, "latin-1") == 0) ||
3468                 (strcmp(lower, "latin1") == 0) ||
3469                 (strcmp(lower, "iso-8859-1") == 0))
3470            return _PyUnicode_AsLatin1String(unicode, errors);
3471#ifdef HAVE_MBCS
3472        else if (strcmp(lower, "mbcs") == 0)
3473            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3474#endif
3475        else if (strcmp(lower, "ascii") == 0)
3476            return _PyUnicode_AsASCIIString(unicode, errors);
3477    }
3478
3479    /* Encode via the codec registry */
3480    v = PyCodec_Encode(unicode, encoding, errors);
3481    if (v == NULL)
3482        return NULL;
3483
3484    /* The normal path */
3485    if (PyBytes_Check(v))
3486        return v;
3487
3488    /* If the codec returns a buffer, raise a warning and convert to bytes */
3489    if (PyByteArray_Check(v)) {
3490        int error;
3491        PyObject *b;
3492
3493        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3494            "encoder %s returned bytearray instead of bytes",
3495            encoding);
3496        if (error) {
3497            Py_DECREF(v);
3498            return NULL;
3499        }
3500
3501        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3502        Py_DECREF(v);
3503        return b;
3504    }
3505
3506    PyErr_Format(PyExc_TypeError,
3507                 "encoder did not return a bytes object (type=%.400s)",
3508                 Py_TYPE(v)->tp_name);
3509    Py_DECREF(v);
3510    return NULL;
3511}
3512
3513PyObject *
3514PyUnicode_AsEncodedUnicode(PyObject *unicode,
3515                           const char *encoding,
3516                           const char *errors)
3517{
3518    PyObject *v;
3519
3520    if (!PyUnicode_Check(unicode)) {
3521        PyErr_BadArgument();
3522        goto onError;
3523    }
3524
3525    if (encoding == NULL)
3526        encoding = PyUnicode_GetDefaultEncoding();
3527
3528    /* Encode via the codec registry */
3529    v = PyCodec_Encode(unicode, encoding, errors);
3530    if (v == NULL)
3531        goto onError;
3532    if (!PyUnicode_Check(v)) {
3533        PyErr_Format(PyExc_TypeError,
3534                     "encoder did not return an str object (type=%.400s)",
3535                     Py_TYPE(v)->tp_name);
3536        Py_DECREF(v);
3537        goto onError;
3538    }
3539    return v;
3540
3541  onError:
3542    return NULL;
3543}
3544
/* Locate the first byte sequence in the len bytes starting at str that
   mbrtowc() cannot convert (invalid or incomplete multibyte character).

   Return the byte offset of that sequence, or 0 if none was found.  When
   mbrtowc() is not available (HAVE_MBRTOWC undefined) the position cannot
   be determined and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3575
3576PyObject*
3577PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3578                              const char *errors)
3579{
3580    wchar_t smallbuf[256];
3581    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3582    wchar_t *wstr;
3583    size_t wlen, wlen2;
3584    PyObject *unicode;
3585    int surrogateescape;
3586    size_t error_pos;
3587    char *errmsg;
3588    PyObject *reason, *exc;
3589
3590    if (locale_error_handler(errors, &surrogateescape) < 0)
3591        return NULL;
3592
3593    if (str[len] != '\0' || len != strlen(str)) {
3594        PyErr_SetString(PyExc_TypeError, "embedded null character");
3595        return NULL;
3596    }
3597
3598    if (surrogateescape)
3599    {
3600        wstr = _Py_char2wchar(str, &wlen);
3601        if (wstr == NULL) {
3602            if (wlen == (size_t)-1)
3603                PyErr_NoMemory();
3604            else
3605                PyErr_SetFromErrno(PyExc_OSError);
3606            return NULL;
3607        }
3608
3609        unicode = PyUnicode_FromWideChar(wstr, wlen);
3610        PyMem_Free(wstr);
3611    }
3612    else {
3613#ifndef HAVE_BROKEN_MBSTOWCS
3614        wlen = mbstowcs(NULL, str, 0);
3615#else
3616        wlen = len;
3617#endif
3618        if (wlen == (size_t)-1)
3619            goto decode_error;
3620        if (wlen+1 <= smallbuf_len) {
3621            wstr = smallbuf;
3622        }
3623        else {
3624            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3625                return PyErr_NoMemory();
3626
3627            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3628            if (!wstr)
3629                return PyErr_NoMemory();
3630        }
3631
3632        /* This shouldn't fail now */
3633        wlen2 = mbstowcs(wstr, str, wlen+1);
3634        if (wlen2 == (size_t)-1) {
3635            if (wstr != smallbuf)
3636                PyMem_Free(wstr);
3637            goto decode_error;
3638        }
3639#ifdef HAVE_BROKEN_MBSTOWCS
3640        assert(wlen2 == wlen);
3641#endif
3642        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3643        if (wstr != smallbuf)
3644            PyMem_Free(wstr);
3645    }
3646    return unicode;
3647
3648decode_error:
3649    errmsg = strerror(errno);
3650    assert(errmsg != NULL);
3651
3652    error_pos = mbstowcs_errorpos(str, len);
3653    if (errmsg != NULL) {
3654        size_t errlen;
3655        wstr = _Py_char2wchar(errmsg, &errlen);
3656        if (wstr != NULL) {
3657            reason = PyUnicode_FromWideChar(wstr, errlen);
3658            PyMem_Free(wstr);
3659        } else
3660            errmsg = NULL;
3661    }
3662    if (errmsg == NULL)
3663        reason = PyUnicode_FromString(
3664            "mbstowcs() encountered an invalid multibyte sequence");
3665    if (reason == NULL)
3666        return NULL;
3667
3668    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3669                                "locale", str, len,
3670                                (Py_ssize_t)error_pos,
3671                                (Py_ssize_t)(error_pos+1),
3672                                reason);
3673    Py_DECREF(reason);
3674    if (exc != NULL) {
3675        PyCodec_StrictErrors(exc);
3676        Py_XDECREF(exc);
3677    }
3678    return NULL;
3679}
3680
3681PyObject*
3682PyUnicode_DecodeLocale(const char *str, const char *errors)
3683{
3684    Py_ssize_t size = (Py_ssize_t)strlen(str);
3685    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3686}
3687
3688
3689PyObject*
3690PyUnicode_DecodeFSDefault(const char *s) {
3691    Py_ssize_t size = (Py_ssize_t)strlen(s);
3692    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3693}
3694
3695PyObject*
3696PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3697{
3698#ifdef HAVE_MBCS
3699    return PyUnicode_DecodeMBCS(s, size, NULL);
3700#elif defined(__APPLE__)
3701    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3702#else
3703    PyInterpreterState *interp = PyThreadState_GET()->interp;
3704    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3705       cannot use it to encode and decode filenames before it is loaded. Load
3706       the Python codec requires to encode at least its own filename. Use the C
3707       version of the locale codec until the codec registry is initialized and
3708       the Python codec is loaded.
3709
3710       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3711       cannot only rely on it: check also interp->fscodec_initialized for
3712       subinterpreters. */
3713    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3714        return PyUnicode_Decode(s, size,
3715                                Py_FileSystemDefaultEncoding,
3716                                "surrogateescape");
3717    }
3718    else {
3719        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3720    }
3721#endif
3722}
3723
3724
3725int
3726_PyUnicode_HasNULChars(PyObject* s)
3727{
3728    static PyObject *nul = NULL;
3729
3730    if (nul == NULL)
3731        nul = PyUnicode_FromStringAndSize("\0", 1);
3732    if (nul == NULL)
3733        return -1;
3734    return PyUnicode_Contains(s, nul);
3735}
3736
3737
3738int
3739PyUnicode_FSConverter(PyObject* arg, void* addr)
3740{
3741    PyObject *output = NULL;
3742    Py_ssize_t size;
3743    void *data;
3744    if (arg == NULL) {
3745        Py_DECREF(*(PyObject**)addr);
3746        return 1;
3747    }
3748    if (PyBytes_Check(arg)) {
3749        output = arg;
3750        Py_INCREF(output);
3751    }
3752    else {
3753        arg = PyUnicode_FromObject(arg);
3754        if (!arg)
3755            return 0;
3756        output = PyUnicode_EncodeFSDefault(arg);
3757        Py_DECREF(arg);
3758        if (!output)
3759            return 0;
3760        if (!PyBytes_Check(output)) {
3761            Py_DECREF(output);
3762            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3763            return 0;
3764        }
3765    }
3766    size = PyBytes_GET_SIZE(output);
3767    data = PyBytes_AS_STRING(output);
3768    if (size != strlen(data)) {
3769        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3770        Py_DECREF(output);
3771        return 0;
3772    }
3773    *(PyObject**)addr = output;
3774    return Py_CLEANUP_SUPPORTED;
3775}
3776
3777
3778int
3779PyUnicode_FSDecoder(PyObject* arg, void* addr)
3780{
3781    PyObject *output = NULL;
3782    if (arg == NULL) {
3783        Py_DECREF(*(PyObject**)addr);
3784        return 1;
3785    }
3786    if (PyUnicode_Check(arg)) {
3787        if (PyUnicode_READY(arg) == -1)
3788            return 0;
3789        output = arg;
3790        Py_INCREF(output);
3791    }
3792    else {
3793        arg = PyBytes_FromObject(arg);
3794        if (!arg)
3795            return 0;
3796        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3797                                                  PyBytes_GET_SIZE(arg));
3798        Py_DECREF(arg);
3799        if (!output)
3800            return 0;
3801        if (!PyUnicode_Check(output)) {
3802            Py_DECREF(output);
3803            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3804            return 0;
3805        }
3806    }
3807    if (PyUnicode_READY(output) == -1) {
3808        Py_DECREF(output);
3809        return 0;
3810    }
3811    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3812                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3813        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3814        Py_DECREF(output);
3815        return 0;
3816    }
3817    *(PyObject**)addr = output;
3818    return Py_CLEANUP_SUPPORTED;
3819}
3820
3821
3822char*
3823PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3824{
3825    PyObject *bytes;
3826
3827    if (!PyUnicode_Check(unicode)) {
3828        PyErr_BadArgument();
3829        return NULL;
3830    }
3831    if (PyUnicode_READY(unicode) == -1)
3832        return NULL;
3833
3834    if (PyUnicode_UTF8(unicode) == NULL) {
3835        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3836        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3837        if (bytes == NULL)
3838            return NULL;
3839        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3840        if (_PyUnicode_UTF8(unicode) == NULL) {
3841            Py_DECREF(bytes);
3842            return NULL;
3843        }
3844        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3845        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3846                  PyBytes_AS_STRING(bytes),
3847                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3848        Py_DECREF(bytes);
3849    }
3850
3851    if (psize)
3852        *psize = PyUnicode_UTF8_LENGTH(unicode);
3853    return PyUnicode_UTF8(unicode);
3854}
3855
3856char*
3857PyUnicode_AsUTF8(PyObject *unicode)
3858{
3859    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3860}
3861
/* Return the wchar_t (Py_UNICODE) buffer of `unicode`, creating it on
   first use.

   unicode  must be a str object, otherwise PyErr_BadArgument() is raised
            and NULL is returned.
   size     if not NULL, receives PyUnicode_WSTR_LENGTH(unicode).

   If the object has no wstr buffer yet, one is allocated with
   PyObject_MALLOC, filled from the object's 1-, 2- or 4-byte data,
   NUL-terminated and stored on the object; the returned pointer therefore
   refers to memory attached to `unicode`.  On allocation failure
   MemoryError is set and NULL is returned.

   NOTE(review): the allocation sizes below are computed as
   sizeof(wchar_t) * (length + ...) without an overflow check. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* No wchar_t buffer yet: build one from the object's data */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* 16-bit wchar_t: code points above 0xFFFF need two units each,
               so count them first to size the buffer. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* one unit per character, one extra per surrogate pair, plus
               the terminating NUL */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* second pass: copy, splitting large code points */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* sanity check of the size computed in the first pass;
                   only effective when assertions are enabled */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: exactly one wchar_t per character,
               plus the terminating NUL */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* the wstr length is only stored for objects that are not
               compact ASCII */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* undo the allocation made above before aborting */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3973
3974Py_UNICODE *
3975PyUnicode_AsUnicode(PyObject *unicode)
3976{
3977    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3978}
3979
3980
3981Py_ssize_t
3982PyUnicode_GetSize(PyObject *unicode)
3983{
3984    if (!PyUnicode_Check(unicode)) {
3985        PyErr_BadArgument();
3986        goto onError;
3987    }
3988    return PyUnicode_GET_SIZE(unicode);
3989
3990  onError:
3991    return -1;
3992}
3993
3994Py_ssize_t
3995PyUnicode_GetLength(PyObject *unicode)
3996{
3997    if (!PyUnicode_Check(unicode)) {
3998        PyErr_BadArgument();
3999        return -1;
4000    }
4001    if (PyUnicode_READY(unicode) == -1)
4002        return -1;
4003    return PyUnicode_GET_LENGTH(unicode);
4004}
4005
4006Py_UCS4
4007PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4008{
4009    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4010        PyErr_BadArgument();
4011        return (Py_UCS4)-1;
4012    }
4013    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4014        PyErr_SetString(PyExc_IndexError, "string index out of range");
4015        return (Py_UCS4)-1;
4016    }
4017    return PyUnicode_READ_CHAR(unicode, index);
4018}
4019
4020int
4021PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4022{
4023    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4024        PyErr_BadArgument();
4025        return -1;
4026    }
4027    assert(PyUnicode_IS_READY(unicode));
4028    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4029        PyErr_SetString(PyExc_IndexError, "string index out of range");
4030        return -1;
4031    }
4032    if (unicode_check_modifiable(unicode))
4033        return -1;
4034    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4035        PyErr_SetString(PyExc_ValueError, "character out of range");
4036        return -1;
4037    }
4038    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4039                    index, ch);
4040    return 0;
4041}
4042
/* Return the name of the default encoding, which is the constant string
   "utf-8".  The caller must not modify or free the result. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4048
4049/* create or adjust a UnicodeDecodeError */
4050static void
4051make_decode_exception(PyObject **exceptionObject,
4052                      const char *encoding,
4053                      const char *input, Py_ssize_t length,
4054                      Py_ssize_t startpos, Py_ssize_t endpos,
4055                      const char *reason)
4056{
4057    if (*exceptionObject == NULL) {
4058        *exceptionObject = PyUnicodeDecodeError_Create(
4059            encoding, input, length, startpos, endpos, reason);
4060    }
4061    else {
4062        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4063            goto onError;
4064        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4065            goto onError;
4066        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4067            goto onError;
4068    }
4069    return;
4070
4071onError:
4072    Py_DECREF(*exceptionObject);
4073    *exceptionObject = NULL;
4074}
4075
4076/* error handling callback helper:
4077   build arguments, call the callback and check the arguments,
4078   if no exception occurred, copy the replacement to the output
4079   and adjust various state variables.
4080   return 0 on success, -1 on error
4081*/
4082
4083static int
4084unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
4085                                 const char *encoding, const char *reason,
4086                                 const char **input, const char **inend, Py_ssize_t *startinpos,
4087                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4088                                 PyObject **output, Py_ssize_t *outpos)
4089{
4090    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4091
4092    PyObject *restuple = NULL;
4093    PyObject *repunicode = NULL;
4094    Py_ssize_t outsize;
4095    Py_ssize_t insize;
4096    Py_ssize_t requiredsize;
4097    Py_ssize_t newpos;
4098    PyObject *inputobj = NULL;
4099    int res = -1;
4100
4101    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4102        outsize = PyUnicode_GET_LENGTH(*output);
4103    else
4104        outsize = _PyUnicode_WSTR_LENGTH(*output);
4105
4106    if (*errorHandler == NULL) {
4107        *errorHandler = PyCodec_LookupError(errors);
4108        if (*errorHandler == NULL)
4109            goto onError;
4110    }
4111
4112    make_decode_exception(exceptionObject,
4113        encoding,
4114        *input, *inend - *input,
4115        *startinpos, *endinpos,
4116        reason);
4117    if (*exceptionObject == NULL)
4118        goto onError;
4119
4120    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4121    if (restuple == NULL)
4122        goto onError;
4123    if (!PyTuple_Check(restuple)) {
4124        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4125        goto onError;
4126    }
4127    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4128        goto onError;
4129    if (PyUnicode_READY(repunicode) == -1)
4130        goto onError;
4131
4132    /* Copy back the bytes variables, which might have been modified by the
4133       callback */
4134    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4135    if (!inputobj)
4136        goto onError;
4137    if (!PyBytes_Check(inputobj)) {
4138        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4139    }
4140    *input = PyBytes_AS_STRING(inputobj);
4141    insize = PyBytes_GET_SIZE(inputobj);
4142    *inend = *input + insize;
4143    /* we can DECREF safely, as the exception has another reference,
4144       so the object won't go away. */
4145    Py_DECREF(inputobj);
4146
4147    if (newpos<0)
4148        newpos = insize+newpos;
4149    if (newpos<0 || newpos>insize) {
4150        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4151        goto onError;
4152    }
4153
4154    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4155        /* need more space? (at least enough for what we
4156           have+the replacement+the rest of the string (starting
4157           at the new input position), so we won't have to check space
4158           when there are no errors in the rest of the string) */
4159        Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4160        requiredsize = *outpos + replen + insize-newpos;
4161        if (requiredsize > outsize) {
4162            if (requiredsize<2*outsize)
4163                requiredsize = 2*outsize;
4164            if (unicode_resize(output, requiredsize) < 0)
4165                goto onError;
4166        }
4167        if (unicode_widen(output, *outpos,
4168                          PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
4169            goto onError;
4170        _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
4171        *outpos += replen;
4172    }
4173    else {
4174        wchar_t *repwstr;
4175        Py_ssize_t repwlen;
4176        repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4177        if (repwstr == NULL)
4178            goto onError;
4179        /* need more space? (at least enough for what we
4180           have+the replacement+the rest of the string (starting
4181           at the new input position), so we won't have to check space
4182           when there are no errors in the rest of the string) */
4183        requiredsize = *outpos + repwlen + insize-newpos;
4184        if (requiredsize > outsize) {
4185            if (requiredsize < 2*outsize)
4186                requiredsize = 2*outsize;
4187            if (unicode_resize(output, requiredsize) < 0)
4188                goto onError;
4189        }
4190        wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4191        *outpos += repwlen;
4192    }
4193    *endinpos = newpos;
4194    *inptr = *input + newpos;
4195
4196    /* we made it! */
4197    res = 0;
4198
4199  onError:
4200    Py_XDECREF(restuple);
4201    return res;
4202}
4203
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  All macros in this section
   evaluate their argument more than once, so the argument must be free of
   side effects. */

/* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything
   else -> 63; the result is only meaningful if IS_BASE64(c) holds) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?  (Higher bits
   of n are masked off.) */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 never decode directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first, so utf7_category is only indexed with
 * values 1..127; NUL and everything >= 128 are never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4284
4285PyObject *
4286PyUnicode_DecodeUTF7(const char *s,
4287                     Py_ssize_t size,
4288                     const char *errors)
4289{
4290    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4291}
4292
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size   input bytes.
 * errors    error handler name, passed to
 *           unicode_decode_call_errorhandler().
 * consumed  if not NULL, receives the number of input bytes consumed; an
 *           unterminated shift sequence at the end of the input is then
 *           not an error, the output and the count are rolled back to the
 *           '+' that opened it.  If NULL, an inconsistent unterminated
 *           shift sequence is reported through the error handler.
 *
 * Returns a new str object, or NULL with an exception set. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyObject *unicode;
    const char *errmsg = "";
    int inShift = 0;                    /* non-zero inside a base-64 section */
    Py_ssize_t shiftOutStart;           /* output position when the section began */
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* pending bits, most recent in the low end */
    Py_UCS4 surrogate = 0;              /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return unicode;
    }

    shiftOutStart = outpos = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* also entered via goto from the end-of-string error handling
           below, when the error handler moved the read position back */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as is and
                               go on to handle outCh normally */
                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* flush a high surrogate that never got its partner */
                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts: used for error reporting
               and for the stateful roll-back at the end */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (unicode_putchar(&unicode, &outpos, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = outpos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            s++;
        }
        else {
            /* a byte above 127 outside of a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* the handler may replace the input buffer and move s; starts, e
           and s are passed by address for that reason */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos))
                goto onError;
            /* NOTE(review): inShift, surrogate and the bit buffer are not
               reset before resuming, so decoding restarts still inside the
               shift sequence -- confirm this is intended. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            outpos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* trim the output to what was actually written */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
4483
4484
/* Encode the str object `str` as UTF-7 and return a new bytes object.

   str               string to encode; made ready first.
   base64SetO        if non-zero, "Set O" characters are base-64 encoded
                     instead of being written directly.
   base64WhiteSpace  if non-zero, whitespace characters are base-64 encoded
                     instead of being written directly.
   errors            not referenced by this function.

   Characters above 0xFFFF are written as a surrogate pair inside the
   base-64 section.  Returns NULL with an exception set on failure. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    Py_ssize_t allocated;
    int inShift = 0;                    /* non-zero inside a base-64 section */
    Py_ssize_t i;
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* bits waiting to be written out */
    char * out;
    char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    allocated = 8 * len;
    /* detect overflow of the multiplication above */
    if (allocated / 8 != len)
        return PyErr_NoMemory();

    v = PyBytes_FromStringAndSize(NULL, allocated);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* open a base-64 section and encode ch inside it */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* append ch to the bit buffer as one or two 16-bit units and write
           out every complete group of 6 bits */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* flush the remaining bits, padded with zero bits on the right */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    /* a section still open at the end of the string is closed explicitly */
    if (inShift)
        *out++ = '-';
    /* shrink the result from the worst-case size to what was written */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
4589PyObject *
4590PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4591                     Py_ssize_t size,
4592                     int base64SetO,
4593                     int base64WhiteSpace,
4594                     const char *errors)
4595{
4596    PyObject *result;
4597    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4598    if (tmp == NULL)
4599        return NULL;
4600    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4601                                   base64WhiteSpace, errors);
4602    Py_DECREF(tmp);
4603    return result;
4604}
4605
4606#undef IS_BASE64
4607#undef FROM_BASE64
4608#undef TO_BASE64
4609#undef DECODE_DIRECT
4610#undef ENCODE_DIRECT
4611
4612/* --- UTF-8 Codec -------------------------------------------------------- */
4613
4614PyObject *
4615PyUnicode_DecodeUTF8(const char *s,
4616                     Py_ssize_t size,
4617                     const char *errors)
4618{
4619    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4620}
4621
4622#include "stringlib/asciilib.h"
4623#include "stringlib/codecs.h"
4624#include "stringlib/undef.h"
4625
4626#include "stringlib/ucs1lib.h"
4627#include "stringlib/codecs.h"
4628#include "stringlib/undef.h"
4629
4630#include "stringlib/ucs2lib.h"
4631#include "stringlib/codecs.h"
4632#include "stringlib/undef.h"
4633
4634#include "stringlib/ucs4lib.h"
4635#include "stringlib/codecs.h"
4636#include "stringlib/undef.h"
4637
4638/* Mask to quickly check whether a C 'long' contains a
4639   non-ASCII, UTF8-encoded char. */
4640#if (SIZEOF_LONG == 8)
4641# define ASCII_CHAR_MASK 0x8080808080808080UL
4642#elif (SIZEOF_LONG == 4)
4643# define ASCII_CHAR_MASK 0x80808080UL
4644#else
4645# error C 'long' size should be either 4 or 8!
4646#endif
4647
4648static Py_ssize_t
4649ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4650{
4651    const char *p = start;
4652    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4653
4654#if SIZEOF_LONG <= SIZEOF_VOID_P
4655    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4656    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4657        /* Fast path, see in STRINGLIB(utf8_decode) for
4658           an explanation. */
4659        /* Help register allocation */
4660        register const char *_p = p;
4661        register Py_UCS1 * q = dest;
4662        while (_p < aligned_end) {
4663            unsigned long value = *(const unsigned long *) _p;
4664            if (value & ASCII_CHAR_MASK)
4665                break;
4666            *((unsigned long *)q) = value;
4667            _p += SIZEOF_LONG;
4668            q += SIZEOF_LONG;
4669        }
4670        p = _p;
4671        while (p < end) {
4672            if ((unsigned char)*p & 0x80)
4673                break;
4674            *q++ = *p++;
4675        }
4676        return p - start;
4677    }
4678#endif
4679    while (p < end) {
4680        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4681           for an explanation. */
4682        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4683            /* Help register allocation */
4684            register const char *_p = p;
4685            while (_p < aligned_end) {
4686                unsigned long value = *(unsigned long *) _p;
4687                if (value & ASCII_CHAR_MASK)
4688                    break;
4689                _p += SIZEOF_LONG;
4690            }
4691            p = _p;
4692            if (_p == end)
4693                break;
4694        }
4695        if ((unsigned char)*p & 0x80)
4696            break;
4697        ++p;
4698    }
4699    memcpy(dest, start, p - start);
4700    return p - start;
4701}
4702
4703PyObject *
4704PyUnicode_DecodeUTF8Stateful(const char *s,
4705                             Py_ssize_t size,
4706                             const char *errors,
4707                             Py_ssize_t *consumed)
4708{
4709    PyObject *unicode;
4710    const char *starts = s;
4711    const char *end = s + size;
4712    Py_ssize_t outpos;
4713
4714    Py_ssize_t startinpos;
4715    Py_ssize_t endinpos;
4716    const char *errmsg = "";
4717    PyObject *errorHandler = NULL;
4718    PyObject *exc = NULL;
4719
4720    if (size == 0) {
4721        if (consumed)
4722            *consumed = 0;
4723        Py_INCREF(unicode_empty);
4724        return unicode_empty;
4725    }
4726
4727    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4728    if (size == 1 && (unsigned char)s[0] < 128) {
4729        if (consumed)
4730            *consumed = 1;
4731        return get_latin1_char((unsigned char)s[0]);
4732    }
4733
4734    unicode = PyUnicode_New(size, 127);
4735    if (!unicode)
4736        return NULL;
4737
4738    outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4739    s += outpos;
4740    while (s < end) {
4741        Py_UCS4 ch;
4742        int kind = PyUnicode_KIND(unicode);
4743        if (kind == PyUnicode_1BYTE_KIND) {
4744            if (PyUnicode_IS_ASCII(unicode))
4745                ch = asciilib_utf8_decode(&s, end,
4746                        PyUnicode_1BYTE_DATA(unicode), &outpos);
4747            else
4748                ch = ucs1lib_utf8_decode(&s, end,
4749                        PyUnicode_1BYTE_DATA(unicode), &outpos);
4750        } else if (kind == PyUnicode_2BYTE_KIND) {
4751            ch = ucs2lib_utf8_decode(&s, end,
4752                    PyUnicode_2BYTE_DATA(unicode), &outpos);
4753        } else {
4754            assert(kind == PyUnicode_4BYTE_KIND);
4755            ch = ucs4lib_utf8_decode(&s, end,
4756                    PyUnicode_4BYTE_DATA(unicode), &outpos);
4757        }
4758
4759        switch (ch) {
4760        case 0:
4761            if (s == end || consumed)
4762                goto End;
4763            errmsg = "unexpected end of data";
4764            startinpos = s - starts;
4765            endinpos = startinpos + 1;
4766            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4767                endinpos++;
4768            break;
4769        case 1:
4770            errmsg = "invalid start byte";
4771            startinpos = s - starts;
4772            endinpos = startinpos + 1;
4773            break;
4774        case 2:
4775            errmsg = "invalid continuation byte";
4776            startinpos = s - starts;
4777            endinpos = startinpos + 1;
4778            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4779                endinpos++;
4780            break;
4781        default:
4782            if (unicode_putchar(&unicode, &outpos, ch) < 0)
4783                goto onError;
4784            continue;
4785        }
4786
4787        if (unicode_decode_call_errorhandler(
4788                errors, &errorHandler,
4789                "utf-8", errmsg,
4790                &starts, &end, &startinpos, &endinpos, &exc, &s,
4791                &unicode, &outpos))
4792            goto onError;
4793    }
4794
4795End:
4796    if (unicode_resize(&unicode, outpos) < 0)
4797        goto onError;
4798
4799    if (consumed)
4800        *consumed = s - starts;
4801
4802    Py_XDECREF(errorHandler);
4803    Py_XDECREF(exc);
4804    assert(_PyUnicode_CheckConsistency(unicode, 1));
4805    return unicode;
4806
4807onError:
4808    Py_XDECREF(errorHandler);
4809    Py_XDECREF(exc);
4810    Py_XDECREF(unicode);
4811    return NULL;
4812}
4813
4814#ifdef __APPLE__
4815
4816/* Simplified UTF-8 decoder using surrogateescape error handler,
4817   used to decode the command line arguments on Mac OS X. */
4818
4819wchar_t*
4820_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4821{
4822    const char *e;
4823    wchar_t *unicode;
4824    Py_ssize_t outpos;
4825
4826    /* Note: size will always be longer than the resulting Unicode
4827       character count */
4828    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4829        PyErr_NoMemory();
4830        return NULL;
4831    }
4832    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4833    if (!unicode)
4834        return NULL;
4835
4836    /* Unpack UTF-8 encoded data */
4837    e = s + size;
4838    outpos = 0;
4839    while (s < e) {
4840        Py_UCS4 ch;
4841#if SIZEOF_WCHAR_T == 4
4842        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4843#else
4844        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4845#endif
4846        if (ch > 0xFF) {
4847#if SIZEOF_WCHAR_T == 4
4848            assert(0);
4849#else
4850            assert(Py_UNICODE_IS_SURROGATE(ch));
4851            /*  compute and append the two surrogates: */
4852            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4853            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4854#endif
4855        }
4856        else {
4857            if (!ch && s == e)
4858                break;
4859            /* surrogateescape */
4860            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4861        }
4862    }
4863    unicode[outpos] = L'\0';
4864    return unicode;
4865}
4866
4867#endif /* __APPLE__ */
4868
4869/* Primary internal function which creates utf8 encoded bytes objects.
4870
4871   Allocation strategy:  if the string is short, convert into a stack buffer
4872   and allocate exactly as much space needed at the end.  Else allocate the
4873   maximum possible needed (4 result bytes per Unicode character), and return
4874   the excess memory at the end.
4875*/
4876PyObject *
4877_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4878{
4879    enum PyUnicode_Kind kind;
4880    void *data;
4881    Py_ssize_t size;
4882
4883    if (!PyUnicode_Check(unicode)) {
4884        PyErr_BadArgument();
4885        return NULL;
4886    }
4887
4888    if (PyUnicode_READY(unicode) == -1)
4889        return NULL;
4890
4891    if (PyUnicode_UTF8(unicode))
4892        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4893                                         PyUnicode_UTF8_LENGTH(unicode));
4894
4895    kind = PyUnicode_KIND(unicode);
4896    data = PyUnicode_DATA(unicode);
4897    size = PyUnicode_GET_LENGTH(unicode);
4898
4899    switch (kind) {
4900    default:
4901        assert(0);
4902    case PyUnicode_1BYTE_KIND:
4903        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4904        assert(!PyUnicode_IS_ASCII(unicode));
4905        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4906    case PyUnicode_2BYTE_KIND:
4907        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4908    case PyUnicode_4BYTE_KIND:
4909        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4910    }
4911}
4912
4913PyObject *
4914PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4915                     Py_ssize_t size,
4916                     const char *errors)
4917{
4918    PyObject *v, *unicode;
4919
4920    unicode = PyUnicode_FromUnicode(s, size);
4921    if (unicode == NULL)
4922        return NULL;
4923    v = _PyUnicode_AsUTF8String(unicode, errors);
4924    Py_DECREF(unicode);
4925    return v;
4926}
4927
4928PyObject *
4929PyUnicode_AsUTF8String(PyObject *unicode)
4930{
4931    return _PyUnicode_AsUTF8String(unicode, NULL);
4932}
4933
4934/* --- UTF-32 Codec ------------------------------------------------------- */
4935
4936PyObject *
4937PyUnicode_DecodeUTF32(const char *s,
4938                      Py_ssize_t size,
4939                      const char *errors,
4940                      int *byteorder)
4941{
4942    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4943}
4944
4945PyObject *
4946PyUnicode_DecodeUTF32Stateful(const char *s,
4947                              Py_ssize_t size,
4948                              const char *errors,
4949                              int *byteorder,
4950                              Py_ssize_t *consumed)
4951{
4952    const char *starts = s;
4953    Py_ssize_t startinpos;
4954    Py_ssize_t endinpos;
4955    Py_ssize_t outpos;
4956    PyObject *unicode;
4957    const unsigned char *q, *e;
4958    int bo = 0;       /* assume native ordering by default */
4959    const char *errmsg = "";
4960    /* Offsets from q for retrieving bytes in the right order. */
4961#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4962    int iorder[] = {0, 1, 2, 3};
4963#else
4964    int iorder[] = {3, 2, 1, 0};
4965#endif
4966    PyObject *errorHandler = NULL;
4967    PyObject *exc = NULL;
4968
4969    q = (unsigned char *)s;
4970    e = q + size;
4971
4972    if (byteorder)
4973        bo = *byteorder;
4974
4975    /* Check for BOM marks (U+FEFF) in the input and adjust current
4976       byte order setting accordingly. In native mode, the leading BOM
4977       mark is skipped, in all other modes, it is copied to the output
4978       stream as-is (giving a ZWNBSP character). */
4979    if (bo == 0) {
4980        if (size >= 4) {
4981            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4982                (q[iorder[1]] << 8) | q[iorder[0]];
4983#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4984            if (bom == 0x0000FEFF) {
4985                q += 4;
4986                bo = -1;
4987            }
4988            else if (bom == 0xFFFE0000) {
4989                q += 4;
4990                bo = 1;
4991            }
4992#else
4993            if (bom == 0x0000FEFF) {
4994                q += 4;
4995                bo = 1;
4996            }
4997            else if (bom == 0xFFFE0000) {
4998                q += 4;
4999                bo = -1;
5000            }
5001#endif
5002        }
5003    }
5004
5005    if (bo == -1) {
5006        /* force LE */
5007        iorder[0] = 0;
5008        iorder[1] = 1;
5009        iorder[2] = 2;
5010        iorder[3] = 3;
5011    }
5012    else if (bo == 1) {
5013        /* force BE */
5014        iorder[0] = 3;
5015        iorder[1] = 2;
5016        iorder[2] = 1;
5017        iorder[3] = 0;
5018    }
5019
5020    /* This might be one to much, because of a BOM */
5021    unicode = PyUnicode_New((size+3)/4, 127);
5022    if (!unicode)
5023        return NULL;
5024    if (size == 0)
5025        return unicode;
5026    outpos = 0;
5027
5028    while (q < e) {
5029        Py_UCS4 ch;
5030        /* remaining bytes at the end? (size should be divisible by 4) */
5031        if (e-q<4) {
5032            if (consumed)
5033                break;
5034            errmsg = "truncated data";
5035            startinpos = ((const char *)q)-starts;
5036            endinpos = ((const char *)e)-starts;
5037            goto utf32Error;
5038            /* The remaining input chars are ignored if the callback
5039               chooses to skip the input */
5040        }
5041        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5042            (q[iorder[1]] << 8) | q[iorder[0]];
5043
5044        if (ch >= 0x110000)
5045        {
5046            errmsg = "codepoint not in range(0x110000)";
5047            startinpos = ((const char *)q)-starts;
5048            endinpos = startinpos+4;
5049            goto utf32Error;
5050        }
5051        if (unicode_putchar(&unicode, &outpos, ch) < 0)
5052            goto onError;
5053        q += 4;
5054        continue;
5055      utf32Error:
5056        if (unicode_decode_call_errorhandler(
5057                errors, &errorHandler,
5058                "utf32", errmsg,
5059                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5060                &unicode, &outpos))
5061            goto onError;
5062    }
5063
5064    if (byteorder)
5065        *byteorder = bo;
5066
5067    if (consumed)
5068        *consumed = (const char *)q-starts;
5069
5070    /* Adjust length */
5071    if (unicode_resize(&unicode, outpos) < 0)
5072        goto onError;
5073
5074    Py_XDECREF(errorHandler);
5075    Py_XDECREF(exc);
5076    return unicode_result(unicode);
5077
5078  onError:
5079    Py_DECREF(unicode);
5080    Py_XDECREF(errorHandler);
5081    Py_XDECREF(exc);
5082    return NULL;
5083}
5084
5085PyObject *
5086_PyUnicode_EncodeUTF32(PyObject *str,
5087                       const char *errors,
5088                       int byteorder)
5089{
5090    int kind;
5091    void *data;
5092    Py_ssize_t len;
5093    PyObject *v;
5094    unsigned char *p;
5095    Py_ssize_t nsize, bytesize, i;
5096    /* Offsets from p for storing byte pairs in the right order. */
5097#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5098    int iorder[] = {0, 1, 2, 3};
5099#else
5100    int iorder[] = {3, 2, 1, 0};
5101#endif
5102
5103#define STORECHAR(CH)                           \
5104    do {                                        \
5105        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5106        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5107        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5108        p[iorder[0]] = (CH) & 0xff;             \
5109        p += 4;                                 \
5110    } while(0)
5111
5112    if (!PyUnicode_Check(str)) {
5113        PyErr_BadArgument();
5114        return NULL;
5115    }
5116    if (PyUnicode_READY(str) == -1)
5117        return NULL;
5118    kind = PyUnicode_KIND(str);
5119    data = PyUnicode_DATA(str);
5120    len = PyUnicode_GET_LENGTH(str);
5121
5122    nsize = len + (byteorder == 0);
5123    bytesize = nsize * 4;
5124    if (bytesize / 4 != nsize)
5125        return PyErr_NoMemory();
5126    v = PyBytes_FromStringAndSize(NULL, bytesize);
5127    if (v == NULL)
5128        return NULL;
5129
5130    p = (unsigned char *)PyBytes_AS_STRING(v);
5131    if (byteorder == 0)
5132        STORECHAR(0xFEFF);
5133    if (len == 0)
5134        goto done;
5135
5136    if (byteorder == -1) {
5137        /* force LE */
5138        iorder[0] = 0;
5139        iorder[1] = 1;
5140        iorder[2] = 2;
5141        iorder[3] = 3;
5142    }
5143    else if (byteorder == 1) {
5144        /* force BE */
5145        iorder[0] = 3;
5146        iorder[1] = 2;
5147        iorder[2] = 1;
5148        iorder[3] = 0;
5149    }
5150
5151    for (i = 0; i < len; i++)
5152        STORECHAR(PyUnicode_READ(kind, data, i));
5153
5154  done:
5155    return v;
5156#undef STORECHAR
5157}
5158
5159PyObject *
5160PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5161                      Py_ssize_t size,
5162                      const char *errors,
5163                      int byteorder)
5164{
5165    PyObject *result;
5166    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5167    if (tmp == NULL)
5168        return NULL;
5169    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5170    Py_DECREF(tmp);
5171    return result;
5172}
5173
5174PyObject *
5175PyUnicode_AsUTF32String(PyObject *unicode)
5176{
5177    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5178}
5179
5180/* --- UTF-16 Codec ------------------------------------------------------- */
5181
5182PyObject *
5183PyUnicode_DecodeUTF16(const char *s,
5184                      Py_ssize_t size,
5185                      const char *errors,
5186                      int *byteorder)
5187{
5188    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5189}
5190
5191PyObject *
5192PyUnicode_DecodeUTF16Stateful(const char *s,
5193                              Py_ssize_t size,
5194                              const char *errors,
5195                              int *byteorder,
5196                              Py_ssize_t *consumed)
5197{
5198    const char *starts = s;
5199    Py_ssize_t startinpos;
5200    Py_ssize_t endinpos;
5201    Py_ssize_t outpos;
5202    PyObject *unicode;
5203    const unsigned char *q, *e;
5204    int bo = 0;       /* assume native ordering by default */
5205    int native_ordering;
5206    const char *errmsg = "";
5207    PyObject *errorHandler = NULL;
5208    PyObject *exc = NULL;
5209
5210    q = (unsigned char *)s;
5211    e = q + size;
5212
5213    if (byteorder)
5214        bo = *byteorder;
5215
5216    /* Check for BOM marks (U+FEFF) in the input and adjust current
5217       byte order setting accordingly. In native mode, the leading BOM
5218       mark is skipped, in all other modes, it is copied to the output
5219       stream as-is (giving a ZWNBSP character). */
5220    if (bo == 0 && size >= 2) {
5221        const Py_UCS4 bom = (q[1] << 8) | q[0];
5222        if (bom == 0xFEFF) {
5223            q += 2;
5224            bo = -1;
5225        }
5226        else if (bom == 0xFFFE) {
5227            q += 2;
5228            bo = 1;
5229        }
5230        if (byteorder)
5231            *byteorder = bo;
5232    }
5233
5234    if (q == e) {
5235        if (consumed)
5236            *consumed = size;
5237        Py_INCREF(unicode_empty);
5238        return unicode_empty;
5239    }
5240
5241#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5242    native_ordering = bo <= 0;
5243#else
5244    native_ordering = bo >= 0;
5245#endif
5246
5247    /* Note: size will always be longer than the resulting Unicode
5248       character count */
5249    unicode = PyUnicode_New((e - q + 1) / 2, 127);
5250    if (!unicode)
5251        return NULL;
5252
5253    outpos = 0;
5254    while (1) {
5255        Py_UCS4 ch = 0;
5256        if (e - q >= 2) {
5257            int kind = PyUnicode_KIND(unicode);
5258            if (kind == PyUnicode_1BYTE_KIND) {
5259                if (PyUnicode_IS_ASCII(unicode))
5260                    ch = asciilib_utf16_decode(&q, e,
5261                            PyUnicode_1BYTE_DATA(unicode), &outpos,
5262                            native_ordering);
5263                else
5264                    ch = ucs1lib_utf16_decode(&q, e,
5265                            PyUnicode_1BYTE_DATA(unicode), &outpos,
5266                            native_ordering);
5267            } else if (kind == PyUnicode_2BYTE_KIND) {
5268                ch = ucs2lib_utf16_decode(&q, e,
5269                        PyUnicode_2BYTE_DATA(unicode), &outpos,
5270                        native_ordering);
5271            } else {
5272                assert(kind == PyUnicode_4BYTE_KIND);
5273                ch = ucs4lib_utf16_decode(&q, e,
5274                        PyUnicode_4BYTE_DATA(unicode), &outpos,
5275                        native_ordering);
5276            }
5277        }
5278
5279        switch (ch)
5280        {
5281        case 0:
5282            /* remaining byte at the end? (size should be even) */
5283            if (q == e || consumed)
5284                goto End;
5285            errmsg = "truncated data";
5286            startinpos = ((const char *)q) - starts;
5287            endinpos = ((const char *)e) - starts;
5288            break;
5289            /* The remaining input chars are ignored if the callback
5290               chooses to skip the input */
5291        case 1:
5292            errmsg = "unexpected end of data";
5293            startinpos = ((const char *)q) - 2 - starts;
5294            endinpos = ((const char *)e) - starts;
5295            break;
5296        case 2:
5297            errmsg = "illegal encoding";
5298            startinpos = ((const char *)q) - 2 - starts;
5299            endinpos = startinpos + 2;
5300            break;
5301        case 3:
5302            errmsg = "illegal UTF-16 surrogate";
5303            startinpos = ((const char *)q) - 4 - starts;
5304            endinpos = startinpos + 2;
5305            break;
5306        default:
5307            if (unicode_putchar(&unicode, &outpos, ch) < 0)
5308                goto onError;
5309            continue;
5310        }
5311
5312        if (unicode_decode_call_errorhandler(
5313                errors,
5314                &errorHandler,
5315                "utf16", errmsg,
5316                &starts,
5317                (const char **)&e,
5318                &startinpos,
5319                &endinpos,
5320                &exc,
5321                (const char **)&q,
5322                &unicode,
5323                &outpos))
5324            goto onError;
5325    }
5326
5327End:
5328    if (consumed)
5329        *consumed = (const char *)q-starts;
5330
5331    /* Adjust length */
5332    if (unicode_resize(&unicode, outpos) < 0)
5333        goto onError;
5334
5335    Py_XDECREF(errorHandler);
5336    Py_XDECREF(exc);
5337    return unicode_result(unicode);
5338
5339  onError:
5340    Py_DECREF(unicode);
5341    Py_XDECREF(errorHandler);
5342    Py_XDECREF(exc);
5343    return NULL;
5344}
5345
5346PyObject *
5347_PyUnicode_EncodeUTF16(PyObject *str,
5348                       const char *errors,
5349                       int byteorder)
5350{
5351    enum PyUnicode_Kind kind;
5352    const void *data;
5353    Py_ssize_t len;
5354    PyObject *v;
5355    unsigned short *out;
5356    Py_ssize_t bytesize;
5357    Py_ssize_t pairs;
5358#ifdef WORDS_BIGENDIAN
5359    int native_ordering = byteorder >= 0;
5360#else
5361    int native_ordering = byteorder <= 0;
5362#endif
5363
5364    if (!PyUnicode_Check(str)) {
5365        PyErr_BadArgument();
5366        return NULL;
5367    }
5368    if (PyUnicode_READY(str) == -1)
5369        return NULL;
5370    kind = PyUnicode_KIND(str);
5371    data = PyUnicode_DATA(str);
5372    len = PyUnicode_GET_LENGTH(str);
5373
5374    pairs = 0;
5375    if (kind == PyUnicode_4BYTE_KIND) {
5376        const Py_UCS4 *in = (const Py_UCS4 *)data;
5377        const Py_UCS4 *end = in + len;
5378        while (in < end)
5379            if (*in++ >= 0x10000)
5380                pairs++;
5381    }
5382    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5383        return PyErr_NoMemory();
5384    bytesize = (len + pairs + (byteorder == 0)) * 2;
5385    v = PyBytes_FromStringAndSize(NULL, bytesize);
5386    if (v == NULL)
5387        return NULL;
5388
5389    /* output buffer is 2-bytes aligned */
5390    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5391    out = (unsigned short *)PyBytes_AS_STRING(v);
5392    if (byteorder == 0)
5393        *out++ = 0xFEFF;
5394    if (len == 0)
5395        goto done;
5396
5397    switch (kind) {
5398    case PyUnicode_1BYTE_KIND: {
5399        ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5400        break;
5401    }
5402    case PyUnicode_2BYTE_KIND: {
5403        ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5404        break;
5405    }
5406    case PyUnicode_4BYTE_KIND: {
5407        ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5408        break;
5409    }
5410    default:
5411        assert(0);
5412    }
5413
5414  done:
5415    return v;
5416}
5417
5418PyObject *
5419PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5420                      Py_ssize_t size,
5421                      const char *errors,
5422                      int byteorder)
5423{
5424    PyObject *result;
5425    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5426    if (tmp == NULL)
5427        return NULL;
5428    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5429    Py_DECREF(tmp);
5430    return result;
5431}
5432
5433PyObject *
5434PyUnicode_AsUTF16String(PyObject *unicode)
5435{
5436    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5437}
5438
5439/* --- Unicode Escape Codec ----------------------------------------------- */
5440
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s, size: the escaped input and its length in bytes.

   Returns -1 if size is negative, if the input contains a non-ASCII
   byte, if a backslash is the last byte, or if any escape is found
   which may cause the string to pop out of ASCII range (octal, \x, \u,
   \U, \N).  Otherwise returns the length of the required buffer to
   hold the decoded string.
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming the end pointer: computing p + size
       with a negative size is out-of-range pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* simple escapes decode to exactly one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5495
5496static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5497
5498PyObject *
5499PyUnicode_DecodeUnicodeEscape(const char *s,
5500                              Py_ssize_t size,
5501                              const char *errors)
5502{
5503    const char *starts = s;
5504    Py_ssize_t startinpos;
5505    Py_ssize_t endinpos;
5506    int j;
5507    PyObject *v;
5508    const char *end;
5509    char* message;
5510    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5511    PyObject *errorHandler = NULL;
5512    PyObject *exc = NULL;
5513    Py_ssize_t len;
5514    Py_ssize_t i;
5515
5516    len = length_of_escaped_ascii_string(s, size);
5517
5518    /* After length_of_escaped_ascii_string() there are two alternatives,
5519       either the string is pure ASCII with named escapes like \n, etc.
5520       and we determined it's exact size (common case)
5521       or it contains \x, \u, ... escape sequences.  then we create a
5522       legacy wchar string and resize it at the end of this function. */
5523    if (len >= 0) {
5524        v = PyUnicode_New(len, 127);
5525        if (!v)
5526            goto onError;
5527        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5528    }
5529    else {
5530        /* Escaped strings will always be longer than the resulting
5531           Unicode string, so we start with size here and then reduce the
5532           length after conversion to the true value.
5533           (but if the error callback returns a long replacement string
5534           we'll have to allocate more space) */
5535        v = PyUnicode_New(size, 127);
5536        if (!v)
5537            goto onError;
5538        len = size;
5539    }
5540
5541    if (size == 0)
5542        return v;
5543    i = 0;
5544    end = s + size;
5545
5546    while (s < end) {
5547        unsigned char c;
5548        Py_UCS4 x;
5549        int digits;
5550
5551        /* The only case in which i == ascii_length is a backslash
5552           followed by a newline. */
5553        assert(i <= len);
5554
5555        /* Non-escape characters are interpreted as Unicode ordinals */
5556        if (*s != '\\') {
5557            if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5558                goto onError;
5559            continue;
5560        }
5561
5562        startinpos = s-starts;
5563        /* \ - Escapes */
5564        s++;
5565        c = *s++;
5566        if (s > end)
5567            c = '\0'; /* Invalid after \ */
5568
5569        /* The only case in which i == ascii_length is a backslash
5570           followed by a newline. */
5571        assert(i < len || (i == len && c == '\n'));
5572
5573        switch (c) {
5574
5575            /* \x escapes */
5576#define WRITECHAR(ch)                                   \
5577            do {                                        \
5578                if (unicode_putchar(&v, &i, ch) < 0)    \
5579                    goto onError;                       \
5580            }while(0)
5581
5582        case '\n': break;
5583        case '\\': WRITECHAR('\\'); break;
5584        case '\'': WRITECHAR('\''); break;
5585        case '\"': WRITECHAR('\"'); break;
5586        case 'b': WRITECHAR('\b'); break;
5587        /* FF */
5588        case 'f': WRITECHAR('\014'); break;
5589        case 't': WRITECHAR('\t'); break;
5590        case 'n': WRITECHAR('\n'); break;
5591        case 'r': WRITECHAR('\r'); break;
5592        /* VT */
5593        case 'v': WRITECHAR('\013'); break;
5594        /* BEL, not classic C */
5595        case 'a': WRITECHAR('\007'); break;
5596
5597            /* \OOO (octal) escapes */
5598        case '0': case '1': case '2': case '3':
5599        case '4': case '5': case '6': case '7':
5600            x = s[-1] - '0';
5601            if (s < end && '0' <= *s && *s <= '7') {
5602                x = (x<<3) + *s++ - '0';
5603                if (s < end && '0' <= *s && *s <= '7')
5604                    x = (x<<3) + *s++ - '0';
5605            }
5606            WRITECHAR(x);
5607            break;
5608
5609            /* hex escapes */
5610            /* \xXX */
5611        case 'x':
5612            digits = 2;
5613            message = "truncated \\xXX escape";
5614            goto hexescape;
5615
5616            /* \uXXXX */
5617        case 'u':
5618            digits = 4;
5619            message = "truncated \\uXXXX escape";
5620            goto hexescape;
5621
5622            /* \UXXXXXXXX */
5623        case 'U':
5624            digits = 8;
5625            message = "truncated \\UXXXXXXXX escape";
5626        hexescape:
5627            chr = 0;
5628            if (s+digits>end) {
5629                endinpos = size;
5630                if (unicode_decode_call_errorhandler(
5631                        errors, &errorHandler,
5632                        "unicodeescape", "end of string in escape sequence",
5633                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5634                        &v, &i))
5635                    goto onError;
5636                goto nextByte;
5637            }
5638            for (j = 0; j < digits; ++j) {
5639                c = (unsigned char) s[j];
5640                if (!Py_ISXDIGIT(c)) {
5641                    endinpos = (s+j+1)-starts;
5642                    if (unicode_decode_call_errorhandler(
5643                            errors, &errorHandler,
5644                            "unicodeescape", message,
5645                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5646                            &v, &i))
5647                        goto onError;
5648                    len = PyUnicode_GET_LENGTH(v);
5649                    goto nextByte;
5650                }
5651                chr = (chr<<4) & ~0xF;
5652                if (c >= '0' && c <= '9')
5653                    chr += c - '0';
5654                else if (c >= 'a' && c <= 'f')
5655                    chr += 10 + c - 'a';
5656                else
5657                    chr += 10 + c - 'A';
5658            }
5659            s += j;
5660            if (chr == 0xffffffff && PyErr_Occurred())
5661                /* _decoding_error will have already written into the
5662                   target buffer. */
5663                break;
5664        store:
5665            /* when we get here, chr is a 32-bit unicode character */
5666            if (chr <= MAX_UNICODE) {
5667                WRITECHAR(chr);
5668            } else {
5669                endinpos = s-starts;
5670                if (unicode_decode_call_errorhandler(
5671                        errors, &errorHandler,
5672                        "unicodeescape", "illegal Unicode character",
5673                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5674                        &v, &i))
5675                    goto onError;
5676            }
5677            break;
5678
5679            /* \N{name} */
5680        case 'N':
5681            message = "malformed \\N character escape";
5682            if (ucnhash_CAPI == NULL) {
5683                /* load the unicode data module */
5684                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5685                                                PyUnicodeData_CAPSULE_NAME, 1);
5686                if (ucnhash_CAPI == NULL)
5687                    goto ucnhashError;
5688            }
5689            if (*s == '{') {
5690                const char *start = s+1;
5691                /* look for the closing brace */
5692                while (*s != '}' && s < end)
5693                    s++;
5694                if (s > start && s < end && *s == '}') {
5695                    /* found a name.  look it up in the unicode database */
5696                    message = "unknown Unicode character name";
5697                    s++;
5698                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5699                                              &chr, 0))
5700                        goto store;
5701                }
5702            }
5703            endinpos = s-starts;
5704            if (unicode_decode_call_errorhandler(
5705                    errors, &errorHandler,
5706                    "unicodeescape", message,
5707                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5708                    &v, &i))
5709                goto onError;
5710            break;
5711
5712        default:
5713            if (s > end) {
5714                message = "\\ at end of string";
5715                s--;
5716                endinpos = s-starts;
5717                if (unicode_decode_call_errorhandler(
5718                        errors, &errorHandler,
5719                        "unicodeescape", message,
5720                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5721                        &v, &i))
5722                    goto onError;
5723            }
5724            else {
5725                WRITECHAR('\\');
5726                WRITECHAR(s[-1]);
5727            }
5728            break;
5729        }
5730      nextByte:
5731        ;
5732    }
5733#undef WRITECHAR
5734
5735    if (unicode_resize(&v, i) < 0)
5736        goto onError;
5737    Py_XDECREF(errorHandler);
5738    Py_XDECREF(exc);
5739    return unicode_result(v);
5740
5741  ucnhashError:
5742    PyErr_SetString(
5743        PyExc_UnicodeError,
5744        "\\N escapes not supported (can't load unicodedata module)"
5745        );
5746    Py_XDECREF(v);
5747    Py_XDECREF(errorHandler);
5748    Py_XDECREF(exc);
5749    return NULL;
5750
5751  onError:
5752    Py_XDECREF(v);
5753    Py_XDECREF(errorHandler);
5754    Py_XDECREF(exc);
5755    return NULL;
5756}
5757
/* Return a Unicode-Escape string version of the Unicode object.

   The result is a bytes object holding only the escaped text; the
   function takes no quoting option and adds no surrounding quotes.

*/
5764
5765PyObject *
5766PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5767{
5768    Py_ssize_t i, len;
5769    PyObject *repr;
5770    char *p;
5771    int kind;
5772    void *data;
5773    Py_ssize_t expandsize = 0;
5774
5775    /* Initial allocation is based on the longest-possible unichr
5776       escape.
5777
5778       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5779       unichr, so in this case it's the longest unichr escape. In
5780       narrow (UTF-16) builds this is five chars per source unichr
5781       since there are two unichrs in the surrogate pair, so in narrow
5782       (UTF-16) builds it's not the longest unichr escape.
5783
5784       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5785       so in the narrow (UTF-16) build case it's the longest unichr
5786       escape.
5787    */
5788
5789    if (!PyUnicode_Check(unicode)) {
5790        PyErr_BadArgument();
5791        return NULL;
5792    }
5793    if (PyUnicode_READY(unicode) == -1)
5794        return NULL;
5795    len = PyUnicode_GET_LENGTH(unicode);
5796    kind = PyUnicode_KIND(unicode);
5797    data = PyUnicode_DATA(unicode);
5798    switch (kind) {
5799    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5800    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5801    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5802    }
5803
5804    if (len == 0)
5805        return PyBytes_FromStringAndSize(NULL, 0);
5806
5807    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5808        return PyErr_NoMemory();
5809
5810    repr = PyBytes_FromStringAndSize(NULL,
5811                                     2
5812                                     + expandsize*len
5813                                     + 1);
5814    if (repr == NULL)
5815        return NULL;
5816
5817    p = PyBytes_AS_STRING(repr);
5818
5819    for (i = 0; i < len; i++) {
5820        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5821
5822        /* Escape backslashes */
5823        if (ch == '\\') {
5824            *p++ = '\\';
5825            *p++ = (char) ch;
5826            continue;
5827        }
5828
5829        /* Map 21-bit characters to '\U00xxxxxx' */
5830        else if (ch >= 0x10000) {
5831            assert(ch <= MAX_UNICODE);
5832            *p++ = '\\';
5833            *p++ = 'U';
5834            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5835            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5836            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5837            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5838            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5839            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5840            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5841            *p++ = Py_hexdigits[ch & 0x0000000F];
5842            continue;
5843        }
5844
5845        /* Map 16-bit characters to '\uxxxx' */
5846        if (ch >= 256) {
5847            *p++ = '\\';
5848            *p++ = 'u';
5849            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5850            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5851            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5852            *p++ = Py_hexdigits[ch & 0x000F];
5853        }
5854
5855        /* Map special whitespace to '\t', \n', '\r' */
5856        else if (ch == '\t') {
5857            *p++ = '\\';
5858            *p++ = 't';
5859        }
5860        else if (ch == '\n') {
5861            *p++ = '\\';
5862            *p++ = 'n';
5863        }
5864        else if (ch == '\r') {
5865            *p++ = '\\';
5866            *p++ = 'r';
5867        }
5868
5869        /* Map non-printable US ASCII to '\xhh' */
5870        else if (ch < ' ' || ch >= 0x7F) {
5871            *p++ = '\\';
5872            *p++ = 'x';
5873            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5874            *p++ = Py_hexdigits[ch & 0x000F];
5875        }
5876
5877        /* Copy everything else as-is */
5878        else
5879            *p++ = (char) ch;
5880    }
5881
5882    assert(p - PyBytes_AS_STRING(repr) > 0);
5883    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5884        return NULL;
5885    return repr;
5886}
5887
5888PyObject *
5889PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5890                              Py_ssize_t size)
5891{
5892    PyObject *result;
5893    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5894    if (tmp == NULL)
5895        return NULL;
5896    result = PyUnicode_AsUnicodeEscapeString(tmp);
5897    Py_DECREF(tmp);
5898    return result;
5899}
5900
5901/* --- Raw Unicode Escape Codec ------------------------------------------- */
5902
5903PyObject *
5904PyUnicode_DecodeRawUnicodeEscape(const char *s,
5905                                 Py_ssize_t size,
5906                                 const char *errors)
5907{
5908    const char *starts = s;
5909    Py_ssize_t startinpos;
5910    Py_ssize_t endinpos;
5911    Py_ssize_t outpos;
5912    PyObject *v;
5913    const char *end;
5914    const char *bs;
5915    PyObject *errorHandler = NULL;
5916    PyObject *exc = NULL;
5917
5918    /* Escaped strings will always be longer than the resulting
5919       Unicode string, so we start with size here and then reduce the
5920       length after conversion to the true value. (But decoding error
5921       handler might have to resize the string) */
5922    v = PyUnicode_New(size, 127);
5923    if (v == NULL)
5924        goto onError;
5925    if (size == 0)
5926        return v;
5927    outpos = 0;
5928    end = s + size;
5929    while (s < end) {
5930        unsigned char c;
5931        Py_UCS4 x;
5932        int i;
5933        int count;
5934
5935        /* Non-escape characters are interpreted as Unicode ordinals */
5936        if (*s != '\\') {
5937            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5938                goto onError;
5939            continue;
5940        }
5941        startinpos = s-starts;
5942
5943        /* \u-escapes are only interpreted iff the number of leading
5944           backslashes if odd */
5945        bs = s;
5946        for (;s < end;) {
5947            if (*s != '\\')
5948                break;
5949            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5950                goto onError;
5951        }
5952        if (((s - bs) & 1) == 0 ||
5953            s >= end ||
5954            (*s != 'u' && *s != 'U')) {
5955            continue;
5956        }
5957        outpos--;
5958        count = *s=='u' ? 4 : 8;
5959        s++;
5960
5961        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5962        for (x = 0, i = 0; i < count; ++i, ++s) {
5963            c = (unsigned char)*s;
5964            if (!Py_ISXDIGIT(c)) {
5965                endinpos = s-starts;
5966                if (unicode_decode_call_errorhandler(
5967                        errors, &errorHandler,
5968                        "rawunicodeescape", "truncated \\uXXXX",
5969                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5970                        &v, &outpos))
5971                    goto onError;
5972                goto nextByte;
5973            }
5974            x = (x<<4) & ~0xF;
5975            if (c >= '0' && c <= '9')
5976                x += c - '0';
5977            else if (c >= 'a' && c <= 'f')
5978                x += 10 + c - 'a';
5979            else
5980                x += 10 + c - 'A';
5981        }
5982        if (x <= MAX_UNICODE) {
5983            if (unicode_putchar(&v, &outpos, x) < 0)
5984                goto onError;
5985        } else {
5986            endinpos = s-starts;
5987            if (unicode_decode_call_errorhandler(
5988                    errors, &errorHandler,
5989                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5990                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5991                    &v, &outpos))
5992                goto onError;
5993        }
5994      nextByte:
5995        ;
5996    }
5997    if (unicode_resize(&v, outpos) < 0)
5998        goto onError;
5999    Py_XDECREF(errorHandler);
6000    Py_XDECREF(exc);
6001    return unicode_result(v);
6002
6003  onError:
6004    Py_XDECREF(v);
6005    Py_XDECREF(errorHandler);
6006    Py_XDECREF(exc);
6007    return NULL;
6008}
6009
6010
6011PyObject *
6012PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6013{
6014    PyObject *repr;
6015    char *p;
6016    char *q;
6017    Py_ssize_t expandsize, pos;
6018    int kind;
6019    void *data;
6020    Py_ssize_t len;
6021
6022    if (!PyUnicode_Check(unicode)) {
6023        PyErr_BadArgument();
6024        return NULL;
6025    }
6026    if (PyUnicode_READY(unicode) == -1)
6027        return NULL;
6028    kind = PyUnicode_KIND(unicode);
6029    data = PyUnicode_DATA(unicode);
6030    len = PyUnicode_GET_LENGTH(unicode);
6031    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6032       bytes, and 1 byte characters 4. */
6033    expandsize = kind * 2 + 2;
6034
6035    if (len > PY_SSIZE_T_MAX / expandsize)
6036        return PyErr_NoMemory();
6037
6038    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6039    if (repr == NULL)
6040        return NULL;
6041    if (len == 0)
6042        return repr;
6043
6044    p = q = PyBytes_AS_STRING(repr);
6045    for (pos = 0; pos < len; pos++) {
6046        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6047        /* Map 32-bit characters to '\Uxxxxxxxx' */
6048        if (ch >= 0x10000) {
6049            assert(ch <= MAX_UNICODE);
6050            *p++ = '\\';
6051            *p++ = 'U';
6052            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6053            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6054            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6055            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6056            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6057            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6058            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6059            *p++ = Py_hexdigits[ch & 15];
6060        }
6061        /* Map 16-bit characters to '\uxxxx' */
6062        else if (ch >= 256) {
6063            *p++ = '\\';
6064            *p++ = 'u';
6065            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6066            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6067            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6068            *p++ = Py_hexdigits[ch & 15];
6069        }
6070        /* Copy everything else as-is */
6071        else
6072            *p++ = (char) ch;
6073    }
6074
6075    assert(p > q);
6076    if (_PyBytes_Resize(&repr, p - q) < 0)
6077        return NULL;
6078    return repr;
6079}
6080
6081PyObject *
6082PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6083                                 Py_ssize_t size)
6084{
6085    PyObject *result;
6086    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6087    if (tmp == NULL)
6088        return NULL;
6089    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6090    Py_DECREF(tmp);
6091    return result;
6092}
6093
6094/* --- Unicode Internal Codec ------------------------------------------- */
6095
6096PyObject *
6097_PyUnicode_DecodeUnicodeInternal(const char *s,
6098                                 Py_ssize_t size,
6099                                 const char *errors)
6100{
6101    const char *starts = s;
6102    Py_ssize_t startinpos;
6103    Py_ssize_t endinpos;
6104    Py_ssize_t outpos;
6105    PyObject *v;
6106    const char *end;
6107    const char *reason;
6108    PyObject *errorHandler = NULL;
6109    PyObject *exc = NULL;
6110
6111    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6112                     "unicode_internal codec has been deprecated",
6113                     1))
6114        return NULL;
6115
6116    /* XXX overflow detection missing */
6117    v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
6118    if (v == NULL)
6119        goto onError;
6120    if (PyUnicode_GET_LENGTH(v) == 0)
6121        return v;
6122    outpos = 0;
6123    end = s + size;
6124
6125    while (s < end) {
6126        Py_UNICODE uch;
6127        Py_UCS4 ch;
6128        /* We copy the raw representation one byte at a time because the
6129           pointer may be unaligned (see test_codeccallbacks). */
6130        ((char *) &uch)[0] = s[0];
6131        ((char *) &uch)[1] = s[1];
6132#ifdef Py_UNICODE_WIDE
6133        ((char *) &uch)[2] = s[2];
6134        ((char *) &uch)[3] = s[3];
6135#endif
6136        ch = uch;
6137
6138        /* We have to sanity check the raw data, otherwise doom looms for
6139           some malformed UCS-4 data. */
6140        if (
6141#ifdef Py_UNICODE_WIDE
6142            ch > 0x10ffff ||
6143#endif
6144            end-s < Py_UNICODE_SIZE
6145            )
6146        {
6147            startinpos = s - starts;
6148            if (end-s < Py_UNICODE_SIZE) {
6149                endinpos = end-starts;
6150                reason = "truncated input";
6151            }
6152            else {
6153                endinpos = s - starts + Py_UNICODE_SIZE;
6154                reason = "illegal code point (> 0x10FFFF)";
6155            }
6156            if (unicode_decode_call_errorhandler(
6157                    errors, &errorHandler,
6158                    "unicode_internal", reason,
6159                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6160                    &v, &outpos))
6161                goto onError;
6162            continue;
6163        }
6164
6165        s += Py_UNICODE_SIZE;
6166#ifndef Py_UNICODE_WIDE
6167        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
6168        {
6169            Py_UNICODE uch2;
6170            ((char *) &uch2)[0] = s[0];
6171            ((char *) &uch2)[1] = s[1];
6172            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6173            {
6174                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6175                s += Py_UNICODE_SIZE;
6176            }
6177        }
6178#endif
6179
6180        if (unicode_putchar(&v, &outpos, ch) < 0)
6181            goto onError;
6182    }
6183
6184    if (unicode_resize(&v, outpos) < 0)
6185        goto onError;
6186    Py_XDECREF(errorHandler);
6187    Py_XDECREF(exc);
6188    return unicode_result(v);
6189
6190  onError:
6191    Py_XDECREF(v);
6192    Py_XDECREF(errorHandler);
6193    Py_XDECREF(exc);
6194    return NULL;
6195}
6196
6197/* --- Latin-1 Codec ------------------------------------------------------ */
6198
6199PyObject *
6200PyUnicode_DecodeLatin1(const char *s,
6201                       Py_ssize_t size,
6202                       const char *errors)
6203{
6204    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6205    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6206}
6207
6208/* create or adjust a UnicodeEncodeError */
6209static void
6210make_encode_exception(PyObject **exceptionObject,
6211                      const char *encoding,
6212                      PyObject *unicode,
6213                      Py_ssize_t startpos, Py_ssize_t endpos,
6214                      const char *reason)
6215{
6216    if (*exceptionObject == NULL) {
6217        *exceptionObject = PyObject_CallFunction(
6218            PyExc_UnicodeEncodeError, "sOnns",
6219            encoding, unicode, startpos, endpos, reason);
6220    }
6221    else {
6222        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6223            goto onError;
6224        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6225            goto onError;
6226        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6227            goto onError;
6228        return;
6229      onError:
6230        Py_DECREF(*exceptionObject);
6231        *exceptionObject = NULL;
6232    }
6233}
6234
6235/* raises a UnicodeEncodeError */
6236static void
6237raise_encode_exception(PyObject **exceptionObject,
6238                       const char *encoding,
6239                       PyObject *unicode,
6240                       Py_ssize_t startpos, Py_ssize_t endpos,
6241                       const char *reason)
6242{
6243    make_encode_exception(exceptionObject,
6244                          encoding, unicode, startpos, endpos, reason);
6245    if (*exceptionObject != NULL)
6246        PyCodec_StrictErrors(*exceptionObject);
6247}
6248
6249/* error handling callback helper:
6250   build arguments, call the callback and check the arguments,
6251   put the result into newpos and return the replacement string, which
6252   has to be freed by the caller */
6253static PyObject *
6254unicode_encode_call_errorhandler(const char *errors,
6255                                 PyObject **errorHandler,
6256                                 const char *encoding, const char *reason,
6257                                 PyObject *unicode, PyObject **exceptionObject,
6258                                 Py_ssize_t startpos, Py_ssize_t endpos,
6259                                 Py_ssize_t *newpos)
6260{
6261    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6262    Py_ssize_t len;
6263    PyObject *restuple;
6264    PyObject *resunicode;
6265
6266    if (*errorHandler == NULL) {
6267        *errorHandler = PyCodec_LookupError(errors);
6268        if (*errorHandler == NULL)
6269            return NULL;
6270    }
6271
6272    if (PyUnicode_READY(unicode) == -1)
6273        return NULL;
6274    len = PyUnicode_GET_LENGTH(unicode);
6275
6276    make_encode_exception(exceptionObject,
6277                          encoding, unicode, startpos, endpos, reason);
6278    if (*exceptionObject == NULL)
6279        return NULL;
6280
6281    restuple = PyObject_CallFunctionObjArgs(
6282        *errorHandler, *exceptionObject, NULL);
6283    if (restuple == NULL)
6284        return NULL;
6285    if (!PyTuple_Check(restuple)) {
6286        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6287        Py_DECREF(restuple);
6288        return NULL;
6289    }
6290    if (!PyArg_ParseTuple(restuple, argparse,
6291                          &resunicode, newpos)) {
6292        Py_DECREF(restuple);
6293        return NULL;
6294    }
6295    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6296        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6297        Py_DECREF(restuple);
6298        return NULL;
6299    }
6300    if (*newpos<0)
6301        *newpos = len + *newpos;
6302    if (*newpos<0 || *newpos>len) {
6303        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6304        Py_DECREF(restuple);
6305        return NULL;
6306    }
6307    Py_INCREF(resunicode);
6308    Py_DECREF(restuple);
6309    return resunicode;
6310}
6311
6312static PyObject *
6313unicode_encode_ucs1(PyObject *unicode,
6314                    const char *errors,
6315                    unsigned int limit)
6316{
6317    /* input state */
6318    Py_ssize_t pos=0, size;
6319    int kind;
6320    void *data;
6321    /* output object */
6322    PyObject *res;
6323    /* pointer into the output */
6324    char *str;
6325    /* current output position */
6326    Py_ssize_t ressize;
6327    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6328    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6329    PyObject *errorHandler = NULL;
6330    PyObject *exc = NULL;
6331    /* the following variable is used for caching string comparisons
6332     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6333    int known_errorHandler = -1;
6334
6335    if (PyUnicode_READY(unicode) == -1)
6336        return NULL;
6337    size = PyUnicode_GET_LENGTH(unicode);
6338    kind = PyUnicode_KIND(unicode);
6339    data = PyUnicode_DATA(unicode);
6340    /* allocate enough for a simple encoding without
6341       replacements, if we need more, we'll resize */
6342    if (size == 0)
6343        return PyBytes_FromStringAndSize(NULL, 0);
6344    res = PyBytes_FromStringAndSize(NULL, size);
6345    if (res == NULL)
6346        return NULL;
6347    str = PyBytes_AS_STRING(res);
6348    ressize = size;
6349
6350    while (pos < size) {
6351        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6352
6353        /* can we encode this? */
6354        if (c<limit) {
6355            /* no overflow check, because we know that the space is enough */
6356            *str++ = (char)c;
6357            ++pos;
6358        }
6359        else {
6360            Py_ssize_t requiredsize;
6361            PyObject *repunicode;
6362            Py_ssize_t repsize, newpos, respos, i;
6363            /* startpos for collecting unencodable chars */
6364            Py_ssize_t collstart = pos;
6365            Py_ssize_t collend = pos;
6366            /* find all unecodable characters */
6367            while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6368                ++collend;
6369            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6370            if (known_errorHandler==-1) {
6371                if ((errors==NULL) || (!strcmp(errors, "strict")))
6372                    known_errorHandler = 1;
6373                else if (!strcmp(errors, "replace"))
6374                    known_errorHandler = 2;
6375                else if (!strcmp(errors, "ignore"))
6376                    known_errorHandler = 3;
6377                else if (!strcmp(errors, "xmlcharrefreplace"))
6378                    known_errorHandler = 4;
6379                else
6380                    known_errorHandler = 0;
6381            }
6382            switch (known_errorHandler) {
6383            case 1: /* strict */
6384                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6385                goto onError;
6386            case 2: /* replace */
6387                while (collstart++<collend)
6388                    *str++ = '?'; /* fall through */
6389            case 3: /* ignore */
6390                pos = collend;
6391                break;
6392            case 4: /* xmlcharrefreplace */
6393                respos = str - PyBytes_AS_STRING(res);
6394                /* determine replacement size */
6395                for (i = collstart, repsize = 0; i < collend; ++i) {
6396                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6397                    if (ch < 10)
6398                        repsize += 2+1+1;
6399                    else if (ch < 100)
6400                        repsize += 2+2+1;
6401                    else if (ch < 1000)
6402                        repsize += 2+3+1;
6403                    else if (ch < 10000)
6404                        repsize += 2+4+1;
6405                    else if (ch < 100000)
6406                        repsize += 2+5+1;
6407                    else if (ch < 1000000)
6408                        repsize += 2+6+1;
6409                    else {
6410                        assert(ch <= MAX_UNICODE);
6411                        repsize += 2+7+1;
6412                    }
6413                }
6414                requiredsize = respos+repsize+(size-collend);
6415                if (requiredsize > ressize) {
6416                    if (requiredsize<2*ressize)
6417                        requiredsize = 2*ressize;
6418                    if (_PyBytes_Resize(&res, requiredsize))
6419                        goto onError;
6420                    str = PyBytes_AS_STRING(res) + respos;
6421                    ressize = requiredsize;
6422                }
6423                /* generate replacement */
6424                for (i = collstart; i < collend; ++i) {
6425                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6426                }
6427                pos = collend;
6428                break;
6429            default:
6430                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6431                                                              encoding, reason, unicode, &exc,
6432                                                              collstart, collend, &newpos);
6433                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6434                                           PyUnicode_READY(repunicode) == -1))
6435                    goto onError;
6436                if (PyBytes_Check(repunicode)) {
6437                    /* Directly copy bytes result to output. */
6438                    repsize = PyBytes_Size(repunicode);
6439                    if (repsize > 1) {
6440                        /* Make room for all additional bytes. */
6441                        respos = str - PyBytes_AS_STRING(res);
6442                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6443                            Py_DECREF(repunicode);
6444                            goto onError;
6445                        }
6446                        str = PyBytes_AS_STRING(res) + respos;
6447                        ressize += repsize-1;
6448                    }
6449                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6450                    str += repsize;
6451                    pos = newpos;
6452                    Py_DECREF(repunicode);
6453                    break;
6454                }
6455                /* need more space? (at least enough for what we
6456                   have+the replacement+the rest of the string, so
6457                   we won't have to check space for encodable characters) */
6458                respos = str - PyBytes_AS_STRING(res);
6459                repsize = PyUnicode_GET_LENGTH(repunicode);
6460                requiredsize = respos+repsize+(size-collend);
6461                if (requiredsize > ressize) {
6462                    if (requiredsize<2*ressize)
6463                        requiredsize = 2*ressize;
6464                    if (_PyBytes_Resize(&res, requiredsize)) {
6465                        Py_DECREF(repunicode);
6466                        goto onError;
6467                    }
6468                    str = PyBytes_AS_STRING(res) + respos;
6469                    ressize = requiredsize;
6470                }
6471                /* check if there is anything unencodable in the replacement
6472                   and copy it to the output */
6473                for (i = 0; repsize-->0; ++i, ++str) {
6474                    c = PyUnicode_READ_CHAR(repunicode, i);
6475                    if (c >= limit) {
6476                        raise_encode_exception(&exc, encoding, unicode,
6477                                               pos, pos+1, reason);
6478                        Py_DECREF(repunicode);
6479                        goto onError;
6480                    }
6481                    *str = (char)c;
6482                }
6483                pos = newpos;
6484                Py_DECREF(repunicode);
6485            }
6486        }
6487    }
6488    /* Resize if we allocated to much */
6489    size = str - PyBytes_AS_STRING(res);
6490    if (size < ressize) { /* If this falls res will be NULL */
6491        assert(size >= 0);
6492        if (_PyBytes_Resize(&res, size) < 0)
6493            goto onError;
6494    }
6495
6496    Py_XDECREF(errorHandler);
6497    Py_XDECREF(exc);
6498    return res;
6499
6500  onError:
6501    Py_XDECREF(res);
6502    Py_XDECREF(errorHandler);
6503    Py_XDECREF(exc);
6504    return NULL;
6505}
6506
6507/* Deprecated */
6508PyObject *
6509PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6510                       Py_ssize_t size,
6511                       const char *errors)
6512{
6513    PyObject *result;
6514    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6515    if (unicode == NULL)
6516        return NULL;
6517    result = unicode_encode_ucs1(unicode, errors, 256);
6518    Py_DECREF(unicode);
6519    return result;
6520}
6521
6522PyObject *
6523_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6524{
6525    if (!PyUnicode_Check(unicode)) {
6526        PyErr_BadArgument();
6527        return NULL;
6528    }
6529    if (PyUnicode_READY(unicode) == -1)
6530        return NULL;
6531    /* Fast path: if it is a one-byte string, construct
6532       bytes object directly. */
6533    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6534        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6535                                         PyUnicode_GET_LENGTH(unicode));
6536    /* Non-Latin-1 characters present. Defer to above function to
6537       raise the exception. */
6538    return unicode_encode_ucs1(unicode, errors, 256);
6539}
6540
6541PyObject*
6542PyUnicode_AsLatin1String(PyObject *unicode)
6543{
6544    return _PyUnicode_AsLatin1String(unicode, NULL);
6545}
6546
6547/* --- 7-bit ASCII Codec -------------------------------------------------- */
6548
6549PyObject *
6550PyUnicode_DecodeASCII(const char *s,
6551                      Py_ssize_t size,
6552                      const char *errors)
6553{
6554    const char *starts = s;
6555    PyObject *unicode;
6556    int kind;
6557    void *data;
6558    Py_ssize_t startinpos;
6559    Py_ssize_t endinpos;
6560    Py_ssize_t outpos;
6561    const char *e;
6562    PyObject *errorHandler = NULL;
6563    PyObject *exc = NULL;
6564
6565    if (size == 0) {
6566        Py_INCREF(unicode_empty);
6567        return unicode_empty;
6568    }
6569
6570    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6571    if (size == 1 && (unsigned char)s[0] < 128)
6572        return get_latin1_char((unsigned char)s[0]);
6573
6574    unicode = PyUnicode_New(size, 127);
6575    if (unicode == NULL)
6576        goto onError;
6577
6578    e = s + size;
6579    data = PyUnicode_1BYTE_DATA(unicode);
6580    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6581    if (outpos == size)
6582        return unicode;
6583
6584    s += outpos;
6585    kind = PyUnicode_1BYTE_KIND;
6586    while (s < e) {
6587        register unsigned char c = (unsigned char)*s;
6588        if (c < 128) {
6589            PyUnicode_WRITE(kind, data, outpos++, c);
6590            ++s;
6591        }
6592        else {
6593            startinpos = s-starts;
6594            endinpos = startinpos + 1;
6595            if (unicode_decode_call_errorhandler(
6596                    errors, &errorHandler,
6597                    "ascii", "ordinal not in range(128)",
6598                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6599                    &unicode, &outpos))
6600                goto onError;
6601            kind = PyUnicode_KIND(unicode);
6602            data = PyUnicode_DATA(unicode);
6603        }
6604    }
6605    if (unicode_resize(&unicode, outpos) < 0)
6606        goto onError;
6607    Py_XDECREF(errorHandler);
6608    Py_XDECREF(exc);
6609    assert(_PyUnicode_CheckConsistency(unicode, 1));
6610    return unicode;
6611
6612  onError:
6613    Py_XDECREF(unicode);
6614    Py_XDECREF(errorHandler);
6615    Py_XDECREF(exc);
6616    return NULL;
6617}
6618
6619/* Deprecated */
6620PyObject *
6621PyUnicode_EncodeASCII(const Py_UNICODE *p,
6622                      Py_ssize_t size,
6623                      const char *errors)
6624{
6625    PyObject *result;
6626    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6627    if (unicode == NULL)
6628        return NULL;
6629    result = unicode_encode_ucs1(unicode, errors, 128);
6630    Py_DECREF(unicode);
6631    return result;
6632}
6633
6634PyObject *
6635_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6636{
6637    if (!PyUnicode_Check(unicode)) {
6638        PyErr_BadArgument();
6639        return NULL;
6640    }
6641    if (PyUnicode_READY(unicode) == -1)
6642        return NULL;
6643    /* Fast path: if it is an ASCII-only string, construct bytes object
6644       directly. Else defer to above function to raise the exception. */
6645    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6646        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6647                                         PyUnicode_GET_LENGTH(unicode));
6648    return unicode_encode_ucs1(unicode, errors, 128);
6649}
6650
6651PyObject *
6652PyUnicode_AsASCIIString(PyObject *unicode)
6653{
6654    return _PyUnicode_AsASCIIString(unicode, NULL);
6655}
6656
6657#ifdef HAVE_MBCS
6658
6659/* --- MBCS codecs for Windows -------------------------------------------- */
6660
6661#if SIZEOF_INT < SIZEOF_SIZE_T
6662#define NEED_RETRY
6663#endif
6664
6665#ifndef WC_ERR_INVALID_CHARS
6666#  define WC_ERR_INVALID_CHARS 0x0080
6667#endif
6668
6669static char*
6670code_page_name(UINT code_page, PyObject **obj)
6671{
6672    *obj = NULL;
6673    if (code_page == CP_ACP)
6674        return "mbcs";
6675    if (code_page == CP_UTF7)
6676        return "CP_UTF7";
6677    if (code_page == CP_UTF8)
6678        return "CP_UTF8";
6679
6680    *obj = PyBytes_FromFormat("cp%u", code_page);
6681    if (*obj == NULL)
6682        return NULL;
6683    return PyBytes_AS_STRING(*obj);
6684}
6685
6686static int
6687is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6688{
6689    const char *curr = s + offset;
6690    const char *prev;
6691
6692    if (!IsDBCSLeadByteEx(code_page, *curr))
6693        return 0;
6694
6695    prev = CharPrevExA(code_page, s, curr, 0);
6696    if (prev == curr)
6697        return 1;
6698    /* FIXME: This code is limited to "true" double-byte encodings,
6699       as it assumes an incomplete character consists of a single
6700       byte. */
6701    if (curr - prev == 2)
6702        return 1;
6703    if (!IsDBCSLeadByteEx(code_page, *prev))
6704        return 1;
6705    return 0;
6706}
6707
6708static DWORD
6709decode_code_page_flags(UINT code_page)
6710{
6711    if (code_page == CP_UTF7) {
6712        /* The CP_UTF7 decoder only supports flags=0 */
6713        return 0;
6714    }
6715    else
6716        return MB_ERR_INVALID_CHARS;
6717}
6718
6719/*
6720 * Decode a byte string from a Windows code page into unicode object in strict
6721 * mode.
6722 *
6723 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6724 * WindowsError and returns -1 on other error.
6725 */
6726static int
6727decode_code_page_strict(UINT code_page,
6728                        PyObject **v,
6729                        const char *in,
6730                        int insize)
6731{
6732    const DWORD flags = decode_code_page_flags(code_page);
6733    wchar_t *out;
6734    DWORD outsize;
6735
6736    /* First get the size of the result */
6737    assert(insize > 0);
6738    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6739    if (outsize <= 0)
6740        goto error;
6741
6742    if (*v == NULL) {
6743        /* Create unicode object */
6744        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6745        *v = (PyObject*)_PyUnicode_New(outsize);
6746        if (*v == NULL)
6747            return -1;
6748        out = PyUnicode_AS_UNICODE(*v);
6749    }
6750    else {
6751        /* Extend unicode object */
6752        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6753        if (unicode_resize(v, n + outsize) < 0)
6754            return -1;
6755        out = PyUnicode_AS_UNICODE(*v) + n;
6756    }
6757
6758    /* Do the conversion */
6759    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6760    if (outsize <= 0)
6761        goto error;
6762    return insize;
6763
6764error:
6765    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6766        return -2;
6767    PyErr_SetFromWindowsErr(0);
6768    return -1;
6769}
6770
6771/*
6772 * Decode a byte string from a code page into unicode object with an error
6773 * handler.
6774 *
6775 * Returns consumed size if succeed, or raise a WindowsError or
6776 * UnicodeDecodeError exception and returns -1 on error.
6777 */
6778static int
6779decode_code_page_errors(UINT code_page,
6780                        PyObject **v,
6781                        const char *in, const int size,
6782                        const char *errors)
6783{
6784    const char *startin = in;
6785    const char *endin = in + size;
6786    const DWORD flags = decode_code_page_flags(code_page);
6787    /* Ideally, we should get reason from FormatMessage. This is the Windows
6788       2000 English version of the message. */
6789    const char *reason = "No mapping for the Unicode character exists "
6790                         "in the target code page.";
6791    /* each step cannot decode more than 1 character, but a character can be
6792       represented as a surrogate pair */
6793    wchar_t buffer[2], *startout, *out;
6794    int insize, outsize;
6795    PyObject *errorHandler = NULL;
6796    PyObject *exc = NULL;
6797    PyObject *encoding_obj = NULL;
6798    char *encoding;
6799    DWORD err;
6800    int ret = -1;
6801
6802    assert(size > 0);
6803
6804    encoding = code_page_name(code_page, &encoding_obj);
6805    if (encoding == NULL)
6806        return -1;
6807
6808    if (errors == NULL || strcmp(errors, "strict") == 0) {
6809        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6810           UnicodeDecodeError. */
6811        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6812        if (exc != NULL) {
6813            PyCodec_StrictErrors(exc);
6814            Py_CLEAR(exc);
6815        }
6816        goto error;
6817    }
6818
6819    if (*v == NULL) {
6820        /* Create unicode object */
6821        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6822            PyErr_NoMemory();
6823            goto error;
6824        }
6825        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6826        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6827        if (*v == NULL)
6828            goto error;
6829        startout = PyUnicode_AS_UNICODE(*v);
6830    }
6831    else {
6832        /* Extend unicode object */
6833        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6834        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6835            PyErr_NoMemory();
6836            goto error;
6837        }
6838        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6839            goto error;
6840        startout = PyUnicode_AS_UNICODE(*v) + n;
6841    }
6842
6843    /* Decode the byte string character per character */
6844    out = startout;
6845    while (in < endin)
6846    {
6847        /* Decode a character */
6848        insize = 1;
6849        do
6850        {
6851            outsize = MultiByteToWideChar(code_page, flags,
6852                                          in, insize,
6853                                          buffer, Py_ARRAY_LENGTH(buffer));
6854            if (outsize > 0)
6855                break;
6856            err = GetLastError();
6857            if (err != ERROR_NO_UNICODE_TRANSLATION
6858                && err != ERROR_INSUFFICIENT_BUFFER)
6859            {
6860                PyErr_SetFromWindowsErr(0);
6861                goto error;
6862            }
6863            insize++;
6864        }
6865        /* 4=maximum length of a UTF-8 sequence */
6866        while (insize <= 4 && (in + insize) <= endin);
6867
6868        if (outsize <= 0) {
6869            Py_ssize_t startinpos, endinpos, outpos;
6870
6871            startinpos = in - startin;
6872            endinpos = startinpos + 1;
6873            outpos = out - PyUnicode_AS_UNICODE(*v);
6874            if (unicode_decode_call_errorhandler(
6875                    errors, &errorHandler,
6876                    encoding, reason,
6877                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
6878                    v, &outpos))
6879            {
6880                goto error;
6881            }
6882            out = PyUnicode_AS_UNICODE(*v) + outpos;
6883        }
6884        else {
6885            in += insize;
6886            memcpy(out, buffer, outsize * sizeof(wchar_t));
6887            out += outsize;
6888        }
6889    }
6890
6891    /* write a NUL character at the end */
6892    *out = 0;
6893
6894    /* Extend unicode object */
6895    outsize = out - startout;
6896    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
6897    if (unicode_resize(v, outsize) < 0)
6898        goto error;
6899    ret = size;
6900
6901error:
6902    Py_XDECREF(encoding_obj);
6903    Py_XDECREF(errorHandler);
6904    Py_XDECREF(exc);
6905    return ret;
6906}
6907
6908static PyObject *
6909decode_code_page_stateful(int code_page,
6910                          const char *s, Py_ssize_t size,
6911                          const char *errors, Py_ssize_t *consumed)
6912{
6913    PyObject *v = NULL;
6914    int chunk_size, final, converted, done;
6915
6916    if (code_page < 0) {
6917        PyErr_SetString(PyExc_ValueError, "invalid code page number");
6918        return NULL;
6919    }
6920
6921    if (consumed)
6922        *consumed = 0;
6923
6924    do
6925    {
6926#ifdef NEED_RETRY
6927        if (size > INT_MAX) {
6928            chunk_size = INT_MAX;
6929            final = 0;
6930            done = 0;
6931        }
6932        else
6933#endif
6934        {
6935            chunk_size = (int)size;
6936            final = (consumed == NULL);
6937            done = 1;
6938        }
6939
6940        /* Skip trailing lead-byte unless 'final' is set */
6941        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6942            --chunk_size;
6943
6944        if (chunk_size == 0 && done) {
6945            if (v != NULL)
6946                break;
6947            Py_INCREF(unicode_empty);
6948            return unicode_empty;
6949        }
6950
6951
6952        converted = decode_code_page_strict(code_page, &v,
6953                                            s, chunk_size);
6954        if (converted == -2)
6955            converted = decode_code_page_errors(code_page, &v,
6956                                                s, chunk_size,
6957                                                errors);
6958        assert(converted != 0);
6959
6960        if (converted < 0) {
6961            Py_XDECREF(v);
6962            return NULL;
6963        }
6964
6965        if (consumed)
6966            *consumed += converted;
6967
6968        s += converted;
6969        size -= converted;
6970    } while (!done);
6971
6972    return unicode_result(v);
6973}
6974
6975PyObject *
6976PyUnicode_DecodeCodePageStateful(int code_page,
6977                                 const char *s,
6978                                 Py_ssize_t size,
6979                                 const char *errors,
6980                                 Py_ssize_t *consumed)
6981{
6982    return decode_code_page_stateful(code_page, s, size, errors, consumed);
6983}
6984
6985PyObject *
6986PyUnicode_DecodeMBCSStateful(const char *s,
6987                             Py_ssize_t size,
6988                             const char *errors,
6989                             Py_ssize_t *consumed)
6990{
6991    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6992}
6993
6994PyObject *
6995PyUnicode_DecodeMBCS(const char *s,
6996                     Py_ssize_t size,
6997                     const char *errors)
6998{
6999    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7000}
7001
7002static DWORD
7003encode_code_page_flags(UINT code_page, const char *errors)
7004{
7005    if (code_page == CP_UTF8) {
7006        if (winver.dwMajorVersion >= 6)
7007            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7008               and later */
7009            return WC_ERR_INVALID_CHARS;
7010        else
7011            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7012            return 0;
7013    }
7014    else if (code_page == CP_UTF7) {
7015        /* CP_UTF7 only supports flags=0 */
7016        return 0;
7017    }
7018    else {
7019        if (errors != NULL && strcmp(errors, "replace") == 0)
7020            return 0;
7021        else
7022            return WC_NO_BEST_FIT_CHARS;
7023    }
7024}
7025
7026/*
7027 * Encode a Unicode string to a Windows code page into a byte string in strict
7028 * mode.
7029 *
7030 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7031 * a WindowsError and returns -1 on other error.
7032 */
7033static int
7034encode_code_page_strict(UINT code_page, PyObject **outbytes,
7035                        PyObject *unicode, Py_ssize_t offset, int len,
7036                        const char* errors)
7037{
7038    BOOL usedDefaultChar = FALSE;
7039    BOOL *pusedDefaultChar = &usedDefaultChar;
7040    int outsize;
7041    PyObject *exc = NULL;
7042    wchar_t *p;
7043    Py_ssize_t size;
7044    const DWORD flags = encode_code_page_flags(code_page, NULL);
7045    char *out;
7046    /* Create a substring so that we can get the UTF-16 representation
7047       of just the slice under consideration. */
7048    PyObject *substring;
7049
7050    assert(len > 0);
7051
7052    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7053        pusedDefaultChar = &usedDefaultChar;
7054    else
7055        pusedDefaultChar = NULL;
7056
7057    substring = PyUnicode_Substring(unicode, offset, offset+len);
7058    if (substring == NULL)
7059        return -1;
7060    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7061    if (p == NULL) {
7062        Py_DECREF(substring);
7063        return -1;
7064    }
7065
7066    /* First get the size of the result */
7067    outsize = WideCharToMultiByte(code_page, flags,
7068                                  p, size,
7069                                  NULL, 0,
7070                                  NULL, pusedDefaultChar);
7071    if (outsize <= 0)
7072        goto error;
7073    /* If we used a default char, then we failed! */
7074    if (pusedDefaultChar && *pusedDefaultChar) {
7075        Py_DECREF(substring);
7076        return -2;
7077    }
7078
7079    if (*outbytes == NULL) {
7080        /* Create string object */
7081        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7082        if (*outbytes == NULL) {
7083            Py_DECREF(substring);
7084            return -1;
7085        }
7086        out = PyBytes_AS_STRING(*outbytes);
7087    }
7088    else {
7089        /* Extend string object */
7090        const Py_ssize_t n = PyBytes_Size(*outbytes);
7091        if (outsize > PY_SSIZE_T_MAX - n) {
7092            PyErr_NoMemory();
7093            Py_DECREF(substring);
7094            return -1;
7095        }
7096        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7097            Py_DECREF(substring);
7098            return -1;
7099        }
7100        out = PyBytes_AS_STRING(*outbytes) + n;
7101    }
7102
7103    /* Do the conversion */
7104    outsize = WideCharToMultiByte(code_page, flags,
7105                                  p, size,
7106                                  out, outsize,
7107                                  NULL, pusedDefaultChar);
7108    Py_CLEAR(substring);
7109    if (outsize <= 0)
7110        goto error;
7111    if (pusedDefaultChar && *pusedDefaultChar)
7112        return -2;
7113    return 0;
7114
7115error:
7116    Py_XDECREF(substring);
7117    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7118        return -2;
7119    PyErr_SetFromWindowsErr(0);
7120    return -1;
7121}
7122
7123/*
7124 * Encode a Unicode string to a Windows code page into a byte string using a
7125 * error handler.
7126 *
7127 * Returns consumed characters if succeed, or raise a WindowsError and returns
7128 * -1 on other error.
7129 */
7130static int
7131encode_code_page_errors(UINT code_page, PyObject **outbytes,
7132                        PyObject *unicode, Py_ssize_t unicode_offset,
7133                        Py_ssize_t insize, const char* errors)
7134{
7135    const DWORD flags = encode_code_page_flags(code_page, errors);
7136    Py_ssize_t pos = unicode_offset;
7137    Py_ssize_t endin = unicode_offset + insize;
7138    /* Ideally, we should get reason from FormatMessage. This is the Windows
7139       2000 English version of the message. */
7140    const char *reason = "invalid character";
7141    /* 4=maximum length of a UTF-8 sequence */
7142    char buffer[4];
7143    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7144    Py_ssize_t outsize;
7145    char *out;
7146    PyObject *errorHandler = NULL;
7147    PyObject *exc = NULL;
7148    PyObject *encoding_obj = NULL;
7149    char *encoding;
7150    Py_ssize_t newpos, newoutsize;
7151    PyObject *rep;
7152    int ret = -1;
7153
7154    assert(insize > 0);
7155
7156    encoding = code_page_name(code_page, &encoding_obj);
7157    if (encoding == NULL)
7158        return -1;
7159
7160    if (errors == NULL || strcmp(errors, "strict") == 0) {
7161        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7162           then we raise a UnicodeEncodeError. */
7163        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7164        if (exc != NULL) {
7165            PyCodec_StrictErrors(exc);
7166            Py_DECREF(exc);
7167        }
7168        Py_XDECREF(encoding_obj);
7169        return -1;
7170    }
7171
7172    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7173        pusedDefaultChar = &usedDefaultChar;
7174    else
7175        pusedDefaultChar = NULL;
7176
7177    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7178        PyErr_NoMemory();
7179        goto error;
7180    }
7181    outsize = insize * Py_ARRAY_LENGTH(buffer);
7182
7183    if (*outbytes == NULL) {
7184        /* Create string object */
7185        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7186        if (*outbytes == NULL)
7187            goto error;
7188        out = PyBytes_AS_STRING(*outbytes);
7189    }
7190    else {
7191        /* Extend string object */
7192        Py_ssize_t n = PyBytes_Size(*outbytes);
7193        if (n > PY_SSIZE_T_MAX - outsize) {
7194            PyErr_NoMemory();
7195            goto error;
7196        }
7197        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7198            goto error;
7199        out = PyBytes_AS_STRING(*outbytes) + n;
7200    }
7201
7202    /* Encode the string character per character */
7203    while (pos < endin)
7204    {
7205        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7206        wchar_t chars[2];
7207        int charsize;
7208        if (ch < 0x10000) {
7209            chars[0] = (wchar_t)ch;
7210            charsize = 1;
7211        }
7212        else {
7213            ch -= 0x10000;
7214            chars[0] = 0xd800 + (ch >> 10);
7215            chars[1] = 0xdc00 + (ch & 0x3ff);
7216            charsize = 2;
7217        }
7218
7219        outsize = WideCharToMultiByte(code_page, flags,
7220                                      chars, charsize,
7221                                      buffer, Py_ARRAY_LENGTH(buffer),
7222                                      NULL, pusedDefaultChar);
7223        if (outsize > 0) {
7224            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7225            {
7226                pos++;
7227                memcpy(out, buffer, outsize);
7228                out += outsize;
7229                continue;
7230            }
7231        }
7232        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7233            PyErr_SetFromWindowsErr(0);
7234            goto error;
7235        }
7236
7237        rep = unicode_encode_call_errorhandler(
7238                  errors, &errorHandler, encoding, reason,
7239                  unicode, &exc,
7240                  pos, pos + 1, &newpos);
7241        if (rep == NULL)
7242            goto error;
7243        pos = newpos;
7244
7245        if (PyBytes_Check(rep)) {
7246            outsize = PyBytes_GET_SIZE(rep);
7247            if (outsize != 1) {
7248                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7249                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7250                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7251                    Py_DECREF(rep);
7252                    goto error;
7253                }
7254                out = PyBytes_AS_STRING(*outbytes) + offset;
7255            }
7256            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7257            out += outsize;
7258        }
7259        else {
7260            Py_ssize_t i;
7261            enum PyUnicode_Kind kind;
7262            void *data;
7263
7264            if (PyUnicode_READY(rep) == -1) {
7265                Py_DECREF(rep);
7266                goto error;
7267            }
7268
7269            outsize = PyUnicode_GET_LENGTH(rep);
7270            if (outsize != 1) {
7271                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7272                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7273                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7274                    Py_DECREF(rep);
7275                    goto error;
7276                }
7277                out = PyBytes_AS_STRING(*outbytes) + offset;
7278            }
7279            kind = PyUnicode_KIND(rep);
7280            data = PyUnicode_DATA(rep);
7281            for (i=0; i < outsize; i++) {
7282                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7283                if (ch > 127) {
7284                    raise_encode_exception(&exc,
7285                        encoding, unicode,
7286                        pos, pos + 1,
7287                        "unable to encode error handler result to ASCII");
7288                    Py_DECREF(rep);
7289                    goto error;
7290                }
7291                *out = (unsigned char)ch;
7292                out++;
7293            }
7294        }
7295        Py_DECREF(rep);
7296    }
7297    /* write a NUL byte */
7298    *out = 0;
7299    outsize = out - PyBytes_AS_STRING(*outbytes);
7300    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7301    if (_PyBytes_Resize(outbytes, outsize) < 0)
7302        goto error;
7303    ret = 0;
7304
7305error:
7306    Py_XDECREF(encoding_obj);
7307    Py_XDECREF(errorHandler);
7308    Py_XDECREF(exc);
7309    return ret;
7310}
7311
7312static PyObject *
7313encode_code_page(int code_page,
7314                 PyObject *unicode,
7315                 const char *errors)
7316{
7317    Py_ssize_t len;
7318    PyObject *outbytes = NULL;
7319    Py_ssize_t offset;
7320    int chunk_len, ret, done;
7321
7322    if (PyUnicode_READY(unicode) == -1)
7323        return NULL;
7324    len = PyUnicode_GET_LENGTH(unicode);
7325
7326    if (code_page < 0) {
7327        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7328        return NULL;
7329    }
7330
7331    if (len == 0)
7332        return PyBytes_FromStringAndSize(NULL, 0);
7333
7334    offset = 0;
7335    do
7336    {
7337#ifdef NEED_RETRY
7338        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7339           chunks. */
7340        if (len > INT_MAX/2) {
7341            chunk_len = INT_MAX/2;
7342            done = 0;
7343        }
7344        else
7345#endif
7346        {
7347            chunk_len = (int)len;
7348            done = 1;
7349        }
7350
7351        ret = encode_code_page_strict(code_page, &outbytes,
7352                                      unicode, offset, chunk_len,
7353                                      errors);
7354        if (ret == -2)
7355            ret = encode_code_page_errors(code_page, &outbytes,
7356                                          unicode, offset,
7357                                          chunk_len, errors);
7358        if (ret < 0) {
7359            Py_XDECREF(outbytes);
7360            return NULL;
7361        }
7362
7363        offset += chunk_len;
7364        len -= chunk_len;
7365    } while (!done);
7366
7367    return outbytes;
7368}
7369
7370PyObject *
7371PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7372                     Py_ssize_t size,
7373                     const char *errors)
7374{
7375    PyObject *unicode, *res;
7376    unicode = PyUnicode_FromUnicode(p, size);
7377    if (unicode == NULL)
7378        return NULL;
7379    res = encode_code_page(CP_ACP, unicode, errors);
7380    Py_DECREF(unicode);
7381    return res;
7382}
7383
7384PyObject *
7385PyUnicode_EncodeCodePage(int code_page,
7386                         PyObject *unicode,
7387                         const char *errors)
7388{
7389    return encode_code_page(code_page, unicode, errors);
7390}
7391
7392PyObject *
7393PyUnicode_AsMBCSString(PyObject *unicode)
7394{
7395    if (!PyUnicode_Check(unicode)) {
7396        PyErr_BadArgument();
7397        return NULL;
7398    }
7399    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7400}
7401
7402#undef NEED_RETRY
7403
7404#endif /* HAVE_MBCS */
7405
7406/* --- Character Mapping Codec -------------------------------------------- */
7407
7408PyObject *
7409PyUnicode_DecodeCharmap(const char *s,
7410                        Py_ssize_t size,
7411                        PyObject *mapping,
7412                        const char *errors)
7413{
7414    const char *starts = s;
7415    Py_ssize_t startinpos;
7416    Py_ssize_t endinpos;
7417    Py_ssize_t outpos;
7418    const char *e;
7419    PyObject *v;
7420    Py_ssize_t extrachars = 0;
7421    PyObject *errorHandler = NULL;
7422    PyObject *exc = NULL;
7423
7424    /* Default to Latin-1 */
7425    if (mapping == NULL)
7426        return PyUnicode_DecodeLatin1(s, size, errors);
7427
7428    v = PyUnicode_New(size, 127);
7429    if (v == NULL)
7430        goto onError;
7431    if (size == 0)
7432        return v;
7433    outpos = 0;
7434    e = s + size;
7435    if (PyUnicode_CheckExact(mapping)) {
7436        Py_ssize_t maplen;
7437        enum PyUnicode_Kind mapkind;
7438        void *mapdata;
7439        Py_UCS4 x;
7440
7441        if (PyUnicode_READY(mapping) == -1)
7442            return NULL;
7443
7444        maplen = PyUnicode_GET_LENGTH(mapping);
7445        mapdata = PyUnicode_DATA(mapping);
7446        mapkind = PyUnicode_KIND(mapping);
7447        while (s < e) {
7448            unsigned char ch;
7449            if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7450                enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7451                if (outkind == PyUnicode_1BYTE_KIND) {
7452                    void *outdata = PyUnicode_DATA(v);
7453                    Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7454                    while (s < e) {
7455                        unsigned char ch = *s;
7456                        x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7457                        if (x > maxchar)
7458                            goto Error;
7459                        PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7460                        ++s;
7461                    }
7462                    break;
7463                }
7464                else if (outkind == PyUnicode_2BYTE_KIND) {
7465                    void *outdata = PyUnicode_DATA(v);
7466                    while (s < e) {
7467                        unsigned char ch = *s;
7468                        x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7469                        if (x == 0xFFFE)
7470                            goto Error;
7471                        PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7472                        ++s;
7473                    }
7474                    break;
7475                }
7476            }
7477            ch = *s;
7478
7479            if (ch < maplen)
7480                x = PyUnicode_READ(mapkind, mapdata, ch);
7481            else
7482                x = 0xfffe; /* invalid value */
7483Error:
7484            if (x == 0xfffe)
7485            {
7486                /* undefined mapping */
7487                startinpos = s-starts;
7488                endinpos = startinpos+1;
7489                if (unicode_decode_call_errorhandler(
7490                        errors, &errorHandler,
7491                        "charmap", "character maps to <undefined>",
7492                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7493                        &v, &outpos)) {
7494                    goto onError;
7495                }
7496                continue;
7497            }
7498
7499            if (unicode_putchar(&v, &outpos, x) < 0)
7500                goto onError;
7501            ++s;
7502        }
7503    }
7504    else {
7505        while (s < e) {
7506            unsigned char ch = *s;
7507            PyObject *w, *x;
7508
7509            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7510            w = PyLong_FromLong((long)ch);
7511            if (w == NULL)
7512                goto onError;
7513            x = PyObject_GetItem(mapping, w);
7514            Py_DECREF(w);
7515            if (x == NULL) {
7516                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7517                    /* No mapping found means: mapping is undefined. */
7518                    PyErr_Clear();
7519                    x = Py_None;
7520                    Py_INCREF(x);
7521                } else
7522                    goto onError;
7523            }
7524
7525            /* Apply mapping */
7526            if (PyLong_Check(x)) {
7527                long value = PyLong_AS_LONG(x);
7528                if (value < 0 || value > MAX_UNICODE) {
7529                    PyErr_Format(PyExc_TypeError,
7530                                 "character mapping must be in range(0x%lx)",
7531                                 (unsigned long)MAX_UNICODE + 1);
7532                    Py_DECREF(x);
7533                    goto onError;
7534                }
7535                if (unicode_putchar(&v, &outpos, value) < 0)
7536                    goto onError;
7537            }
7538            else if (x == Py_None) {
7539                /* undefined mapping */
7540                startinpos = s-starts;
7541                endinpos = startinpos+1;
7542                if (unicode_decode_call_errorhandler(
7543                        errors, &errorHandler,
7544                        "charmap", "character maps to <undefined>",
7545                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7546                        &v, &outpos)) {
7547                    Py_DECREF(x);
7548                    goto onError;
7549                }
7550                Py_DECREF(x);
7551                continue;
7552            }
7553            else if (PyUnicode_Check(x)) {
7554                Py_ssize_t targetsize;
7555
7556                if (PyUnicode_READY(x) == -1)
7557                    goto onError;
7558                targetsize = PyUnicode_GET_LENGTH(x);
7559
7560                if (targetsize == 1) {
7561                    /* 1-1 mapping */
7562                    if (unicode_putchar(&v, &outpos,
7563                                        PyUnicode_READ_CHAR(x, 0)) < 0)
7564                        goto onError;
7565                }
7566                else if (targetsize > 1) {
7567                    /* 1-n mapping */
7568                    if (targetsize > extrachars) {
7569                        /* resize first */
7570                        Py_ssize_t needed = (targetsize - extrachars) + \
7571                            (targetsize << 2);
7572                        extrachars += needed;
7573                        /* XXX overflow detection missing */
7574                        if (unicode_resize(&v,
7575                                           PyUnicode_GET_LENGTH(v) + needed) < 0)
7576                        {
7577                            Py_DECREF(x);
7578                            goto onError;
7579                        }
7580                    }
7581                    if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7582                        goto onError;
7583                    PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7584                    outpos += targetsize;
7585                    extrachars -= targetsize;
7586                }
7587                /* 1-0 mapping: skip the character */
7588            }
7589            else {
7590                /* wrong return value */
7591                PyErr_SetString(PyExc_TypeError,
7592                                "character mapping must return integer, None or str");
7593                Py_DECREF(x);
7594                goto onError;
7595            }
7596            Py_DECREF(x);
7597            ++s;
7598        }
7599    }
7600    if (unicode_resize(&v, outpos) < 0)
7601        goto onError;
7602    Py_XDECREF(errorHandler);
7603    Py_XDECREF(exc);
7604    return unicode_result(v);
7605
7606  onError:
7607    Py_XDECREF(errorHandler);
7608    Py_XDECREF(exc);
7609    Py_XDECREF(v);
7610    return NULL;
7611}
7612
7613/* Charmap encoding: the lookup table */
7614
/* Three-level trie used for charmap encoding of BMP characters
   (built by PyUnicode_BuildEncodingMap, read by encoding_map_lookup).
   A character c is split into c>>11 (level 1), (c>>7)&0xF (level 2)
   and c&0x7F (level 3). */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by c>>11; holds the number of the level-2 block, or 0xFF
       when no character in that range is mapped. */
    unsigned char level1[32];
    /* Number of 16-byte level-2 blocks and of 128-byte level-3 blocks
       stored in level23. */
    int count2, count3;
    /* Variable-length tail (the object is over-allocated): first
       16*count2 bytes of level-2 entries (0xFF = unused), followed by
       128*count3 bytes of level-3 entries holding the encoded byte
       (0 = unmapped). */
    unsigned char level23[1];
};
7621
7622static PyObject*
7623encoding_map_size(PyObject *obj, PyObject* args)
7624{
7625    struct encoding_map *map = (struct encoding_map*)obj;
7626    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7627                           128*map->count3);
7628}
7629
7630static PyMethodDef encoding_map_methods[] = {
7631    {"size", encoding_map_size, METH_NOARGS,
7632     PyDoc_STR("Return the size (in bytes) of this object") },
7633    { 0 }
7634};
7635
/* tp_dealloc of EncodingMapType: the object holds no references to other
   objects, so releasing its memory block is all that is needed.  The
   block is obtained with PyObject_MALLOC in PyUnicode_BuildEncodingMap. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7641
/* Type object of the charmap encoding trie (struct encoding_map).
   Instances are created by PyUnicode_BuildEncodingMap(), which allocates
   them by hand, so no tp_new/tp_init/tp_alloc is provided.  The only
   slots filled in are the deallocator and the method table (size()).
   tp_itemsize is 0 although instances carry a variable-length tail; the
   real size is reported by the size() method. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7685
7686PyObject*
7687PyUnicode_BuildEncodingMap(PyObject* string)
7688{
7689    PyObject *result;
7690    struct encoding_map *mresult;
7691    int i;
7692    int need_dict = 0;
7693    unsigned char level1[32];
7694    unsigned char level2[512];
7695    unsigned char *mlevel1, *mlevel2, *mlevel3;
7696    int count2 = 0, count3 = 0;
7697    int kind;
7698    void *data;
7699    Py_ssize_t length;
7700    Py_UCS4 ch;
7701
7702    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7703        PyErr_BadArgument();
7704        return NULL;
7705    }
7706    kind = PyUnicode_KIND(string);
7707    data = PyUnicode_DATA(string);
7708    length = PyUnicode_GET_LENGTH(string);
7709    length = Py_MIN(length, 256);
7710    memset(level1, 0xFF, sizeof level1);
7711    memset(level2, 0xFF, sizeof level2);
7712
7713    /* If there isn't a one-to-one mapping of NULL to \0,
7714       or if there are non-BMP characters, we need to use
7715       a mapping dictionary. */
7716    if (PyUnicode_READ(kind, data, 0) != 0)
7717        need_dict = 1;
7718    for (i = 1; i < length; i++) {
7719        int l1, l2;
7720        ch = PyUnicode_READ(kind, data, i);
7721        if (ch == 0 || ch > 0xFFFF) {
7722            need_dict = 1;
7723            break;
7724        }
7725        if (ch == 0xFFFE)
7726            /* unmapped character */
7727            continue;
7728        l1 = ch >> 11;
7729        l2 = ch >> 7;
7730        if (level1[l1] == 0xFF)
7731            level1[l1] = count2++;
7732        if (level2[l2] == 0xFF)
7733            level2[l2] = count3++;
7734    }
7735
7736    if (count2 >= 0xFF || count3 >= 0xFF)
7737        need_dict = 1;
7738
7739    if (need_dict) {
7740        PyObject *result = PyDict_New();
7741        PyObject *key, *value;
7742        if (!result)
7743            return NULL;
7744        for (i = 0; i < length; i++) {
7745            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7746            value = PyLong_FromLong(i);
7747            if (!key || !value)
7748                goto failed1;
7749            if (PyDict_SetItem(result, key, value) == -1)
7750                goto failed1;
7751            Py_DECREF(key);
7752            Py_DECREF(value);
7753        }
7754        return result;
7755      failed1:
7756        Py_XDECREF(key);
7757        Py_XDECREF(value);
7758        Py_DECREF(result);
7759        return NULL;
7760    }
7761
7762    /* Create a three-level trie */
7763    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7764                             16*count2 + 128*count3 - 1);
7765    if (!result)
7766        return PyErr_NoMemory();
7767    PyObject_Init(result, &EncodingMapType);
7768    mresult = (struct encoding_map*)result;
7769    mresult->count2 = count2;
7770    mresult->count3 = count3;
7771    mlevel1 = mresult->level1;
7772    mlevel2 = mresult->level23;
7773    mlevel3 = mresult->level23 + 16*count2;
7774    memcpy(mlevel1, level1, 32);
7775    memset(mlevel2, 0xFF, 16*count2);
7776    memset(mlevel3, 0, 128*count3);
7777    count3 = 0;
7778    for (i = 1; i < length; i++) {
7779        int o1, o2, o3, i2, i3;
7780        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7781        if (ch == 0xFFFE)
7782            /* unmapped character */
7783            continue;
7784        o1 = ch>>11;
7785        o2 = (ch>>7) & 0xF;
7786        i2 = 16*mlevel1[o1] + o2;
7787        if (mlevel2[i2] == 0xFF)
7788            mlevel2[i2] = count3++;
7789        o3 = ch & 0x7F;
7790        i3 = 128*mlevel2[i2] + o3;
7791        mlevel3[i3] = i;
7792    }
7793    return result;
7794}
7795
7796static int
7797encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7798{
7799    struct encoding_map *map = (struct encoding_map*)mapping;
7800    int l1 = c>>11;
7801    int l2 = (c>>7) & 0xF;
7802    int l3 = c & 0x7F;
7803    int i;
7804
7805    if (c > 0xFFFF)
7806        return -1;
7807    if (c == 0)
7808        return 0;
7809    /* level 1*/
7810    i = map->level1[l1];
7811    if (i == 0xFF) {
7812        return -1;
7813    }
7814    /* level 2*/
7815    i = map->level23[16*i+l2];
7816    if (i == 0xFF) {
7817        return -1;
7818    }
7819    /* level 3 */
7820    i = map->level23[16*map->count2 + 128*i + l3];
7821    if (i == 0) {
7822        return -1;
7823    }
7824    return i;
7825}
7826
7827/* Lookup the character ch in the mapping. If the character
7828   can't be found, Py_None is returned (or NULL, if another
7829   error occurred). */
7830static PyObject *
7831charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7832{
7833    PyObject *w = PyLong_FromLong((long)c);
7834    PyObject *x;
7835
7836    if (w == NULL)
7837        return NULL;
7838    x = PyObject_GetItem(mapping, w);
7839    Py_DECREF(w);
7840    if (x == NULL) {
7841        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7842            /* No mapping found means: mapping is undefined. */
7843            PyErr_Clear();
7844            x = Py_None;
7845            Py_INCREF(x);
7846            return x;
7847        } else
7848            return NULL;
7849    }
7850    else if (x == Py_None)
7851        return x;
7852    else if (PyLong_Check(x)) {
7853        long value = PyLong_AS_LONG(x);
7854        if (value < 0 || value > 255) {
7855            PyErr_SetString(PyExc_TypeError,
7856                            "character mapping must be in range(256)");
7857            Py_DECREF(x);
7858            return NULL;
7859        }
7860        return x;
7861    }
7862    else if (PyBytes_Check(x))
7863        return x;
7864    else {
7865        /* wrong return value */
7866        PyErr_Format(PyExc_TypeError,
7867                     "character mapping must return integer, bytes or None, not %.400s",
7868                     x->ob_type->tp_name);
7869        Py_DECREF(x);
7870        return NULL;
7871    }
7872}
7873
7874static int
7875charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7876{
7877    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7878    /* exponentially overallocate to minimize reallocations */
7879    if (requiredsize < 2*outsize)
7880        requiredsize = 2*outsize;
7881    if (_PyBytes_Resize(outobj, requiredsize))
7882        return -1;
7883    return 0;
7884}
7885
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred and an exception is set */
} charmapencode_result;
7889/* lookup the character, put the result in the output string and adjust
7890   various state variables. Resize the output bytes object if not enough
7891   space is available. Return a new reference to the object that
7892   was put in the output buffer, or Py_None, if the mapping was undefined
7893   (in which case no character was written) or NULL, if a
7894   reallocation error occurred. The caller must decref the result */
7895static charmapencode_result
7896charmapencode_output(Py_UCS4 c, PyObject *mapping,
7897                     PyObject **outobj, Py_ssize_t *outpos)
7898{
7899    PyObject *rep;
7900    char *outstart;
7901    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7902
7903    if (Py_TYPE(mapping) == &EncodingMapType) {
7904        int res = encoding_map_lookup(c, mapping);
7905        Py_ssize_t requiredsize = *outpos+1;
7906        if (res == -1)
7907            return enc_FAILED;
7908        if (outsize<requiredsize)
7909            if (charmapencode_resize(outobj, outpos, requiredsize))
7910                return enc_EXCEPTION;
7911        outstart = PyBytes_AS_STRING(*outobj);
7912        outstart[(*outpos)++] = (char)res;
7913        return enc_SUCCESS;
7914    }
7915
7916    rep = charmapencode_lookup(c, mapping);
7917    if (rep==NULL)
7918        return enc_EXCEPTION;
7919    else if (rep==Py_None) {
7920        Py_DECREF(rep);
7921        return enc_FAILED;
7922    } else {
7923        if (PyLong_Check(rep)) {
7924            Py_ssize_t requiredsize = *outpos+1;
7925            if (outsize<requiredsize)
7926                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7927                    Py_DECREF(rep);
7928                    return enc_EXCEPTION;
7929                }
7930            outstart = PyBytes_AS_STRING(*outobj);
7931            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7932        }
7933        else {
7934            const char *repchars = PyBytes_AS_STRING(rep);
7935            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7936            Py_ssize_t requiredsize = *outpos+repsize;
7937            if (outsize<requiredsize)
7938                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7939                    Py_DECREF(rep);
7940                    return enc_EXCEPTION;
7941                }
7942            outstart = PyBytes_AS_STRING(*outobj);
7943            memcpy(outstart + *outpos, repchars, repsize);
7944            *outpos += repsize;
7945        }
7946    }
7947    Py_DECREF(rep);
7948    return enc_SUCCESS;
7949}
7950
/* handle an error in PyUnicode_EncodeCharmap
   Return 0 on success, -1 on error

   Called when the character at *inpos cannot be encoded with `mapping`.
   The run of consecutive unencodable characters starting there is
   collected and handled as one unit according to `errors`:
     strict             raise UnicodeEncodeError
     replace            emit '?' (encoded through the mapping) per character
     ignore             emit nothing
     xmlcharrefreplace  emit "&#N;" (encoded through the mapping)
     anything else      call the registered error handler
   On success *inpos is moved past the handled characters (or to the
   position requested by the error handler), and *res / *respos reflect
   the bytes written.

   *known_errorHandler caches the result of comparing `errors` against
   the built-in handler names (-1 = not looked at yet); *errorHandler and
   *exceptionObject are owned by the caller and reused across calls. */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            /* trie mapping: no Python object is created for the lookup */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            /* first encodable character: the run ends here */
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*known_errorHandler==-1) {
        if ((errors==NULL) || (!strcmp(errors, "strict")))
            *known_errorHandler = 1;
        else if (!strcmp(errors, "replace"))
            *known_errorHandler = 2;
        else if (!strcmp(errors, "ignore"))
            *known_errorHandler = 3;
        else if (!strcmp(errors, "xmlcharrefreplace"))
            *known_errorHandler = 4;
        else
            *known_errorHandler = 0;
    }
    switch (*known_errorHandler) {
    case 1: /* strict */
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;
    case 2: /* replace */
        /* one '?' per unencodable character; if '?' itself cannot be
           encoded the original error is raised */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case 3: /* ignore */
        *inpos = collendpos;
        break;
    case 4: /* xmlcharrefreplace */
        /* generate the "&#N;" replacement for each character and encode
           it through the mapping, one ASCII character at a time */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;
    default:
        /* user-registered handler: returns a new reference to the
           replacement (bytes or str) and the position to resume at */
        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: a str result is itself encoded through
           the mapping, character by character */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8104
8105PyObject *
8106_PyUnicode_EncodeCharmap(PyObject *unicode,
8107                         PyObject *mapping,
8108                         const char *errors)
8109{
8110    /* output object */
8111    PyObject *res = NULL;
8112    /* current input position */
8113    Py_ssize_t inpos = 0;
8114    Py_ssize_t size;
8115    /* current output position */
8116    Py_ssize_t respos = 0;
8117    PyObject *errorHandler = NULL;
8118    PyObject *exc = NULL;
8119    /* the following variable is used for caching string comparisons
8120     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8121     * 3=ignore, 4=xmlcharrefreplace */
8122    int known_errorHandler = -1;
8123
8124    if (PyUnicode_READY(unicode) == -1)
8125        return NULL;
8126    size = PyUnicode_GET_LENGTH(unicode);
8127
8128    /* Default to Latin-1 */
8129    if (mapping == NULL)
8130        return unicode_encode_ucs1(unicode, errors, 256);
8131
8132    /* allocate enough for a simple encoding without
8133       replacements, if we need more, we'll resize */
8134    res = PyBytes_FromStringAndSize(NULL, size);
8135    if (res == NULL)
8136        goto onError;
8137    if (size == 0)
8138        return res;
8139
8140    while (inpos<size) {
8141        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
8142        /* try to encode it */
8143        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8144        if (x==enc_EXCEPTION) /* error */
8145            goto onError;
8146        if (x==enc_FAILED) { /* unencodable character */
8147            if (charmap_encoding_error(unicode, &inpos, mapping,
8148                                       &exc,
8149                                       &known_errorHandler, &errorHandler, errors,
8150                                       &res, &respos)) {
8151                goto onError;
8152            }
8153        }
8154        else
8155            /* done with this character => adjust input position */
8156            ++inpos;
8157    }
8158
8159    /* Resize if we allocated to much */
8160    if (respos<PyBytes_GET_SIZE(res))
8161        if (_PyBytes_Resize(&res, respos) < 0)
8162            goto onError;
8163
8164    Py_XDECREF(exc);
8165    Py_XDECREF(errorHandler);
8166    return res;
8167
8168  onError:
8169    Py_XDECREF(res);
8170    Py_XDECREF(exc);
8171    Py_XDECREF(errorHandler);
8172    return NULL;
8173}
8174
8175/* Deprecated */
8176PyObject *
8177PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8178                        Py_ssize_t size,
8179                        PyObject *mapping,
8180                        const char *errors)
8181{
8182    PyObject *result;
8183    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8184    if (unicode == NULL)
8185        return NULL;
8186    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8187    Py_DECREF(unicode);
8188    return result;
8189}
8190
8191PyObject *
8192PyUnicode_AsCharmapString(PyObject *unicode,
8193                          PyObject *mapping)
8194{
8195    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8196        PyErr_BadArgument();
8197        return NULL;
8198    }
8199    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8200}
8201
8202/* create or adjust a UnicodeTranslateError */
8203static void
8204make_translate_exception(PyObject **exceptionObject,
8205                         PyObject *unicode,
8206                         Py_ssize_t startpos, Py_ssize_t endpos,
8207                         const char *reason)
8208{
8209    if (*exceptionObject == NULL) {
8210        *exceptionObject = _PyUnicodeTranslateError_Create(
8211            unicode, startpos, endpos, reason);
8212    }
8213    else {
8214        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8215            goto onError;
8216        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8217            goto onError;
8218        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8219            goto onError;
8220        return;
8221      onError:
8222        Py_DECREF(*exceptionObject);
8223        *exceptionObject = NULL;
8224    }
8225}
8226
8227/* raises a UnicodeTranslateError */
8228static void
8229raise_translate_exception(PyObject **exceptionObject,
8230                          PyObject *unicode,
8231                          Py_ssize_t startpos, Py_ssize_t endpos,
8232                          const char *reason)
8233{
8234    make_translate_exception(exceptionObject,
8235                             unicode, startpos, endpos, reason);
8236    if (*exceptionObject != NULL)
8237        PyCodec_StrictErrors(*exceptionObject);
8238}
8239
8240/* error handling callback helper:
8241   build arguments, call the callback and check the arguments,
8242   put the result into newpos and return the replacement string, which
8243   has to be freed by the caller */
8244static PyObject *
8245unicode_translate_call_errorhandler(const char *errors,
8246                                    PyObject **errorHandler,
8247                                    const char *reason,
8248                                    PyObject *unicode, PyObject **exceptionObject,
8249                                    Py_ssize_t startpos, Py_ssize_t endpos,
8250                                    Py_ssize_t *newpos)
8251{
8252    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8253
8254    Py_ssize_t i_newpos;
8255    PyObject *restuple;
8256    PyObject *resunicode;
8257
8258    if (*errorHandler == NULL) {
8259        *errorHandler = PyCodec_LookupError(errors);
8260        if (*errorHandler == NULL)
8261            return NULL;
8262    }
8263
8264    make_translate_exception(exceptionObject,
8265                             unicode, startpos, endpos, reason);
8266    if (*exceptionObject == NULL)
8267        return NULL;
8268
8269    restuple = PyObject_CallFunctionObjArgs(
8270        *errorHandler, *exceptionObject, NULL);
8271    if (restuple == NULL)
8272        return NULL;
8273    if (!PyTuple_Check(restuple)) {
8274        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8275        Py_DECREF(restuple);
8276        return NULL;
8277    }
8278    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8279                          &resunicode, &i_newpos)) {
8280        Py_DECREF(restuple);
8281        return NULL;
8282    }
8283    if (i_newpos<0)
8284        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8285    else
8286        *newpos = i_newpos;
8287    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8288        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8289        Py_DECREF(restuple);
8290        return NULL;
8291    }
8292    Py_INCREF(resunicode);
8293    Py_DECREF(restuple);
8294    return resunicode;
8295}
8296
8297/* Lookup the character ch in the mapping and put the result in result,
8298   which must be decrefed by the caller.
8299   Return 0 on success, -1 on error */
8300static int
8301charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8302{
8303    PyObject *w = PyLong_FromLong((long)c);
8304    PyObject *x;
8305
8306    if (w == NULL)
8307        return -1;
8308    x = PyObject_GetItem(mapping, w);
8309    Py_DECREF(w);
8310    if (x == NULL) {
8311        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8312            /* No mapping found means: use 1:1 mapping. */
8313            PyErr_Clear();
8314            *result = NULL;
8315            return 0;
8316        } else
8317            return -1;
8318    }
8319    else if (x == Py_None) {
8320        *result = x;
8321        return 0;
8322    }
8323    else if (PyLong_Check(x)) {
8324        long value = PyLong_AS_LONG(x);
8325        long max = PyUnicode_GetMax();
8326        if (value < 0 || value > max) {
8327            PyErr_Format(PyExc_TypeError,
8328                         "character mapping must be in range(0x%x)", max+1);
8329            Py_DECREF(x);
8330            return -1;
8331        }
8332        *result = x;
8333        return 0;
8334    }
8335    else if (PyUnicode_Check(x)) {
8336        *result = x;
8337        return 0;
8338    }
8339    else {
8340        /* wrong return value */
8341        PyErr_SetString(PyExc_TypeError,
8342                        "character mapping must return integer, None or str");
8343        Py_DECREF(x);
8344        return -1;
8345    }
8346}
8347/* ensure that *outobj is at least requiredsize characters long,
8348   if not reallocate and adjust various state variables.
8349   Return 0 on success, -1 on error */
8350static int
8351charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8352                               Py_ssize_t requiredsize)
8353{
8354    Py_ssize_t oldsize = *psize;
8355    Py_UCS4 *new_outobj;
8356    if (requiredsize > oldsize) {
8357        /* exponentially overallocate to minimize reallocations */
8358        if (requiredsize < 2 * oldsize)
8359            requiredsize = 2 * oldsize;
8360        new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8361        if (new_outobj == 0)
8362            return -1;
8363        *outobj = new_outobj;
8364        *psize = requiredsize;
8365    }
8366    return 0;
8367}
8368/* lookup the character, put the result in the output string and adjust
8369   various state variables. Return a new reference to the object that
8370   was put in the output buffer in *result, or Py_None, if the mapping was
8371   undefined (in which case no character was written).
8372   The called must decref result.
8373   Return 0 on success, -1 on error. */
8374static int
8375charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8376                        PyObject *mapping, Py_UCS4 **output,
8377                        Py_ssize_t *osize, Py_ssize_t *opos,
8378                        PyObject **res)
8379{
8380    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8381    if (charmaptranslate_lookup(curinp, mapping, res))
8382        return -1;
8383    if (*res==NULL) {
8384        /* not found => default to 1:1 mapping */
8385        (*output)[(*opos)++] = curinp;
8386    }
8387    else if (*res==Py_None)
8388        ;
8389    else if (PyLong_Check(*res)) {
8390        /* no overflow check, because we know that the space is enough */
8391        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8392    }
8393    else if (PyUnicode_Check(*res)) {
8394        Py_ssize_t repsize;
8395        if (PyUnicode_READY(*res) == -1)
8396            return -1;
8397        repsize = PyUnicode_GET_LENGTH(*res);
8398        if (repsize==1) {
8399            /* no overflow check, because we know that the space is enough */
8400            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8401        }
8402        else if (repsize!=0) {
8403            /* more than one character */
8404            Py_ssize_t requiredsize = *opos +
8405                (PyUnicode_GET_LENGTH(input) - ipos) +
8406                repsize - 1;
8407            Py_ssize_t i;
8408            if (charmaptranslate_makespace(output, osize, requiredsize))
8409                return -1;
8410            for(i = 0; i < repsize; i++)
8411                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8412        }
8413    }
8414    else
8415        return -1;
8416    return 0;
8417}
8418
8419PyObject *
8420_PyUnicode_TranslateCharmap(PyObject *input,
8421                            PyObject *mapping,
8422                            const char *errors)
8423{
8424    /* input object */
8425    char *idata;
8426    Py_ssize_t size, i;
8427    int kind;
8428    /* output buffer */
8429    Py_UCS4 *output = NULL;
8430    Py_ssize_t osize;
8431    PyObject *res;
8432    /* current output position */
8433    Py_ssize_t opos;
8434    char *reason = "character maps to <undefined>";
8435    PyObject *errorHandler = NULL;
8436    PyObject *exc = NULL;
8437    /* the following variable is used for caching string comparisons
8438     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8439     * 3=ignore, 4=xmlcharrefreplace */
8440    int known_errorHandler = -1;
8441
8442    if (mapping == NULL) {
8443        PyErr_BadArgument();
8444        return NULL;
8445    }
8446
8447    if (PyUnicode_READY(input) == -1)
8448        return NULL;
8449    idata = (char*)PyUnicode_DATA(input);
8450    kind = PyUnicode_KIND(input);
8451    size = PyUnicode_GET_LENGTH(input);
8452    i = 0;
8453
8454    if (size == 0) {
8455        Py_INCREF(input);
8456        return input;
8457    }
8458
8459    /* allocate enough for a simple 1:1 translation without
8460       replacements, if we need more, we'll resize */
8461    osize = size;
8462    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8463    opos = 0;
8464    if (output == NULL) {
8465        PyErr_NoMemory();
8466        goto onError;
8467    }
8468
8469    while (i<size) {
8470        /* try to encode it */
8471        PyObject *x = NULL;
8472        if (charmaptranslate_output(input, i, mapping,
8473                                    &output, &osize, &opos, &x)) {
8474            Py_XDECREF(x);
8475            goto onError;
8476        }
8477        Py_XDECREF(x);
8478        if (x!=Py_None) /* it worked => adjust input pointer */
8479            ++i;
8480        else { /* untranslatable character */
8481            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8482            Py_ssize_t repsize;
8483            Py_ssize_t newpos;
8484            Py_ssize_t uni2;
8485            /* startpos for collecting untranslatable chars */
8486            Py_ssize_t collstart = i;
8487            Py_ssize_t collend = i+1;
8488            Py_ssize_t coll;
8489
8490            /* find all untranslatable characters */
8491            while (collend < size) {
8492                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8493                    goto onError;
8494                Py_XDECREF(x);
8495                if (x!=Py_None)
8496                    break;
8497                ++collend;
8498            }
8499            /* cache callback name lookup
8500             * (if not done yet, i.e. it's the first error) */
8501            if (known_errorHandler==-1) {
8502                if ((errors==NULL) || (!strcmp(errors, "strict")))
8503                    known_errorHandler = 1;
8504                else if (!strcmp(errors, "replace"))
8505                    known_errorHandler = 2;
8506                else if (!strcmp(errors, "ignore"))
8507                    known_errorHandler = 3;
8508                else if (!strcmp(errors, "xmlcharrefreplace"))
8509                    known_errorHandler = 4;
8510                else
8511                    known_errorHandler = 0;
8512            }
8513            switch (known_errorHandler) {
8514            case 1: /* strict */
8515                raise_translate_exception(&exc, input, collstart,
8516                                          collend, reason);
8517                goto onError;
8518            case 2: /* replace */
8519                /* No need to check for space, this is a 1:1 replacement */
8520                for (coll = collstart; coll<collend; coll++)
8521                    output[opos++] = '?';
8522                /* fall through */
8523            case 3: /* ignore */
8524                i = collend;
8525                break;
8526            case 4: /* xmlcharrefreplace */
8527                /* generate replacement (temporarily (mis)uses i) */
8528                for (i = collstart; i < collend; ++i) {
8529                    char buffer[2+29+1+1];
8530                    char *cp;
8531                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8532                    if (charmaptranslate_makespace(&output, &osize,
8533                                                   opos+strlen(buffer)+(size-collend)))
8534                        goto onError;
8535                    for (cp = buffer; *cp; ++cp)
8536                        output[opos++] = *cp;
8537                }
8538                i = collend;
8539                break;
8540            default:
8541                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8542                                                                 reason, input, &exc,
8543                                                                 collstart, collend, &newpos);
8544                if (repunicode == NULL)
8545                    goto onError;
8546                if (PyUnicode_READY(repunicode) == -1) {
8547                    Py_DECREF(repunicode);
8548                    goto onError;
8549                }
8550                /* generate replacement  */
8551                repsize = PyUnicode_GET_LENGTH(repunicode);
8552                if (charmaptranslate_makespace(&output, &osize,
8553                                               opos+repsize+(size-collend))) {
8554                    Py_DECREF(repunicode);
8555                    goto onError;
8556                }
8557                for (uni2 = 0; repsize-->0; ++uni2)
8558                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8559                i = newpos;
8560                Py_DECREF(repunicode);
8561            }
8562        }
8563    }
8564    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8565    if (!res)
8566        goto onError;
8567    PyMem_Free(output);
8568    Py_XDECREF(exc);
8569    Py_XDECREF(errorHandler);
8570    return res;
8571
8572  onError:
8573    PyMem_Free(output);
8574    Py_XDECREF(exc);
8575    Py_XDECREF(errorHandler);
8576    return NULL;
8577}
8578
8579/* Deprecated. Use PyUnicode_Translate instead. */
8580PyObject *
8581PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8582                           Py_ssize_t size,
8583                           PyObject *mapping,
8584                           const char *errors)
8585{
8586    PyObject *result;
8587    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8588    if (!unicode)
8589        return NULL;
8590    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8591    Py_DECREF(unicode);
8592    return result;
8593}
8594
8595PyObject *
8596PyUnicode_Translate(PyObject *str,
8597                    PyObject *mapping,
8598                    const char *errors)
8599{
8600    PyObject *result;
8601
8602    str = PyUnicode_FromObject(str);
8603    if (str == NULL)
8604        return NULL;
8605    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8606    Py_DECREF(str);
8607    return result;
8608}
8609
8610static Py_UCS4
8611fix_decimal_and_space_to_ascii(PyObject *self)
8612{
8613    /* No need to call PyUnicode_READY(self) because this function is only
8614       called as a callback from fixup() which does it already. */
8615    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8616    const int kind = PyUnicode_KIND(self);
8617    void *data = PyUnicode_DATA(self);
8618    Py_UCS4 maxchar = 127, ch, fixed;
8619    int modified = 0;
8620    Py_ssize_t i;
8621
8622    for (i = 0; i < len; ++i) {
8623        ch = PyUnicode_READ(kind, data, i);
8624        fixed = 0;
8625        if (ch > 127) {
8626            if (Py_UNICODE_ISSPACE(ch))
8627                fixed = ' ';
8628            else {
8629                const int decimal = Py_UNICODE_TODECIMAL(ch);
8630                if (decimal >= 0)
8631                    fixed = '0' + decimal;
8632            }
8633            if (fixed != 0) {
8634                modified = 1;
8635                maxchar = MAX_MAXCHAR(maxchar, fixed);
8636                PyUnicode_WRITE(kind, data, i, fixed);
8637            }
8638            else
8639                maxchar = MAX_MAXCHAR(maxchar, ch);
8640        }
8641    }
8642
8643    return (modified) ? maxchar : 0;
8644}
8645
8646PyObject *
8647_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8648{
8649    if (!PyUnicode_Check(unicode)) {
8650        PyErr_BadInternalCall();
8651        return NULL;
8652    }
8653    if (PyUnicode_READY(unicode) == -1)
8654        return NULL;
8655    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8656        /* If the string is already ASCII, just return the same string */
8657        Py_INCREF(unicode);
8658        return unicode;
8659    }
8660    return fixup(unicode, fix_decimal_and_space_to_ascii);
8661}
8662
8663PyObject *
8664PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8665                                  Py_ssize_t length)
8666{
8667    PyObject *decimal;
8668    Py_ssize_t i;
8669    Py_UCS4 maxchar;
8670    enum PyUnicode_Kind kind;
8671    void *data;
8672
8673    maxchar = 127;
8674    for (i = 0; i < length; i++) {
8675        Py_UNICODE ch = s[i];
8676        if (ch > 127) {
8677            int decimal = Py_UNICODE_TODECIMAL(ch);
8678            if (decimal >= 0)
8679                ch = '0' + decimal;
8680            maxchar = MAX_MAXCHAR(maxchar, ch);
8681        }
8682    }
8683
8684    /* Copy to a new string */
8685    decimal = PyUnicode_New(length, maxchar);
8686    if (decimal == NULL)
8687        return decimal;
8688    kind = PyUnicode_KIND(decimal);
8689    data = PyUnicode_DATA(decimal);
8690    /* Iterate over code points */
8691    for (i = 0; i < length; i++) {
8692        Py_UNICODE ch = s[i];
8693        if (ch > 127) {
8694            int decimal = Py_UNICODE_TODECIMAL(ch);
8695            if (decimal >= 0)
8696                ch = '0' + decimal;
8697        }
8698        PyUnicode_WRITE(kind, data, i, ch);
8699    }
8700    return unicode_result(decimal);
8701}
8702/* --- Decimal Encoder ---------------------------------------------------- */
8703
8704int
8705PyUnicode_EncodeDecimal(Py_UNICODE *s,
8706                        Py_ssize_t length,
8707                        char *output,
8708                        const char *errors)
8709{
8710    PyObject *unicode;
8711    Py_ssize_t i;
8712    enum PyUnicode_Kind kind;
8713    void *data;
8714
8715    if (output == NULL) {
8716        PyErr_BadArgument();
8717        return -1;
8718    }
8719
8720    unicode = PyUnicode_FromUnicode(s, length);
8721    if (unicode == NULL)
8722        return -1;
8723
8724    if (PyUnicode_READY(unicode) == -1) {
8725        Py_DECREF(unicode);
8726        return -1;
8727    }
8728    kind = PyUnicode_KIND(unicode);
8729    data = PyUnicode_DATA(unicode);
8730
8731    for (i=0; i < length; ) {
8732        PyObject *exc;
8733        Py_UCS4 ch;
8734        int decimal;
8735        Py_ssize_t startpos;
8736
8737        ch = PyUnicode_READ(kind, data, i);
8738
8739        if (Py_UNICODE_ISSPACE(ch)) {
8740            *output++ = ' ';
8741            i++;
8742            continue;
8743        }
8744        decimal = Py_UNICODE_TODECIMAL(ch);
8745        if (decimal >= 0) {
8746            *output++ = '0' + decimal;
8747            i++;
8748            continue;
8749        }
8750        if (0 < ch && ch < 256) {
8751            *output++ = (char)ch;
8752            i++;
8753            continue;
8754        }
8755
8756        startpos = i;
8757        exc = NULL;
8758        raise_encode_exception(&exc, "decimal", unicode,
8759                               startpos, startpos+1,
8760                               "invalid decimal Unicode string");
8761        Py_XDECREF(exc);
8762        Py_DECREF(unicode);
8763        return -1;
8764    }
8765    /* 0-terminate the output string */
8766    *output++ = '\0';
8767    Py_DECREF(unicode);
8768    return 0;
8769}
8770
8771/* --- Helpers ------------------------------------------------------------ */
8772
8773static Py_ssize_t
8774any_find_slice(int direction, PyObject* s1, PyObject* s2,
8775               Py_ssize_t start,
8776               Py_ssize_t end)
8777{
8778    int kind1, kind2, kind;
8779    void *buf1, *buf2;
8780    Py_ssize_t len1, len2, result;
8781
8782    kind1 = PyUnicode_KIND(s1);
8783    kind2 = PyUnicode_KIND(s2);
8784    kind = kind1 > kind2 ? kind1 : kind2;
8785    buf1 = PyUnicode_DATA(s1);
8786    buf2 = PyUnicode_DATA(s2);
8787    if (kind1 != kind)
8788        buf1 = _PyUnicode_AsKind(s1, kind);
8789    if (!buf1)
8790        return -2;
8791    if (kind2 != kind)
8792        buf2 = _PyUnicode_AsKind(s2, kind);
8793    if (!buf2) {
8794        if (kind1 != kind) PyMem_Free(buf1);
8795        return -2;
8796    }
8797    len1 = PyUnicode_GET_LENGTH(s1);
8798    len2 = PyUnicode_GET_LENGTH(s2);
8799
8800    if (direction > 0) {
8801        switch (kind) {
8802        case PyUnicode_1BYTE_KIND:
8803            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8804                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8805            else
8806                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8807            break;
8808        case PyUnicode_2BYTE_KIND:
8809            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8810            break;
8811        case PyUnicode_4BYTE_KIND:
8812            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8813            break;
8814        default:
8815            assert(0); result = -2;
8816        }
8817    }
8818    else {
8819        switch (kind) {
8820        case PyUnicode_1BYTE_KIND:
8821            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8822                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8823            else
8824                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8825            break;
8826        case PyUnicode_2BYTE_KIND:
8827            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8828            break;
8829        case PyUnicode_4BYTE_KIND:
8830            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8831            break;
8832        default:
8833            assert(0); result = -2;
8834        }
8835    }
8836
8837    if (kind1 != kind)
8838        PyMem_Free(buf1);
8839    if (kind2 != kind)
8840        PyMem_Free(buf2);
8841
8842    return result;
8843}
8844
/* Dispatch to the kind-specific *_InsertThousandsGrouping() routine.

   unicode/index   destination string and the character offset inside it at
                   which the routine's buffer starts; unicode may be NULL,
                   in which case a NULL buffer of 1-byte kind is passed on
                   and *maxchar is set (see below).
   n_buffer, digits, n_digits, min_width, grouping
                   forwarded unchanged to the kind-specific routine.
   thousands_sep   separator string; converted to the destination's kind
                   when it is narrower.
   maxchar         written only when unicode is NULL: 127, raised to the
                   separator's maximum character if the returned length
                   differs from n_digits.

   Returns the value of the kind-specific routine, or -1 if a temporary
   conversion buffer could not be obtained or the kind is unknown. */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    PyObject *unicode, Py_ssize_t index,
    Py_ssize_t n_buffer,
    void *digits, Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping, PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    unsigned int kind, thousands_sep_kind;
    char *data, *thousands_sep_data;
    Py_ssize_t thousands_sep_len;
    Py_ssize_t len;

    if (unicode != NULL) {
        kind = PyUnicode_KIND(unicode);
        /* kind doubles as the character width in bytes here */
        data = (char *) PyUnicode_DATA(unicode) + index * kind;
    }
    else {
        kind = PyUnicode_1BYTE_KIND;
        data = NULL;
    }
    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    thousands_sep_data = PyUnicode_DATA(thousands_sep);
    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind) {
            /* widen a temporary copy of the separator to match unicode */
            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
            if (!thousands_sep_data)
                return -1;
        }
        else {
            /* NOTE(review): this branch replaces data with a converted copy
               of the whole string (the index offset is dropped), the switch
               below still dispatches on the original kind, and the copy is
               freed afterwards, so nothing is written to unicode.  It
               presumably is never taken because callers size unicode for
               the separator -- confirm against the callers. */
            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
            if (!data)
                return -1;
        }
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
            len = asciilib_InsertThousandsGrouping(
                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        else
            len = ucs1lib_InsertThousandsGrouping(
                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_2BYTE_KIND:
        len = ucs2lib_InsertThousandsGrouping(
            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        len = ucs4lib_InsertThousandsGrouping(
            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
        break;
    default:
        assert(0);
        return -1;
    }
    /* free whichever temporary conversion buffer was created above */
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind)
            PyMem_Free(thousands_sep_data);
        else
            PyMem_Free(data);
    }
    if (unicode == NULL) {
        /* no destination given: report the maximum character instead */
        *maxchar = 127;
        if (len != n_digits) {
            /* the length differs from the digit count, so account for the
               separator's characters as well */
            *maxchar = MAX_MAXCHAR(*maxchar,
                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
        }
    }
    return len;
}
8927
8928
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`; a negative `end` or `start` is taken relative to
   `len` and then clamped to 0.  `start` is not clamped to `len`, so it may
   still exceed `end` afterwards.

   `start` and `end` must be modifiable lvalues.  All arguments are
   evaluated more than once, so they must be free of side effects.  The
   expansion is a sequence of statements, not a single one: do not use it as
   the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
8943
8944Py_ssize_t
8945PyUnicode_Count(PyObject *str,
8946                PyObject *substr,
8947                Py_ssize_t start,
8948                Py_ssize_t end)
8949{
8950    Py_ssize_t result;
8951    PyObject* str_obj;
8952    PyObject* sub_obj;
8953    int kind1, kind2, kind;
8954    void *buf1 = NULL, *buf2 = NULL;
8955    Py_ssize_t len1, len2;
8956
8957    str_obj = PyUnicode_FromObject(str);
8958    if (!str_obj)
8959        return -1;
8960    sub_obj = PyUnicode_FromObject(substr);
8961    if (!sub_obj) {
8962        Py_DECREF(str_obj);
8963        return -1;
8964    }
8965    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
8966        Py_DECREF(sub_obj);
8967        Py_DECREF(str_obj);
8968        return -1;
8969    }
8970
8971    kind1 = PyUnicode_KIND(str_obj);
8972    kind2 = PyUnicode_KIND(sub_obj);
8973    kind = kind1;
8974    buf1 = PyUnicode_DATA(str_obj);
8975    buf2 = PyUnicode_DATA(sub_obj);
8976    if (kind2 != kind) {
8977        if (kind2 > kind) {
8978            Py_DECREF(sub_obj);
8979            Py_DECREF(str_obj);
8980            return 0;
8981        }
8982        buf2 = _PyUnicode_AsKind(sub_obj, kind);
8983    }
8984    if (!buf2)
8985        goto onError;
8986    len1 = PyUnicode_GET_LENGTH(str_obj);
8987    len2 = PyUnicode_GET_LENGTH(sub_obj);
8988
8989    ADJUST_INDICES(start, end, len1);
8990    switch (kind) {
8991    case PyUnicode_1BYTE_KIND:
8992        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8993            result = asciilib_count(
8994                ((Py_UCS1*)buf1) + start, end - start,
8995                buf2, len2, PY_SSIZE_T_MAX
8996                );
8997        else
8998            result = ucs1lib_count(
8999                ((Py_UCS1*)buf1) + start, end - start,
9000                buf2, len2, PY_SSIZE_T_MAX
9001                );
9002        break;
9003    case PyUnicode_2BYTE_KIND:
9004        result = ucs2lib_count(
9005            ((Py_UCS2*)buf1) + start, end - start,
9006            buf2, len2, PY_SSIZE_T_MAX
9007            );
9008        break;
9009    case PyUnicode_4BYTE_KIND:
9010        result = ucs4lib_count(
9011            ((Py_UCS4*)buf1) + start, end - start,
9012            buf2, len2, PY_SSIZE_T_MAX
9013            );
9014        break;
9015    default:
9016        assert(0); result = 0;
9017    }
9018
9019    Py_DECREF(sub_obj);
9020    Py_DECREF(str_obj);
9021
9022    if (kind2 != kind)
9023        PyMem_Free(buf2);
9024
9025    return result;
9026  onError:
9027    Py_DECREF(sub_obj);
9028    Py_DECREF(str_obj);
9029    if (kind2 != kind && buf2)
9030        PyMem_Free(buf2);
9031    return -1;
9032}
9033
9034Py_ssize_t
9035PyUnicode_Find(PyObject *str,
9036               PyObject *sub,
9037               Py_ssize_t start,
9038               Py_ssize_t end,
9039               int direction)
9040{
9041    Py_ssize_t result;
9042
9043    str = PyUnicode_FromObject(str);
9044    if (!str)
9045        return -2;
9046    sub = PyUnicode_FromObject(sub);
9047    if (!sub) {
9048        Py_DECREF(str);
9049        return -2;
9050    }
9051    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9052        Py_DECREF(sub);
9053        Py_DECREF(str);
9054        return -2;
9055    }
9056
9057    result = any_find_slice(direction,
9058        str, sub, start, end
9059        );
9060
9061    Py_DECREF(str);
9062    Py_DECREF(sub);
9063
9064    return result;
9065}
9066
9067Py_ssize_t
9068PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9069                   Py_ssize_t start, Py_ssize_t end,
9070                   int direction)
9071{
9072    int kind;
9073    Py_ssize_t result;
9074    if (PyUnicode_READY(str) == -1)
9075        return -2;
9076    if (start < 0 || end < 0) {
9077        PyErr_SetString(PyExc_IndexError, "string index out of range");
9078        return -2;
9079    }
9080    if (end > PyUnicode_GET_LENGTH(str))
9081        end = PyUnicode_GET_LENGTH(str);
9082    kind = PyUnicode_KIND(str);
9083    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9084                      kind, end-start, ch, direction);
9085    if (result == -1)
9086        return -1;
9087    else
9088        return start + result;
9089}
9090
9091static int
9092tailmatch(PyObject *self,
9093          PyObject *substring,
9094          Py_ssize_t start,
9095          Py_ssize_t end,
9096          int direction)
9097{
9098    int kind_self;
9099    int kind_sub;
9100    void *data_self;
9101    void *data_sub;
9102    Py_ssize_t offset;
9103    Py_ssize_t i;
9104    Py_ssize_t end_sub;
9105
9106    if (PyUnicode_READY(self) == -1 ||
9107        PyUnicode_READY(substring) == -1)
9108        return 0;
9109
9110    if (PyUnicode_GET_LENGTH(substring) == 0)
9111        return 1;
9112
9113    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9114    end -= PyUnicode_GET_LENGTH(substring);
9115    if (end < start)
9116        return 0;
9117
9118    kind_self = PyUnicode_KIND(self);
9119    data_self = PyUnicode_DATA(self);
9120    kind_sub = PyUnicode_KIND(substring);
9121    data_sub = PyUnicode_DATA(substring);
9122    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9123
9124    if (direction > 0)
9125        offset = end;
9126    else
9127        offset = start;
9128
9129    if (PyUnicode_READ(kind_self, data_self, offset) ==
9130        PyUnicode_READ(kind_sub, data_sub, 0) &&
9131        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9132        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9133        /* If both are of the same kind, memcmp is sufficient */
9134        if (kind_self == kind_sub) {
9135            return ! memcmp((char *)data_self +
9136                                (offset * PyUnicode_KIND(substring)),
9137                            data_sub,
9138                            PyUnicode_GET_LENGTH(substring) *
9139                                PyUnicode_KIND(substring));
9140        }
9141        /* otherwise we have to compare each character by first accesing it */
9142        else {
9143            /* We do not need to compare 0 and len(substring)-1 because
9144               the if statement above ensured already that they are equal
9145               when we end up here. */
9146            /* TODO: honor direction and do a forward or backwards search */
9147            for (i = 1; i < end_sub; ++i) {
9148                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9149                    PyUnicode_READ(kind_sub, data_sub, i))
9150                    return 0;
9151            }
9152            return 1;
9153        }
9154    }
9155
9156    return 0;
9157}
9158
9159Py_ssize_t
9160PyUnicode_Tailmatch(PyObject *str,
9161                    PyObject *substr,
9162                    Py_ssize_t start,
9163                    Py_ssize_t end,
9164                    int direction)
9165{
9166    Py_ssize_t result;
9167
9168    str = PyUnicode_FromObject(str);
9169    if (str == NULL)
9170        return -1;
9171    substr = PyUnicode_FromObject(substr);
9172    if (substr == NULL) {
9173        Py_DECREF(str);
9174        return -1;
9175    }
9176
9177    result = tailmatch(str, substr,
9178                       start, end, direction);
9179    Py_DECREF(str);
9180    Py_DECREF(substr);
9181    return result;
9182}
9183
9184/* Apply fixfct filter to the Unicode object self and return a
9185   reference to the modified object */
9186
9187static PyObject *
9188fixup(PyObject *self,
9189      Py_UCS4 (*fixfct)(PyObject *s))
9190{
9191    PyObject *u;
9192    Py_UCS4 maxchar_old, maxchar_new = 0;
9193    PyObject *v;
9194
9195    u = _PyUnicode_Copy(self);
9196    if (u == NULL)
9197        return NULL;
9198    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9199
9200    /* fix functions return the new maximum character in a string,
9201       if the kind of the resulting unicode object does not change,
9202       everything is fine.  Otherwise we need to change the string kind
9203       and re-run the fix function. */
9204    maxchar_new = fixfct(u);
9205
9206    if (maxchar_new == 0) {
9207        /* no changes */;
9208        if (PyUnicode_CheckExact(self)) {
9209            Py_DECREF(u);
9210            Py_INCREF(self);
9211            return self;
9212        }
9213        else
9214            return u;
9215    }
9216
9217    maxchar_new = align_maxchar(maxchar_new);
9218
9219    if (maxchar_new == maxchar_old)
9220        return u;
9221
9222    /* In case the maximum character changed, we need to
9223       convert the string to the new category. */
9224    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9225    if (v == NULL) {
9226        Py_DECREF(u);
9227        return NULL;
9228    }
9229    if (maxchar_new > maxchar_old) {
9230        /* If the maxchar increased so that the kind changed, not all
9231           characters are representable anymore and we need to fix the
9232           string again. This only happens in very few cases. */
9233        _PyUnicode_FastCopyCharacters(v, 0,
9234                                      self, 0, PyUnicode_GET_LENGTH(self));
9235        maxchar_old = fixfct(v);
9236        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9237    }
9238    else {
9239        _PyUnicode_FastCopyCharacters(v, 0,
9240                                      u, 0, PyUnicode_GET_LENGTH(self));
9241    }
9242    Py_DECREF(u);
9243    assert(_PyUnicode_CheckConsistency(v, 1));
9244    return v;
9245}
9246
9247static PyObject *
9248ascii_upper_or_lower(PyObject *self, int lower)
9249{
9250    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9251    char *resdata, *data = PyUnicode_DATA(self);
9252    PyObject *res;
9253
9254    res = PyUnicode_New(len, 127);
9255    if (res == NULL)
9256        return NULL;
9257    resdata = PyUnicode_DATA(res);
9258    if (lower)
9259        _Py_bytes_lower(resdata, data, len);
9260    else
9261        _Py_bytes_upper(resdata, data, len);
9262    return res;
9263}
9264
9265static Py_UCS4
9266handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9267{
9268    Py_ssize_t j;
9269    int final_sigma;
9270    Py_UCS4 c;
9271    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9272
9273     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9274
9275    where ! is a negation and \p{xxx} is a character with property xxx.
9276    */
9277    for (j = i - 1; j >= 0; j--) {
9278        c = PyUnicode_READ(kind, data, j);
9279        if (!_PyUnicode_IsCaseIgnorable(c))
9280            break;
9281    }
9282    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9283    if (final_sigma) {
9284        for (j = i + 1; j < length; j++) {
9285            c = PyUnicode_READ(kind, data, j);
9286            if (!_PyUnicode_IsCaseIgnorable(c))
9287                break;
9288        }
9289        final_sigma = j == length || !_PyUnicode_IsCased(c);
9290    }
9291    return (final_sigma) ? 0x3C2 : 0x3C3;
9292}
9293
9294static int
9295lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9296           Py_UCS4 c, Py_UCS4 *mapped)
9297{
9298    /* Obscure special case. */
9299    if (c == 0x3A3) {
9300        mapped[0] = handle_capital_sigma(kind, data, length, i);
9301        return 1;
9302    }
9303    return _PyUnicode_ToLowerFull(c, mapped);
9304}
9305
9306static Py_ssize_t
9307do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9308{
9309    Py_ssize_t i, k = 0;
9310    int n_res, j;
9311    Py_UCS4 c, mapped[3];
9312
9313    c = PyUnicode_READ(kind, data, 0);
9314    n_res = _PyUnicode_ToUpperFull(c, mapped);
9315    for (j = 0; j < n_res; j++) {
9316        *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9317        res[k++] = mapped[j];
9318    }
9319    for (i = 1; i < length; i++) {
9320        c = PyUnicode_READ(kind, data, i);
9321        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9322        for (j = 0; j < n_res; j++) {
9323            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9324            res[k++] = mapped[j];
9325        }
9326    }
9327    return k;
9328}
9329
9330static Py_ssize_t
9331do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9332    Py_ssize_t i, k = 0;
9333
9334    for (i = 0; i < length; i++) {
9335        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9336        int n_res, j;
9337        if (Py_UNICODE_ISUPPER(c)) {
9338            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9339        }
9340        else if (Py_UNICODE_ISLOWER(c)) {
9341            n_res = _PyUnicode_ToUpperFull(c, mapped);
9342        }
9343        else {
9344            n_res = 1;
9345            mapped[0] = c;
9346        }
9347        for (j = 0; j < n_res; j++) {
9348            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9349            res[k++] = mapped[j];
9350        }
9351    }
9352    return k;
9353}
9354
9355static Py_ssize_t
9356do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9357                  Py_UCS4 *maxchar, int lower)
9358{
9359    Py_ssize_t i, k = 0;
9360
9361    for (i = 0; i < length; i++) {
9362        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9363        int n_res, j;
9364        if (lower)
9365            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9366        else
9367            n_res = _PyUnicode_ToUpperFull(c, mapped);
9368        for (j = 0; j < n_res; j++) {
9369            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9370            res[k++] = mapped[j];
9371        }
9372    }
9373    return k;
9374}
9375
9376static Py_ssize_t
9377do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9378{
9379    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9380}
9381
9382static Py_ssize_t
9383do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9384{
9385    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9386}
9387
9388static Py_ssize_t
9389do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9390{
9391    Py_ssize_t i, k = 0;
9392
9393    for (i = 0; i < length; i++) {
9394        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9395        Py_UCS4 mapped[3];
9396        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9397        for (j = 0; j < n_res; j++) {
9398            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9399            res[k++] = mapped[j];
9400        }
9401    }
9402    return k;
9403}
9404
9405static Py_ssize_t
9406do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9407{
9408    Py_ssize_t i, k = 0;
9409    int previous_is_cased;
9410
9411    previous_is_cased = 0;
9412    for (i = 0; i < length; i++) {
9413        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9414        Py_UCS4 mapped[3];
9415        int n_res, j;
9416
9417        if (previous_is_cased)
9418            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9419        else
9420            n_res = _PyUnicode_ToTitleFull(c, mapped);
9421
9422        for (j = 0; j < n_res; j++) {
9423            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9424            res[k++] = mapped[j];
9425        }
9426
9427        previous_is_cased = _PyUnicode_IsCased(c);
9428    }
9429    return k;
9430}
9431
9432static PyObject *
9433case_operation(PyObject *self,
9434               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9435{
9436    PyObject *res = NULL;
9437    Py_ssize_t length, newlength = 0;
9438    int kind, outkind;
9439    void *data, *outdata;
9440    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9441
9442    assert(PyUnicode_IS_READY(self));
9443
9444    kind = PyUnicode_KIND(self);
9445    data = PyUnicode_DATA(self);
9446    length = PyUnicode_GET_LENGTH(self);
9447    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9448    if (tmp == NULL)
9449        return PyErr_NoMemory();
9450    newlength = perform(kind, data, length, tmp, &maxchar);
9451    res = PyUnicode_New(newlength, maxchar);
9452    if (res == NULL)
9453        goto leave;
9454    tmpend = tmp + newlength;
9455    outdata = PyUnicode_DATA(res);
9456    outkind = PyUnicode_KIND(res);
9457    switch (outkind) {
9458    case PyUnicode_1BYTE_KIND:
9459        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9460        break;
9461    case PyUnicode_2BYTE_KIND:
9462        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9463        break;
9464    case PyUnicode_4BYTE_KIND:
9465        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9466        break;
9467    default:
9468        assert(0);
9469        break;
9470    }
9471  leave:
9472    PyMem_FREE(tmp);
9473    return res;
9474}
9475
9476PyObject *
9477PyUnicode_Join(PyObject *separator, PyObject *seq)
9478{
9479    PyObject *sep = NULL;
9480    Py_ssize_t seplen;
9481    PyObject *res = NULL; /* the result */
9482    PyObject *fseq;          /* PySequence_Fast(seq) */
9483    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9484    PyObject **items;
9485    PyObject *item;
9486    Py_ssize_t sz, i, res_offset;
9487    Py_UCS4 maxchar;
9488    Py_UCS4 item_maxchar;
9489    int use_memcpy;
9490    unsigned char *res_data = NULL, *sep_data = NULL;
9491    PyObject *last_obj;
9492    unsigned int kind = 0;
9493
9494    fseq = PySequence_Fast(seq, "");
9495    if (fseq == NULL) {
9496        return NULL;
9497    }
9498
9499    /* NOTE: the following code can't call back into Python code,
9500     * so we are sure that fseq won't be mutated.
9501     */
9502
9503    seqlen = PySequence_Fast_GET_SIZE(fseq);
9504    /* If empty sequence, return u"". */
9505    if (seqlen == 0) {
9506        Py_DECREF(fseq);
9507        Py_INCREF(unicode_empty);
9508        res = unicode_empty;
9509        return res;
9510    }
9511
9512    /* If singleton sequence with an exact Unicode, return that. */
9513    last_obj = NULL;
9514    items = PySequence_Fast_ITEMS(fseq);
9515    if (seqlen == 1) {
9516        if (PyUnicode_CheckExact(items[0])) {
9517            res = items[0];
9518            Py_INCREF(res);
9519            Py_DECREF(fseq);
9520            return res;
9521        }
9522        seplen = 0;
9523        maxchar = 0;
9524    }
9525    else {
9526        /* Set up sep and seplen */
9527        if (separator == NULL) {
9528            /* fall back to a blank space separator */
9529            sep = PyUnicode_FromOrdinal(' ');
9530            if (!sep)
9531                goto onError;
9532            seplen = 1;
9533            maxchar = 32;
9534        }
9535        else {
9536            if (!PyUnicode_Check(separator)) {
9537                PyErr_Format(PyExc_TypeError,
9538                             "separator: expected str instance,"
9539                             " %.80s found",
9540                             Py_TYPE(separator)->tp_name);
9541                goto onError;
9542            }
9543            if (PyUnicode_READY(separator))
9544                goto onError;
9545            sep = separator;
9546            seplen = PyUnicode_GET_LENGTH(separator);
9547            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9548            /* inc refcount to keep this code path symmetric with the
9549               above case of a blank separator */
9550            Py_INCREF(sep);
9551        }
9552        last_obj = sep;
9553    }
9554
9555    /* There are at least two things to join, or else we have a subclass
9556     * of str in the sequence.
9557     * Do a pre-pass to figure out the total amount of space we'll
9558     * need (sz), and see whether all argument are strings.
9559     */
9560    sz = 0;
9561#ifdef Py_DEBUG
9562    use_memcpy = 0;
9563#else
9564    use_memcpy = 1;
9565#endif
9566    for (i = 0; i < seqlen; i++) {
9567        const Py_ssize_t old_sz = sz;
9568        item = items[i];
9569        if (!PyUnicode_Check(item)) {
9570            PyErr_Format(PyExc_TypeError,
9571                         "sequence item %zd: expected str instance,"
9572                         " %.80s found",
9573                         i, Py_TYPE(item)->tp_name);
9574            goto onError;
9575        }
9576        if (PyUnicode_READY(item) == -1)
9577            goto onError;
9578        sz += PyUnicode_GET_LENGTH(item);
9579        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9580        maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
9581        if (i != 0)
9582            sz += seplen;
9583        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9584            PyErr_SetString(PyExc_OverflowError,
9585                            "join() result is too long for a Python string");
9586            goto onError;
9587        }
9588        if (use_memcpy && last_obj != NULL) {
9589            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9590                use_memcpy = 0;
9591        }
9592        last_obj = item;
9593    }
9594
9595    res = PyUnicode_New(sz, maxchar);
9596    if (res == NULL)
9597        goto onError;
9598
9599    /* Catenate everything. */
9600#ifdef Py_DEBUG
9601    use_memcpy = 0;
9602#else
9603    if (use_memcpy) {
9604        res_data = PyUnicode_1BYTE_DATA(res);
9605        kind = PyUnicode_KIND(res);
9606        if (seplen != 0)
9607            sep_data = PyUnicode_1BYTE_DATA(sep);
9608    }
9609#endif
9610    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9611        Py_ssize_t itemlen;
9612        item = items[i];
9613        /* Copy item, and maybe the separator. */
9614        if (i && seplen != 0) {
9615            if (use_memcpy) {
9616                Py_MEMCPY(res_data,
9617                          sep_data,
9618                          kind * seplen);
9619                res_data += kind * seplen;
9620            }
9621            else {
9622                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9623                res_offset += seplen;
9624            }
9625        }
9626        itemlen = PyUnicode_GET_LENGTH(item);
9627        if (itemlen != 0) {
9628            if (use_memcpy) {
9629                Py_MEMCPY(res_data,
9630                          PyUnicode_DATA(item),
9631                          kind * itemlen);
9632                res_data += kind * itemlen;
9633            }
9634            else {
9635                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9636                res_offset += itemlen;
9637            }
9638        }
9639    }
9640    if (use_memcpy)
9641        assert(res_data == PyUnicode_1BYTE_DATA(res)
9642                           + kind * PyUnicode_GET_LENGTH(res));
9643    else
9644        assert(res_offset == PyUnicode_GET_LENGTH(res));
9645
9646    Py_DECREF(fseq);
9647    Py_XDECREF(sep);
9648    assert(_PyUnicode_CheckConsistency(res, 1));
9649    return res;
9650
9651  onError:
9652    Py_DECREF(fseq);
9653    Py_XDECREF(sep);
9654    Py_XDECREF(res);
9655    return NULL;
9656}
9657
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the character buffer `data`, beginning at
   character index `start`.  `kind` selects the width of one character
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
   PyUnicode_WCHAR_KIND is not supported.

   `value` and `length` are evaluated exactly once, and every argument is
   parenthesized, so arbitrary expressions may be passed. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_ = (value); \
        const Py_ssize_t count_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + (start); \
            memset(to_, (unsigned char)fill_, (size_t)count_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + (start); \
            for (; i_ < count_; ++i_, ++to_) *to_ = (Py_UCS2)fill_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + (start); \
            for (; i_ < count_; ++i_, ++to_) *to_ = fill_; \
            break; \
        } \
        default: \
            assert(0); \
            break; \
        } \
    } while (0)
9681
9682void
9683_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9684                    Py_UCS4 fill_char)
9685{
9686    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9687    const void *data = PyUnicode_DATA(unicode);
9688    assert(PyUnicode_IS_READY(unicode));
9689    assert(unicode_modifiable(unicode));
9690    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9691    assert(start >= 0);
9692    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9693    FILL(kind, data, fill_char, start, length);
9694}
9695
9696Py_ssize_t
9697PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9698               Py_UCS4 fill_char)
9699{
9700    Py_ssize_t maxlen;
9701
9702    if (!PyUnicode_Check(unicode)) {
9703        PyErr_BadInternalCall();
9704        return -1;
9705    }
9706    if (PyUnicode_READY(unicode) == -1)
9707        return -1;
9708    if (unicode_check_modifiable(unicode))
9709        return -1;
9710
9711    if (start < 0) {
9712        PyErr_SetString(PyExc_IndexError, "string index out of range");
9713        return -1;
9714    }
9715    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9716        PyErr_SetString(PyExc_ValueError,
9717                         "fill character is bigger than "
9718                         "the string maximum character");
9719        return -1;
9720    }
9721
9722    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9723    length = Py_MIN(maxlen, length);
9724    if (length <= 0)
9725        return 0;
9726
9727    _PyUnicode_FastFill(unicode, start, length, fill_char);
9728    return length;
9729}
9730
9731static PyObject *
9732pad(PyObject *self,
9733    Py_ssize_t left,
9734    Py_ssize_t right,
9735    Py_UCS4 fill)
9736{
9737    PyObject *u;
9738    Py_UCS4 maxchar;
9739    int kind;
9740    void *data;
9741
9742    if (left < 0)
9743        left = 0;
9744    if (right < 0)
9745        right = 0;
9746
9747    if (left == 0 && right == 0)
9748        return unicode_result_unchanged(self);
9749
9750    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9751        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9752        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9753        return NULL;
9754    }
9755    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9756    maxchar = MAX_MAXCHAR(maxchar, fill);
9757    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9758    if (!u)
9759        return NULL;
9760
9761    kind = PyUnicode_KIND(u);
9762    data = PyUnicode_DATA(u);
9763    if (left)
9764        FILL(kind, data, fill, 0, left);
9765    if (right)
9766        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9767    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9768    assert(_PyUnicode_CheckConsistency(u, 1));
9769    return u;
9770}
9771
9772PyObject *
9773PyUnicode_Splitlines(PyObject *string, int keepends)
9774{
9775    PyObject *list;
9776
9777    string = PyUnicode_FromObject(string);
9778    if (string == NULL)
9779        return NULL;
9780    if (PyUnicode_READY(string) == -1) {
9781        Py_DECREF(string);
9782        return NULL;
9783    }
9784
9785    switch (PyUnicode_KIND(string)) {
9786    case PyUnicode_1BYTE_KIND:
9787        if (PyUnicode_IS_ASCII(string))
9788            list = asciilib_splitlines(
9789                string, PyUnicode_1BYTE_DATA(string),
9790                PyUnicode_GET_LENGTH(string), keepends);
9791        else
9792            list = ucs1lib_splitlines(
9793                string, PyUnicode_1BYTE_DATA(string),
9794                PyUnicode_GET_LENGTH(string), keepends);
9795        break;
9796    case PyUnicode_2BYTE_KIND:
9797        list = ucs2lib_splitlines(
9798            string, PyUnicode_2BYTE_DATA(string),
9799            PyUnicode_GET_LENGTH(string), keepends);
9800        break;
9801    case PyUnicode_4BYTE_KIND:
9802        list = ucs4lib_splitlines(
9803            string, PyUnicode_4BYTE_DATA(string),
9804            PyUnicode_GET_LENGTH(string), keepends);
9805        break;
9806    default:
9807        assert(0);
9808        list = 0;
9809    }
9810    Py_DECREF(string);
9811    return list;
9812}
9813
9814static PyObject *
9815split(PyObject *self,
9816      PyObject *substring,
9817      Py_ssize_t maxcount)
9818{
9819    int kind1, kind2, kind;
9820    void *buf1, *buf2;
9821    Py_ssize_t len1, len2;
9822    PyObject* out;
9823
9824    if (maxcount < 0)
9825        maxcount = PY_SSIZE_T_MAX;
9826
9827    if (PyUnicode_READY(self) == -1)
9828        return NULL;
9829
9830    if (substring == NULL)
9831        switch (PyUnicode_KIND(self)) {
9832        case PyUnicode_1BYTE_KIND:
9833            if (PyUnicode_IS_ASCII(self))
9834                return asciilib_split_whitespace(
9835                    self,  PyUnicode_1BYTE_DATA(self),
9836                    PyUnicode_GET_LENGTH(self), maxcount
9837                    );
9838            else
9839                return ucs1lib_split_whitespace(
9840                    self,  PyUnicode_1BYTE_DATA(self),
9841                    PyUnicode_GET_LENGTH(self), maxcount
9842                    );
9843        case PyUnicode_2BYTE_KIND:
9844            return ucs2lib_split_whitespace(
9845                self,  PyUnicode_2BYTE_DATA(self),
9846                PyUnicode_GET_LENGTH(self), maxcount
9847                );
9848        case PyUnicode_4BYTE_KIND:
9849            return ucs4lib_split_whitespace(
9850                self,  PyUnicode_4BYTE_DATA(self),
9851                PyUnicode_GET_LENGTH(self), maxcount
9852                );
9853        default:
9854            assert(0);
9855            return NULL;
9856        }
9857
9858    if (PyUnicode_READY(substring) == -1)
9859        return NULL;
9860
9861    kind1 = PyUnicode_KIND(self);
9862    kind2 = PyUnicode_KIND(substring);
9863    kind = kind1 > kind2 ? kind1 : kind2;
9864    buf1 = PyUnicode_DATA(self);
9865    buf2 = PyUnicode_DATA(substring);
9866    if (kind1 != kind)
9867        buf1 = _PyUnicode_AsKind(self, kind);
9868    if (!buf1)
9869        return NULL;
9870    if (kind2 != kind)
9871        buf2 = _PyUnicode_AsKind(substring, kind);
9872    if (!buf2) {
9873        if (kind1 != kind) PyMem_Free(buf1);
9874        return NULL;
9875    }
9876    len1 = PyUnicode_GET_LENGTH(self);
9877    len2 = PyUnicode_GET_LENGTH(substring);
9878
9879    switch (kind) {
9880    case PyUnicode_1BYTE_KIND:
9881        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9882            out = asciilib_split(
9883                self,  buf1, len1, buf2, len2, maxcount);
9884        else
9885            out = ucs1lib_split(
9886                self,  buf1, len1, buf2, len2, maxcount);
9887        break;
9888    case PyUnicode_2BYTE_KIND:
9889        out = ucs2lib_split(
9890            self,  buf1, len1, buf2, len2, maxcount);
9891        break;
9892    case PyUnicode_4BYTE_KIND:
9893        out = ucs4lib_split(
9894            self,  buf1, len1, buf2, len2, maxcount);
9895        break;
9896    default:
9897        out = NULL;
9898    }
9899    if (kind1 != kind)
9900        PyMem_Free(buf1);
9901    if (kind2 != kind)
9902        PyMem_Free(buf2);
9903    return out;
9904}
9905
9906static PyObject *
9907rsplit(PyObject *self,
9908       PyObject *substring,
9909       Py_ssize_t maxcount)
9910{
9911    int kind1, kind2, kind;
9912    void *buf1, *buf2;
9913    Py_ssize_t len1, len2;
9914    PyObject* out;
9915
9916    if (maxcount < 0)
9917        maxcount = PY_SSIZE_T_MAX;
9918
9919    if (PyUnicode_READY(self) == -1)
9920        return NULL;
9921
9922    if (substring == NULL)
9923        switch (PyUnicode_KIND(self)) {
9924        case PyUnicode_1BYTE_KIND:
9925            if (PyUnicode_IS_ASCII(self))
9926                return asciilib_rsplit_whitespace(
9927                    self,  PyUnicode_1BYTE_DATA(self),
9928                    PyUnicode_GET_LENGTH(self), maxcount
9929                    );
9930            else
9931                return ucs1lib_rsplit_whitespace(
9932                    self,  PyUnicode_1BYTE_DATA(self),
9933                    PyUnicode_GET_LENGTH(self), maxcount
9934                    );
9935        case PyUnicode_2BYTE_KIND:
9936            return ucs2lib_rsplit_whitespace(
9937                self,  PyUnicode_2BYTE_DATA(self),
9938                PyUnicode_GET_LENGTH(self), maxcount
9939                );
9940        case PyUnicode_4BYTE_KIND:
9941            return ucs4lib_rsplit_whitespace(
9942                self,  PyUnicode_4BYTE_DATA(self),
9943                PyUnicode_GET_LENGTH(self), maxcount
9944                );
9945        default:
9946            assert(0);
9947            return NULL;
9948        }
9949
9950    if (PyUnicode_READY(substring) == -1)
9951        return NULL;
9952
9953    kind1 = PyUnicode_KIND(self);
9954    kind2 = PyUnicode_KIND(substring);
9955    kind = kind1 > kind2 ? kind1 : kind2;
9956    buf1 = PyUnicode_DATA(self);
9957    buf2 = PyUnicode_DATA(substring);
9958    if (kind1 != kind)
9959        buf1 = _PyUnicode_AsKind(self, kind);
9960    if (!buf1)
9961        return NULL;
9962    if (kind2 != kind)
9963        buf2 = _PyUnicode_AsKind(substring, kind);
9964    if (!buf2) {
9965        if (kind1 != kind) PyMem_Free(buf1);
9966        return NULL;
9967    }
9968    len1 = PyUnicode_GET_LENGTH(self);
9969    len2 = PyUnicode_GET_LENGTH(substring);
9970
9971    switch (kind) {
9972    case PyUnicode_1BYTE_KIND:
9973        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9974            out = asciilib_rsplit(
9975                self,  buf1, len1, buf2, len2, maxcount);
9976        else
9977            out = ucs1lib_rsplit(
9978                self,  buf1, len1, buf2, len2, maxcount);
9979        break;
9980    case PyUnicode_2BYTE_KIND:
9981        out = ucs2lib_rsplit(
9982            self,  buf1, len1, buf2, len2, maxcount);
9983        break;
9984    case PyUnicode_4BYTE_KIND:
9985        out = ucs4lib_rsplit(
9986            self,  buf1, len1, buf2, len2, maxcount);
9987        break;
9988    default:
9989        out = NULL;
9990    }
9991    if (kind1 != kind)
9992        PyMem_Free(buf1);
9993    if (kind2 != kind)
9994        PyMem_Free(buf2);
9995    return out;
9996}
9997
9998static Py_ssize_t
9999anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10000            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10001{
10002    switch (kind) {
10003    case PyUnicode_1BYTE_KIND:
10004        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10005            return asciilib_find(buf1, len1, buf2, len2, offset);
10006        else
10007            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10008    case PyUnicode_2BYTE_KIND:
10009        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10010    case PyUnicode_4BYTE_KIND:
10011        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10012    }
10013    assert(0);
10014    return -1;
10015}
10016
10017static Py_ssize_t
10018anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10019             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10020{
10021    switch (kind) {
10022    case PyUnicode_1BYTE_KIND:
10023        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10024            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10025        else
10026            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10027    case PyUnicode_2BYTE_KIND:
10028        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10029    case PyUnicode_4BYTE_KIND:
10030        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10031    }
10032    assert(0);
10033    return 0;
10034}
10035
10036static PyObject *
10037replace(PyObject *self, PyObject *str1,
10038        PyObject *str2, Py_ssize_t maxcount)
10039{
10040    PyObject *u;
10041    char *sbuf = PyUnicode_DATA(self);
10042    char *buf1 = PyUnicode_DATA(str1);
10043    char *buf2 = PyUnicode_DATA(str2);
10044    int srelease = 0, release1 = 0, release2 = 0;
10045    int skind = PyUnicode_KIND(self);
10046    int kind1 = PyUnicode_KIND(str1);
10047    int kind2 = PyUnicode_KIND(str2);
10048    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10049    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10050    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10051    int mayshrink;
10052    Py_UCS4 maxchar, maxchar_str2;
10053
10054    if (maxcount < 0)
10055        maxcount = PY_SSIZE_T_MAX;
10056    else if (maxcount == 0 || slen == 0)
10057        goto nothing;
10058
10059    if (str1 == str2)
10060        goto nothing;
10061    if (skind < kind1)
10062        /* substring too wide to be present */
10063        goto nothing;
10064
10065    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10066    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10067    /* Replacing str1 with str2 may cause a maxchar reduction in the
10068       result string. */
10069    mayshrink = (maxchar_str2 < maxchar);
10070    maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
10071
10072    if (len1 == len2) {
10073        /* same length */
10074        if (len1 == 0)
10075            goto nothing;
10076        if (len1 == 1) {
10077            /* replace characters */
10078            Py_UCS4 u1, u2;
10079            int rkind;
10080            Py_ssize_t index, pos;
10081            char *src;
10082
10083            u1 = PyUnicode_READ_CHAR(str1, 0);
10084            pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10085            if (pos < 0)
10086                goto nothing;
10087            u2 = PyUnicode_READ_CHAR(str2, 0);
10088            u = PyUnicode_New(slen, maxchar);
10089            if (!u)
10090                goto error;
10091            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10092            rkind = PyUnicode_KIND(u);
10093
10094            PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10095            index = 0;
10096            src = sbuf;
10097            while (--maxcount)
10098            {
10099                pos++;
10100                src += pos * PyUnicode_KIND(self);
10101                slen -= pos;
10102                index += pos;
10103                pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10104                if (pos < 0)
10105                    break;
10106                PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10107            }
10108        }
10109        else {
10110            int rkind = skind;
10111            char *res;
10112            Py_ssize_t i;
10113
10114            if (kind1 < rkind) {
10115                /* widen substring */
10116                buf1 = _PyUnicode_AsKind(str1, rkind);
10117                if (!buf1) goto error;
10118                release1 = 1;
10119            }
10120            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10121            if (i < 0)
10122                goto nothing;
10123            if (rkind > kind2) {
10124                /* widen replacement */
10125                buf2 = _PyUnicode_AsKind(str2, rkind);
10126                if (!buf2) goto error;
10127                release2 = 1;
10128            }
10129            else if (rkind < kind2) {
10130                /* widen self and buf1 */
10131                rkind = kind2;
10132                if (release1) PyMem_Free(buf1);
10133                sbuf = _PyUnicode_AsKind(self, rkind);
10134                if (!sbuf) goto error;
10135                srelease = 1;
10136                buf1 = _PyUnicode_AsKind(str1, rkind);
10137                if (!buf1) goto error;
10138                release1 = 1;
10139            }
10140            u = PyUnicode_New(slen, maxchar);
10141            if (!u)
10142                goto error;
10143            assert(PyUnicode_KIND(u) == rkind);
10144            res = PyUnicode_DATA(u);
10145
10146            memcpy(res, sbuf, rkind * slen);
10147            /* change everything in-place, starting with this one */
10148            memcpy(res + rkind * i,
10149                   buf2,
10150                   rkind * len2);
10151            i += len1;
10152
10153            while ( --maxcount > 0) {
10154                i = anylib_find(rkind, self,
10155                                sbuf+rkind*i, slen-i,
10156                                str1, buf1, len1, i);
10157                if (i == -1)
10158                    break;
10159                memcpy(res + rkind * i,
10160                       buf2,
10161                       rkind * len2);
10162                i += len1;
10163            }
10164        }
10165    }
10166    else {
10167        Py_ssize_t n, i, j, ires;
10168        Py_ssize_t product, new_size;
10169        int rkind = skind;
10170        char *res;
10171
10172        if (kind1 < rkind) {
10173            /* widen substring */
10174            buf1 = _PyUnicode_AsKind(str1, rkind);
10175            if (!buf1) goto error;
10176            release1 = 1;
10177        }
10178        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10179        if (n == 0)
10180            goto nothing;
10181        if (kind2 < rkind) {
10182            /* widen replacement */
10183            buf2 = _PyUnicode_AsKind(str2, rkind);
10184            if (!buf2) goto error;
10185            release2 = 1;
10186        }
10187        else if (kind2 > rkind) {
10188            /* widen self and buf1 */
10189            rkind = kind2;
10190            sbuf = _PyUnicode_AsKind(self, rkind);
10191            if (!sbuf) goto error;
10192            srelease = 1;
10193            if (release1) PyMem_Free(buf1);
10194            buf1 = _PyUnicode_AsKind(str1, rkind);
10195            if (!buf1) goto error;
10196            release1 = 1;
10197        }
10198        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10199           PyUnicode_GET_LENGTH(str1))); */
10200        product = n * (len2-len1);
10201        if ((product / (len2-len1)) != n) {
10202                PyErr_SetString(PyExc_OverflowError,
10203                                "replace string is too long");
10204                goto error;
10205        }
10206        new_size = slen + product;
10207        if (new_size == 0) {
10208            Py_INCREF(unicode_empty);
10209            u = unicode_empty;
10210            goto done;
10211        }
10212        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10213            PyErr_SetString(PyExc_OverflowError,
10214                            "replace string is too long");
10215            goto error;
10216        }
10217        u = PyUnicode_New(new_size, maxchar);
10218        if (!u)
10219            goto error;
10220        assert(PyUnicode_KIND(u) == rkind);
10221        res = PyUnicode_DATA(u);
10222        ires = i = 0;
10223        if (len1 > 0) {
10224            while (n-- > 0) {
10225                /* look for next match */
10226                j = anylib_find(rkind, self,
10227                                sbuf + rkind * i, slen-i,
10228                                str1, buf1, len1, i);
10229                if (j == -1)
10230                    break;
10231                else if (j > i) {
10232                    /* copy unchanged part [i:j] */
10233                    memcpy(res + rkind * ires,
10234                           sbuf + rkind * i,
10235                           rkind * (j-i));
10236                    ires += j - i;
10237                }
10238                /* copy substitution string */
10239                if (len2 > 0) {
10240                    memcpy(res + rkind * ires,
10241                           buf2,
10242                           rkind * len2);
10243                    ires += len2;
10244                }
10245                i = j + len1;
10246            }
10247            if (i < slen)
10248                /* copy tail [i:] */
10249                memcpy(res + rkind * ires,
10250                       sbuf + rkind * i,
10251                       rkind * (slen-i));
10252        }
10253        else {
10254            /* interleave */
10255            while (n > 0) {
10256                memcpy(res + rkind * ires,
10257                       buf2,
10258                       rkind * len2);
10259                ires += len2;
10260                if (--n <= 0)
10261                    break;
10262                memcpy(res + rkind * ires,
10263                       sbuf + rkind * i,
10264                       rkind);
10265                ires++;
10266                i++;
10267            }
10268            memcpy(res + rkind * ires,
10269                   sbuf + rkind * i,
10270                   rkind * (slen-i));
10271        }
10272    }
10273
10274    if (mayshrink) {
10275        unicode_adjust_maxchar(&u);
10276        if (u == NULL)
10277            goto error;
10278    }
10279
10280  done:
10281    if (srelease)
10282        PyMem_FREE(sbuf);
10283    if (release1)
10284        PyMem_FREE(buf1);
10285    if (release2)
10286        PyMem_FREE(buf2);
10287    assert(_PyUnicode_CheckConsistency(u, 1));
10288    return u;
10289
10290  nothing:
10291    /* nothing to replace; return original string (when possible) */
10292    if (srelease)
10293        PyMem_FREE(sbuf);
10294    if (release1)
10295        PyMem_FREE(buf1);
10296    if (release2)
10297        PyMem_FREE(buf2);
10298    return unicode_result_unchanged(self);
10299
10300  error:
10301    if (srelease && sbuf)
10302        PyMem_FREE(sbuf);
10303    if (release1 && buf1)
10304        PyMem_FREE(buf1);
10305    if (release2 && buf2)
10306        PyMem_FREE(buf2);
10307    return NULL;
10308}
10309
10310/* --- Unicode Object Methods --------------------------------------------- */
10311
10312PyDoc_STRVAR(title__doc__,
10313             "S.title() -> str\n\
10314\n\
10315Return a titlecased version of S, i.e. words start with title case\n\
10316characters, all remaining cased characters have lower case.");
10317
10318static PyObject*
10319unicode_title(PyObject *self)
10320{
10321    if (PyUnicode_READY(self) == -1)
10322        return NULL;
10323    return case_operation(self, do_title);
10324}
10325
10326PyDoc_STRVAR(capitalize__doc__,
10327             "S.capitalize() -> str\n\
10328\n\
10329Return a capitalized version of S, i.e. make the first character\n\
10330have upper case and the rest lower case.");
10331
10332static PyObject*
10333unicode_capitalize(PyObject *self)
10334{
10335    if (PyUnicode_READY(self) == -1)
10336        return NULL;
10337    if (PyUnicode_GET_LENGTH(self) == 0)
10338        return unicode_result_unchanged(self);
10339    return case_operation(self, do_capitalize);
10340}
10341
10342PyDoc_STRVAR(casefold__doc__,
10343             "S.casefold() -> str\n\
10344\n\
10345Return a version of S suitable for caseless comparisons.");
10346
10347static PyObject *
10348unicode_casefold(PyObject *self)
10349{
10350    if (PyUnicode_READY(self) == -1)
10351        return NULL;
10352    if (PyUnicode_IS_ASCII(self))
10353        return ascii_upper_or_lower(self, 1);
10354    return case_operation(self, do_casefold);
10355}
10356
10357
10358/* Argument converter.  Coerces to a single unicode character */
10359
10360static int
10361convert_uc(PyObject *obj, void *addr)
10362{
10363    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10364    PyObject *uniobj;
10365
10366    uniobj = PyUnicode_FromObject(obj);
10367    if (uniobj == NULL) {
10368        PyErr_SetString(PyExc_TypeError,
10369                        "The fill character cannot be converted to Unicode");
10370        return 0;
10371    }
10372    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10373        PyErr_SetString(PyExc_TypeError,
10374                        "The fill character must be exactly one character long");
10375        Py_DECREF(uniobj);
10376        return 0;
10377    }
10378    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10379    Py_DECREF(uniobj);
10380    return 1;
10381}
10382
10383PyDoc_STRVAR(center__doc__,
10384             "S.center(width[, fillchar]) -> str\n\
10385\n\
10386Return S centered in a string of length width. Padding is\n\
10387done using the specified fill character (default is a space)");
10388
10389static PyObject *
10390unicode_center(PyObject *self, PyObject *args)
10391{
10392    Py_ssize_t marg, left;
10393    Py_ssize_t width;
10394    Py_UCS4 fillchar = ' ';
10395
10396    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10397        return NULL;
10398
10399    if (PyUnicode_READY(self) == -1)
10400        return NULL;
10401
10402    if (PyUnicode_GET_LENGTH(self) >= width)
10403        return unicode_result_unchanged(self);
10404
10405    marg = width - PyUnicode_GET_LENGTH(self);
10406    left = marg / 2 + (marg & width & 1);
10407
10408    return pad(self, left, marg - left, fillchar);
10409}
10410
10411/* This function assumes that str1 and str2 are readied by the caller. */
10412
10413static int
10414unicode_compare(PyObject *str1, PyObject *str2)
10415{
10416    int kind1, kind2;
10417    void *data1, *data2;
10418    Py_ssize_t len1, len2, i;
10419
10420    kind1 = PyUnicode_KIND(str1);
10421    kind2 = PyUnicode_KIND(str2);
10422    data1 = PyUnicode_DATA(str1);
10423    data2 = PyUnicode_DATA(str2);
10424    len1 = PyUnicode_GET_LENGTH(str1);
10425    len2 = PyUnicode_GET_LENGTH(str2);
10426
10427    for (i = 0; i < len1 && i < len2; ++i) {
10428        Py_UCS4 c1, c2;
10429        c1 = PyUnicode_READ(kind1, data1, i);
10430        c2 = PyUnicode_READ(kind2, data2, i);
10431
10432        if (c1 != c2)
10433            return (c1 < c2) ? -1 : 1;
10434    }
10435
10436    return (len1 < len2) ? -1 : (len1 != len2);
10437}
10438
10439int
10440PyUnicode_Compare(PyObject *left, PyObject *right)
10441{
10442    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10443        if (PyUnicode_READY(left) == -1 ||
10444            PyUnicode_READY(right) == -1)
10445            return -1;
10446        return unicode_compare(left, right);
10447    }
10448    PyErr_Format(PyExc_TypeError,
10449                 "Can't compare %.100s and %.100s",
10450                 left->ob_type->tp_name,
10451                 right->ob_type->tp_name);
10452    return -1;
10453}
10454
10455int
10456PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10457{
10458    Py_ssize_t i;
10459    int kind;
10460    void *data;
10461    Py_UCS4 chr;
10462
10463    assert(_PyUnicode_CHECK(uni));
10464    if (PyUnicode_READY(uni) == -1)
10465        return -1;
10466    kind = PyUnicode_KIND(uni);
10467    data = PyUnicode_DATA(uni);
10468    /* Compare Unicode string and source character set string */
10469    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10470        if (chr != str[i])
10471            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10472    /* This check keeps Python strings that end in '\0' from comparing equal
10473     to C strings identical up to that point. */
10474    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10475        return 1; /* uni is longer */
10476    if (str[i])
10477        return -1; /* str is longer */
10478    return 0;
10479}
10480
10481
10482#define TEST_COND(cond)                         \
10483    ((cond) ? Py_True : Py_False)
10484
10485PyObject *
10486PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10487{
10488    int result;
10489
10490    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10491        PyObject *v;
10492        if (PyUnicode_READY(left) == -1 ||
10493            PyUnicode_READY(right) == -1)
10494            return NULL;
10495        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10496            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
10497            if (op == Py_EQ) {
10498                Py_INCREF(Py_False);
10499                return Py_False;
10500            }
10501            if (op == Py_NE) {
10502                Py_INCREF(Py_True);
10503                return Py_True;
10504            }
10505        }
10506        if (left == right)
10507            result = 0;
10508        else
10509            result = unicode_compare(left, right);
10510
10511        /* Convert the return value to a Boolean */
10512        switch (op) {
10513        case Py_EQ:
10514            v = TEST_COND(result == 0);
10515            break;
10516        case Py_NE:
10517            v = TEST_COND(result != 0);
10518            break;
10519        case Py_LE:
10520            v = TEST_COND(result <= 0);
10521            break;
10522        case Py_GE:
10523            v = TEST_COND(result >= 0);
10524            break;
10525        case Py_LT:
10526            v = TEST_COND(result == -1);
10527            break;
10528        case Py_GT:
10529            v = TEST_COND(result == 1);
10530            break;
10531        default:
10532            PyErr_BadArgument();
10533            return NULL;
10534        }
10535        Py_INCREF(v);
10536        return v;
10537    }
10538
10539    Py_RETURN_NOTIMPLEMENTED;
10540}
10541
10542int
10543PyUnicode_Contains(PyObject *container, PyObject *element)
10544{
10545    PyObject *str, *sub;
10546    int kind1, kind2, kind;
10547    void *buf1, *buf2;
10548    Py_ssize_t len1, len2;
10549    int result;
10550
10551    /* Coerce the two arguments */
10552    sub = PyUnicode_FromObject(element);
10553    if (!sub) {
10554        PyErr_Format(PyExc_TypeError,
10555                     "'in <string>' requires string as left operand, not %s",
10556                     element->ob_type->tp_name);
10557        return -1;
10558    }
10559
10560    str = PyUnicode_FromObject(container);
10561    if (!str) {
10562        Py_DECREF(sub);
10563        return -1;
10564    }
10565    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10566        Py_DECREF(sub);
10567        Py_DECREF(str);
10568    }
10569
10570    kind1 = PyUnicode_KIND(str);
10571    kind2 = PyUnicode_KIND(sub);
10572    kind = kind1;
10573    buf1 = PyUnicode_DATA(str);
10574    buf2 = PyUnicode_DATA(sub);
10575    if (kind2 != kind) {
10576        if (kind2 > kind) {
10577            Py_DECREF(sub);
10578            Py_DECREF(str);
10579            return 0;
10580        }
10581        buf2 = _PyUnicode_AsKind(sub, kind);
10582    }
10583    if (!buf2) {
10584        Py_DECREF(sub);
10585        Py_DECREF(str);
10586        return -1;
10587    }
10588    len1 = PyUnicode_GET_LENGTH(str);
10589    len2 = PyUnicode_GET_LENGTH(sub);
10590
10591    switch (kind) {
10592    case PyUnicode_1BYTE_KIND:
10593        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10594        break;
10595    case PyUnicode_2BYTE_KIND:
10596        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10597        break;
10598    case PyUnicode_4BYTE_KIND:
10599        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10600        break;
10601    default:
10602        result = -1;
10603        assert(0);
10604    }
10605
10606    Py_DECREF(str);
10607    Py_DECREF(sub);
10608
10609    if (kind2 != kind)
10610        PyMem_Free(buf2);
10611
10612    return result;
10613}
10614
10615/* Concat to string or Unicode object giving a new Unicode object. */
10616
10617PyObject *
10618PyUnicode_Concat(PyObject *left, PyObject *right)
10619{
10620    PyObject *u = NULL, *v = NULL, *w;
10621    Py_UCS4 maxchar, maxchar2;
10622    Py_ssize_t u_len, v_len, new_len;
10623
10624    /* Coerce the two arguments */
10625    u = PyUnicode_FromObject(left);
10626    if (u == NULL)
10627        goto onError;
10628    v = PyUnicode_FromObject(right);
10629    if (v == NULL)
10630        goto onError;
10631
10632    /* Shortcuts */
10633    if (v == unicode_empty) {
10634        Py_DECREF(v);
10635        return u;
10636    }
10637    if (u == unicode_empty) {
10638        Py_DECREF(u);
10639        return v;
10640    }
10641
10642    u_len = PyUnicode_GET_LENGTH(u);
10643    v_len = PyUnicode_GET_LENGTH(v);
10644    if (u_len > PY_SSIZE_T_MAX - v_len) {
10645        PyErr_SetString(PyExc_OverflowError,
10646                        "strings are too large to concat");
10647        goto onError;
10648    }
10649    new_len = u_len + v_len;
10650
10651    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10652    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10653    maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10654
10655    /* Concat the two Unicode strings */
10656    w = PyUnicode_New(new_len, maxchar);
10657    if (w == NULL)
10658        goto onError;
10659    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10660    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
10661    Py_DECREF(u);
10662    Py_DECREF(v);
10663    assert(_PyUnicode_CheckConsistency(w, 1));
10664    return w;
10665
10666  onError:
10667    Py_XDECREF(u);
10668    Py_XDECREF(v);
10669    return NULL;
10670}
10671
10672void
10673PyUnicode_Append(PyObject **p_left, PyObject *right)
10674{
10675    PyObject *left, *res;
10676    Py_UCS4 maxchar, maxchar2;
10677    Py_ssize_t left_len, right_len, new_len;
10678
10679    if (p_left == NULL) {
10680        if (!PyErr_Occurred())
10681            PyErr_BadInternalCall();
10682        return;
10683    }
10684    left = *p_left;
10685    if (right == NULL || !PyUnicode_Check(left)) {
10686        if (!PyErr_Occurred())
10687            PyErr_BadInternalCall();
10688        goto error;
10689    }
10690
10691    if (PyUnicode_READY(left) == -1)
10692        goto error;
10693    if (PyUnicode_READY(right) == -1)
10694        goto error;
10695
10696    /* Shortcuts */
10697    if (left == unicode_empty) {
10698        Py_DECREF(left);
10699        Py_INCREF(right);
10700        *p_left = right;
10701        return;
10702    }
10703    if (right == unicode_empty)
10704        return;
10705
10706    left_len = PyUnicode_GET_LENGTH(left);
10707    right_len = PyUnicode_GET_LENGTH(right);
10708    if (left_len > PY_SSIZE_T_MAX - right_len) {
10709        PyErr_SetString(PyExc_OverflowError,
10710                        "strings are too large to concat");
10711        goto error;
10712    }
10713    new_len = left_len + right_len;
10714
10715    if (unicode_modifiable(left)
10716        && PyUnicode_CheckExact(right)
10717        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
10718        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10719           to change the structure size, but characters are stored just after
10720           the structure, and so it requires to move all characters which is
10721           not so different than duplicating the string. */
10722        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10723    {
10724        /* append inplace */
10725        if (unicode_resize(p_left, new_len) != 0) {
10726            /* XXX if _PyUnicode_Resize() fails, 'left' has been
10727             * deallocated so it cannot be put back into
10728             * 'variable'.  The MemoryError is raised when there
10729             * is no value in 'variable', which might (very
10730             * remotely) be a cause of incompatibilities.
10731             */
10732            goto error;
10733        }
10734        /* copy 'right' into the newly allocated area of 'left' */
10735        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
10736    }
10737    else {
10738        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10739        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10740        maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10741
10742        /* Concat the two Unicode strings */
10743        res = PyUnicode_New(new_len, maxchar);
10744        if (res == NULL)
10745            goto error;
10746        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10747        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
10748        Py_DECREF(left);
10749        *p_left = res;
10750    }
10751    assert(_PyUnicode_CheckConsistency(*p_left, 1));
10752    return;
10753
10754error:
10755    Py_CLEAR(*p_left);
10756}
10757
10758void
10759PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10760{
10761    PyUnicode_Append(pleft, right);
10762    Py_XDECREF(right);
10763}
10764
10765PyDoc_STRVAR(count__doc__,
10766             "S.count(sub[, start[, end]]) -> int\n\
10767\n\
10768Return the number of non-overlapping occurrences of substring sub in\n\
10769string S[start:end].  Optional arguments start and end are\n\
10770interpreted as in slice notation.");
10771
10772static PyObject *
10773unicode_count(PyObject *self, PyObject *args)
10774{
10775    PyObject *substring;
10776    Py_ssize_t start = 0;
10777    Py_ssize_t end = PY_SSIZE_T_MAX;
10778    PyObject *result;
10779    int kind1, kind2, kind;
10780    void *buf1, *buf2;
10781    Py_ssize_t len1, len2, iresult;
10782
10783    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10784                                            &start, &end))
10785        return NULL;
10786
10787    kind1 = PyUnicode_KIND(self);
10788    kind2 = PyUnicode_KIND(substring);
10789    if (kind2 > kind1)
10790        return PyLong_FromLong(0);
10791    kind = kind1;
10792    buf1 = PyUnicode_DATA(self);
10793    buf2 = PyUnicode_DATA(substring);
10794    if (kind2 != kind)
10795        buf2 = _PyUnicode_AsKind(substring, kind);
10796    if (!buf2) {
10797        Py_DECREF(substring);
10798        return NULL;
10799    }
10800    len1 = PyUnicode_GET_LENGTH(self);
10801    len2 = PyUnicode_GET_LENGTH(substring);
10802
10803    ADJUST_INDICES(start, end, len1);
10804    switch (kind) {
10805    case PyUnicode_1BYTE_KIND:
10806        iresult = ucs1lib_count(
10807            ((Py_UCS1*)buf1) + start, end - start,
10808            buf2, len2, PY_SSIZE_T_MAX
10809            );
10810        break;
10811    case PyUnicode_2BYTE_KIND:
10812        iresult = ucs2lib_count(
10813            ((Py_UCS2*)buf1) + start, end - start,
10814            buf2, len2, PY_SSIZE_T_MAX
10815            );
10816        break;
10817    case PyUnicode_4BYTE_KIND:
10818        iresult = ucs4lib_count(
10819            ((Py_UCS4*)buf1) + start, end - start,
10820            buf2, len2, PY_SSIZE_T_MAX
10821            );
10822        break;
10823    default:
10824        assert(0); iresult = 0;
10825    }
10826
10827    result = PyLong_FromSsize_t(iresult);
10828
10829    if (kind2 != kind)
10830        PyMem_Free(buf2);
10831
10832    Py_DECREF(substring);
10833
10834    return result;
10835}
10836
10837PyDoc_STRVAR(encode__doc__,
10838             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10839\n\
10840Encode S using the codec registered for encoding. Default encoding\n\
10841is 'utf-8'. errors may be given to set a different error\n\
10842handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10843a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10844'xmlcharrefreplace' as well as any other name registered with\n\
10845codecs.register_error that can handle UnicodeEncodeErrors.");
10846
10847static PyObject *
10848unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10849{
10850    static char *kwlist[] = {"encoding", "errors", 0};
10851    char *encoding = NULL;
10852    char *errors = NULL;
10853
10854    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10855                                     kwlist, &encoding, &errors))
10856        return NULL;
10857    return PyUnicode_AsEncodedString(self, encoding, errors);
10858}
10859
10860PyDoc_STRVAR(expandtabs__doc__,
10861             "S.expandtabs([tabsize]) -> str\n\
10862\n\
10863Return a copy of S where all tab characters are expanded using spaces.\n\
10864If tabsize is not given, a tab size of 8 characters is assumed.");
10865
10866static PyObject*
10867unicode_expandtabs(PyObject *self, PyObject *args)
10868{
10869    Py_ssize_t i, j, line_pos, src_len, incr;
10870    Py_UCS4 ch;
10871    PyObject *u;
10872    void *src_data, *dest_data;
10873    int tabsize = 8;
10874    int kind;
10875    int found;
10876
10877    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10878        return NULL;
10879
10880    if (PyUnicode_READY(self) == -1)
10881        return NULL;
10882
10883    /* First pass: determine size of output string */
10884    src_len = PyUnicode_GET_LENGTH(self);
10885    i = j = line_pos = 0;
10886    kind = PyUnicode_KIND(self);
10887    src_data = PyUnicode_DATA(self);
10888    found = 0;
10889    for (; i < src_len; i++) {
10890        ch = PyUnicode_READ(kind, src_data, i);
10891        if (ch == '\t') {
10892            found = 1;
10893            if (tabsize > 0) {
10894                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10895                if (j > PY_SSIZE_T_MAX - incr)
10896                    goto overflow;
10897                line_pos += incr;
10898                j += incr;
10899            }
10900        }
10901        else {
10902            if (j > PY_SSIZE_T_MAX - 1)
10903                goto overflow;
10904            line_pos++;
10905            j++;
10906            if (ch == '\n' || ch == '\r')
10907                line_pos = 0;
10908        }
10909    }
10910    if (!found)
10911        return unicode_result_unchanged(self);
10912
10913    /* Second pass: create output string and fill it */
10914    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10915    if (!u)
10916        return NULL;
10917    dest_data = PyUnicode_DATA(u);
10918
10919    i = j = line_pos = 0;
10920
10921    for (; i < src_len; i++) {
10922        ch = PyUnicode_READ(kind, src_data, i);
10923        if (ch == '\t') {
10924            if (tabsize > 0) {
10925                incr = tabsize - (line_pos % tabsize);
10926                line_pos += incr;
10927                FILL(kind, dest_data, ' ', j, incr);
10928                j += incr;
10929            }
10930        }
10931        else {
10932            line_pos++;
10933            PyUnicode_WRITE(kind, dest_data, j, ch);
10934            j++;
10935            if (ch == '\n' || ch == '\r')
10936                line_pos = 0;
10937        }
10938    }
10939    assert (j == PyUnicode_GET_LENGTH(u));
10940    return unicode_result(u);
10941
10942  overflow:
10943    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10944    return NULL;
10945}
10946
10947PyDoc_STRVAR(find__doc__,
10948             "S.find(sub[, start[, end]]) -> int\n\
10949\n\
10950Return the lowest index in S where substring sub is found,\n\
10951such that sub is contained within S[start:end].  Optional\n\
10952arguments start and end are interpreted as in slice notation.\n\
10953\n\
10954Return -1 on failure.");
10955
10956static PyObject *
10957unicode_find(PyObject *self, PyObject *args)
10958{
10959    PyObject *substring;
10960    Py_ssize_t start;
10961    Py_ssize_t end;
10962    Py_ssize_t result;
10963
10964    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10965                                            &start, &end))
10966        return NULL;
10967
10968    if (PyUnicode_READY(self) == -1)
10969        return NULL;
10970    if (PyUnicode_READY(substring) == -1)
10971        return NULL;
10972
10973    result = any_find_slice(1, self, substring, start, end);
10974
10975    Py_DECREF(substring);
10976
10977    if (result == -2)
10978        return NULL;
10979
10980    return PyLong_FromSsize_t(result);
10981}
10982
10983static PyObject *
10984unicode_getitem(PyObject *self, Py_ssize_t index)
10985{
10986    void *data;
10987    enum PyUnicode_Kind kind;
10988    Py_UCS4 ch;
10989    PyObject *res;
10990
10991    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10992        PyErr_BadArgument();
10993        return NULL;
10994    }
10995    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10996        PyErr_SetString(PyExc_IndexError, "string index out of range");
10997        return NULL;
10998    }
10999    kind = PyUnicode_KIND(self);
11000    data = PyUnicode_DATA(self);
11001    ch = PyUnicode_READ(kind, data, index);
11002    if (ch < 256)
11003        return get_latin1_char(ch);
11004
11005    res = PyUnicode_New(1, ch);
11006    if (res == NULL)
11007        return NULL;
11008    kind = PyUnicode_KIND(res);
11009    data = PyUnicode_DATA(res);
11010    PyUnicode_WRITE(kind, data, 0, ch);
11011    assert(_PyUnicode_CheckConsistency(res, 1));
11012    return res;
11013}
11014
/* Compute the hash of a str object and store it in _PyUnicode_HASH(self).

   A stored value other than -1 is returned directly without recomputing.
   Returns -1 if PyUnicode_READY() fails; a computed hash is never -1
   (it is remapped to -2 below).

   Believe it or not, this produces the same value for ASCII strings
   as bytes_hash(). */
static Py_hash_t
unicode_hash(PyObject *self)
{
    Py_ssize_t len;
    Py_uhash_t x;

#ifdef Py_DEBUG
    assert(_Py_HashSecret_Initialized);
#endif
    /* -1 in the hash slot means "not computed yet" */
    if (_PyUnicode_HASH(self) != -1)
        return _PyUnicode_HASH(self);
    if (PyUnicode_READY(self) == -1)
        return -1;
    len = PyUnicode_GET_LENGTH(self);
    /*
      We make the hash of the empty string be 0, rather than using
      (prefix ^ suffix), since this slightly obfuscates the hash secret
    */
    if (len == 0) {
        _PyUnicode_HASH(self) = 0;
        return 0;
    }

    /* The hash function as a macro, gets expanded three times below.
       It mixes in the first character shifted left by 7, then folds every
       character into x with a multiply-and-xor step.  Note that it
       consumes `len` (counted down past zero) and advances the pointer P,
       so neither holds its starting value afterwards. */
#define HASH(P)                                            \
    x ^= (Py_uhash_t) *P << 7;                             \
    while (--len >= 0)                                     \
        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;  \

    x = (Py_uhash_t) _Py_HashSecret.prefix;
    /* one expansion per storage width, so *P reads whole characters */
    switch (PyUnicode_KIND(self)) {
    case PyUnicode_1BYTE_KIND: {
        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
        HASH(c);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
        HASH(s);
        break;
    }
    default: {
        Py_UCS4 *l;
        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
               "Impossible switch case in unicode_hash");
        l = PyUnicode_4BYTE_DATA(self);
        HASH(l);
        break;
    }
    }
    /* `len` was consumed by HASH, so the length is read again here */
    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
    x ^= (Py_uhash_t) _Py_HashSecret.suffix;

    /* -1 doubles as the error return and the "not computed" marker, so a
       genuine hash of -1 is remapped to -2 */
    if (x == -1)
        x = -2;
    _PyUnicode_HASH(self) = x;
    return x;
}
#undef HASH
11076
11077PyDoc_STRVAR(index__doc__,
11078             "S.index(sub[, start[, end]]) -> int\n\
11079\n\
11080Like S.find() but raise ValueError when the substring is not found.");
11081
11082static PyObject *
11083unicode_index(PyObject *self, PyObject *args)
11084{
11085    Py_ssize_t result;
11086    PyObject *substring;
11087    Py_ssize_t start;
11088    Py_ssize_t end;
11089
11090    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11091                                            &start, &end))
11092        return NULL;
11093
11094    if (PyUnicode_READY(self) == -1)
11095        return NULL;
11096    if (PyUnicode_READY(substring) == -1)
11097        return NULL;
11098
11099    result = any_find_slice(1, self, substring, start, end);
11100
11101    Py_DECREF(substring);
11102
11103    if (result == -2)
11104        return NULL;
11105
11106    if (result < 0) {
11107        PyErr_SetString(PyExc_ValueError, "substring not found");
11108        return NULL;
11109    }
11110
11111    return PyLong_FromSsize_t(result);
11112}
11113
11114PyDoc_STRVAR(islower__doc__,
11115             "S.islower() -> bool\n\
11116\n\
11117Return True if all cased characters in S are lowercase and there is\n\
11118at least one cased character in S, False otherwise.");
11119
11120static PyObject*
11121unicode_islower(PyObject *self)
11122{
11123    Py_ssize_t i, length;
11124    int kind;
11125    void *data;
11126    int cased;
11127
11128    if (PyUnicode_READY(self) == -1)
11129        return NULL;
11130    length = PyUnicode_GET_LENGTH(self);
11131    kind = PyUnicode_KIND(self);
11132    data = PyUnicode_DATA(self);
11133
11134    /* Shortcut for single character strings */
11135    if (length == 1)
11136        return PyBool_FromLong(
11137            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11138
11139    /* Special case for empty strings */
11140    if (length == 0)
11141        return PyBool_FromLong(0);
11142
11143    cased = 0;
11144    for (i = 0; i < length; i++) {
11145        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11146
11147        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11148            return PyBool_FromLong(0);
11149        else if (!cased && Py_UNICODE_ISLOWER(ch))
11150            cased = 1;
11151    }
11152    return PyBool_FromLong(cased);
11153}
11154
11155PyDoc_STRVAR(isupper__doc__,
11156             "S.isupper() -> bool\n\
11157\n\
11158Return True if all cased characters in S are uppercase and there is\n\
11159at least one cased character in S, False otherwise.");
11160
11161static PyObject*
11162unicode_isupper(PyObject *self)
11163{
11164    Py_ssize_t i, length;
11165    int kind;
11166    void *data;
11167    int cased;
11168
11169    if (PyUnicode_READY(self) == -1)
11170        return NULL;
11171    length = PyUnicode_GET_LENGTH(self);
11172    kind = PyUnicode_KIND(self);
11173    data = PyUnicode_DATA(self);
11174
11175    /* Shortcut for single character strings */
11176    if (length == 1)
11177        return PyBool_FromLong(
11178            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11179
11180    /* Special case for empty strings */
11181    if (length == 0)
11182        return PyBool_FromLong(0);
11183
11184    cased = 0;
11185    for (i = 0; i < length; i++) {
11186        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11187
11188        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11189            return PyBool_FromLong(0);
11190        else if (!cased && Py_UNICODE_ISUPPER(ch))
11191            cased = 1;
11192    }
11193    return PyBool_FromLong(cased);
11194}
11195
11196PyDoc_STRVAR(istitle__doc__,
11197             "S.istitle() -> bool\n\
11198\n\
11199Return True if S is a titlecased string and there is at least one\n\
11200character in S, i.e. upper- and titlecase characters may only\n\
11201follow uncased characters and lowercase characters only cased ones.\n\
11202Return False otherwise.");
11203
11204static PyObject*
11205unicode_istitle(PyObject *self)
11206{
11207    Py_ssize_t i, length;
11208    int kind;
11209    void *data;
11210    int cased, previous_is_cased;
11211
11212    if (PyUnicode_READY(self) == -1)
11213        return NULL;
11214    length = PyUnicode_GET_LENGTH(self);
11215    kind = PyUnicode_KIND(self);
11216    data = PyUnicode_DATA(self);
11217
11218    /* Shortcut for single character strings */
11219    if (length == 1) {
11220        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11221        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11222                               (Py_UNICODE_ISUPPER(ch) != 0));
11223    }
11224
11225    /* Special case for empty strings */
11226    if (length == 0)
11227        return PyBool_FromLong(0);
11228
11229    cased = 0;
11230    previous_is_cased = 0;
11231    for (i = 0; i < length; i++) {
11232        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11233
11234        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11235            if (previous_is_cased)
11236                return PyBool_FromLong(0);
11237            previous_is_cased = 1;
11238            cased = 1;
11239        }
11240        else if (Py_UNICODE_ISLOWER(ch)) {
11241            if (!previous_is_cased)
11242                return PyBool_FromLong(0);
11243            previous_is_cased = 1;
11244            cased = 1;
11245        }
11246        else
11247            previous_is_cased = 0;
11248    }
11249    return PyBool_FromLong(cased);
11250}
11251
11252PyDoc_STRVAR(isspace__doc__,
11253             "S.isspace() -> bool\n\
11254\n\
11255Return True if all characters in S are whitespace\n\
11256and there is at least one character in S, False otherwise.");
11257
11258static PyObject*
11259unicode_isspace(PyObject *self)
11260{
11261    Py_ssize_t i, length;
11262    int kind;
11263    void *data;
11264
11265    if (PyUnicode_READY(self) == -1)
11266        return NULL;
11267    length = PyUnicode_GET_LENGTH(self);
11268    kind = PyUnicode_KIND(self);
11269    data = PyUnicode_DATA(self);
11270
11271    /* Shortcut for single character strings */
11272    if (length == 1)
11273        return PyBool_FromLong(
11274            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11275
11276    /* Special case for empty strings */
11277    if (length == 0)
11278        return PyBool_FromLong(0);
11279
11280    for (i = 0; i < length; i++) {
11281        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11282        if (!Py_UNICODE_ISSPACE(ch))
11283            return PyBool_FromLong(0);
11284    }
11285    return PyBool_FromLong(1);
11286}
11287
11288PyDoc_STRVAR(isalpha__doc__,
11289             "S.isalpha() -> bool\n\
11290\n\
11291Return True if all characters in S are alphabetic\n\
11292and there is at least one character in S, False otherwise.");
11293
11294static PyObject*
11295unicode_isalpha(PyObject *self)
11296{
11297    Py_ssize_t i, length;
11298    int kind;
11299    void *data;
11300
11301    if (PyUnicode_READY(self) == -1)
11302        return NULL;
11303    length = PyUnicode_GET_LENGTH(self);
11304    kind = PyUnicode_KIND(self);
11305    data = PyUnicode_DATA(self);
11306
11307    /* Shortcut for single character strings */
11308    if (length == 1)
11309        return PyBool_FromLong(
11310            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11311
11312    /* Special case for empty strings */
11313    if (length == 0)
11314        return PyBool_FromLong(0);
11315
11316    for (i = 0; i < length; i++) {
11317        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11318            return PyBool_FromLong(0);
11319    }
11320    return PyBool_FromLong(1);
11321}
11322
11323PyDoc_STRVAR(isalnum__doc__,
11324             "S.isalnum() -> bool\n\
11325\n\
11326Return True if all characters in S are alphanumeric\n\
11327and there is at least one character in S, False otherwise.");
11328
11329static PyObject*
11330unicode_isalnum(PyObject *self)
11331{
11332    int kind;
11333    void *data;
11334    Py_ssize_t len, i;
11335
11336    if (PyUnicode_READY(self) == -1)
11337        return NULL;
11338
11339    kind = PyUnicode_KIND(self);
11340    data = PyUnicode_DATA(self);
11341    len = PyUnicode_GET_LENGTH(self);
11342
11343    /* Shortcut for single character strings */
11344    if (len == 1) {
11345        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11346        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11347    }
11348
11349    /* Special case for empty strings */
11350    if (len == 0)
11351        return PyBool_FromLong(0);
11352
11353    for (i = 0; i < len; i++) {
11354        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11355        if (!Py_UNICODE_ISALNUM(ch))
11356            return PyBool_FromLong(0);
11357    }
11358    return PyBool_FromLong(1);
11359}
11360
11361PyDoc_STRVAR(isdecimal__doc__,
11362             "S.isdecimal() -> bool\n\
11363\n\
11364Return True if there are only decimal characters in S,\n\
11365False otherwise.");
11366
11367static PyObject*
11368unicode_isdecimal(PyObject *self)
11369{
11370    Py_ssize_t i, length;
11371    int kind;
11372    void *data;
11373
11374    if (PyUnicode_READY(self) == -1)
11375        return NULL;
11376    length = PyUnicode_GET_LENGTH(self);
11377    kind = PyUnicode_KIND(self);
11378    data = PyUnicode_DATA(self);
11379
11380    /* Shortcut for single character strings */
11381    if (length == 1)
11382        return PyBool_FromLong(
11383            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11384
11385    /* Special case for empty strings */
11386    if (length == 0)
11387        return PyBool_FromLong(0);
11388
11389    for (i = 0; i < length; i++) {
11390        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11391            return PyBool_FromLong(0);
11392    }
11393    return PyBool_FromLong(1);
11394}
11395
11396PyDoc_STRVAR(isdigit__doc__,
11397             "S.isdigit() -> bool\n\
11398\n\
11399Return True if all characters in S are digits\n\
11400and there is at least one character in S, False otherwise.");
11401
11402static PyObject*
11403unicode_isdigit(PyObject *self)
11404{
11405    Py_ssize_t i, length;
11406    int kind;
11407    void *data;
11408
11409    if (PyUnicode_READY(self) == -1)
11410        return NULL;
11411    length = PyUnicode_GET_LENGTH(self);
11412    kind = PyUnicode_KIND(self);
11413    data = PyUnicode_DATA(self);
11414
11415    /* Shortcut for single character strings */
11416    if (length == 1) {
11417        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11418        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11419    }
11420
11421    /* Special case for empty strings */
11422    if (length == 0)
11423        return PyBool_FromLong(0);
11424
11425    for (i = 0; i < length; i++) {
11426        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11427            return PyBool_FromLong(0);
11428    }
11429    return PyBool_FromLong(1);
11430}
11431
11432PyDoc_STRVAR(isnumeric__doc__,
11433             "S.isnumeric() -> bool\n\
11434\n\
11435Return True if there are only numeric characters in S,\n\
11436False otherwise.");
11437
11438static PyObject*
11439unicode_isnumeric(PyObject *self)
11440{
11441    Py_ssize_t i, length;
11442    int kind;
11443    void *data;
11444
11445    if (PyUnicode_READY(self) == -1)
11446        return NULL;
11447    length = PyUnicode_GET_LENGTH(self);
11448    kind = PyUnicode_KIND(self);
11449    data = PyUnicode_DATA(self);
11450
11451    /* Shortcut for single character strings */
11452    if (length == 1)
11453        return PyBool_FromLong(
11454            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11455
11456    /* Special case for empty strings */
11457    if (length == 0)
11458        return PyBool_FromLong(0);
11459
11460    for (i = 0; i < length; i++) {
11461        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11462            return PyBool_FromLong(0);
11463    }
11464    return PyBool_FromLong(1);
11465}
11466
11467int
11468PyUnicode_IsIdentifier(PyObject *self)
11469{
11470    int kind;
11471    void *data;
11472    Py_ssize_t i;
11473    Py_UCS4 first;
11474
11475    if (PyUnicode_READY(self) == -1) {
11476        Py_FatalError("identifier not ready");
11477        return 0;
11478    }
11479
11480    /* Special case for empty strings */
11481    if (PyUnicode_GET_LENGTH(self) == 0)
11482        return 0;
11483    kind = PyUnicode_KIND(self);
11484    data = PyUnicode_DATA(self);
11485
11486    /* PEP 3131 says that the first character must be in
11487       XID_Start and subsequent characters in XID_Continue,
11488       and for the ASCII range, the 2.x rules apply (i.e
11489       start with letters and underscore, continue with
11490       letters, digits, underscore). However, given the current
11491       definition of XID_Start and XID_Continue, it is sufficient
11492       to check just for these, except that _ must be allowed
11493       as starting an identifier.  */
11494    first = PyUnicode_READ(kind, data, 0);
11495    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11496        return 0;
11497
11498    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11499        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11500            return 0;
11501    return 1;
11502}
11503
11504PyDoc_STRVAR(isidentifier__doc__,
11505             "S.isidentifier() -> bool\n\
11506\n\
11507Return True if S is a valid identifier according\n\
11508to the language definition.");
11509
11510static PyObject*
11511unicode_isidentifier(PyObject *self)
11512{
11513    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11514}
11515
11516PyDoc_STRVAR(isprintable__doc__,
11517             "S.isprintable() -> bool\n\
11518\n\
11519Return True if all characters in S are considered\n\
11520printable in repr() or S is empty, False otherwise.");
11521
11522static PyObject*
11523unicode_isprintable(PyObject *self)
11524{
11525    Py_ssize_t i, length;
11526    int kind;
11527    void *data;
11528
11529    if (PyUnicode_READY(self) == -1)
11530        return NULL;
11531    length = PyUnicode_GET_LENGTH(self);
11532    kind = PyUnicode_KIND(self);
11533    data = PyUnicode_DATA(self);
11534
11535    /* Shortcut for single character strings */
11536    if (length == 1)
11537        return PyBool_FromLong(
11538            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11539
11540    for (i = 0; i < length; i++) {
11541        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11542            Py_RETURN_FALSE;
11543        }
11544    }
11545    Py_RETURN_TRUE;
11546}
11547
11548PyDoc_STRVAR(join__doc__,
11549             "S.join(iterable) -> str\n\
11550\n\
11551Return a string which is the concatenation of the strings in the\n\
11552iterable.  The separator between elements is S.");
11553
11554static PyObject*
11555unicode_join(PyObject *self, PyObject *data)
11556{
11557    return PyUnicode_Join(self, data);
11558}
11559
11560static Py_ssize_t
11561unicode_length(PyObject *self)
11562{
11563    if (PyUnicode_READY(self) == -1)
11564        return -1;
11565    return PyUnicode_GET_LENGTH(self);
11566}
11567
11568PyDoc_STRVAR(ljust__doc__,
11569             "S.ljust(width[, fillchar]) -> str\n\
11570\n\
11571Return S left-justified in a Unicode string of length width. Padding is\n\
11572done using the specified fill character (default is a space).");
11573
11574static PyObject *
11575unicode_ljust(PyObject *self, PyObject *args)
11576{
11577    Py_ssize_t width;
11578    Py_UCS4 fillchar = ' ';
11579
11580    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11581        return NULL;
11582
11583    if (PyUnicode_READY(self) == -1)
11584        return NULL;
11585
11586    if (PyUnicode_GET_LENGTH(self) >= width)
11587        return unicode_result_unchanged(self);
11588
11589    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11590}
11591
11592PyDoc_STRVAR(lower__doc__,
11593             "S.lower() -> str\n\
11594\n\
11595Return a copy of the string S converted to lowercase.");
11596
11597static PyObject*
11598unicode_lower(PyObject *self)
11599{
11600    if (PyUnicode_READY(self) == -1)
11601        return NULL;
11602    if (PyUnicode_IS_ASCII(self))
11603        return ascii_upper_or_lower(self, 1);
11604    return case_operation(self, do_lower);
11605}
11606
/* Selector values telling the strip helpers which end(s) of the string
   to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Argument format strings handed to PyArg_ParseTuple(), one per
   selector value. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11615
11616/* externally visible for str.strip(unicode) */
11617PyObject *
11618_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11619{
11620    void *data;
11621    int kind;
11622    Py_ssize_t i, j, len;
11623    BLOOM_MASK sepmask;
11624
11625    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11626        return NULL;
11627
11628    kind = PyUnicode_KIND(self);
11629    data = PyUnicode_DATA(self);
11630    len = PyUnicode_GET_LENGTH(self);
11631    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11632                              PyUnicode_DATA(sepobj),
11633                              PyUnicode_GET_LENGTH(sepobj));
11634
11635    i = 0;
11636    if (striptype != RIGHTSTRIP) {
11637        while (i < len &&
11638               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11639            i++;
11640        }
11641    }
11642
11643    j = len;
11644    if (striptype != LEFTSTRIP) {
11645        do {
11646            j--;
11647        } while (j >= i &&
11648                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11649        j++;
11650    }
11651
11652    return PyUnicode_Substring(self, i, j);
11653}
11654
11655PyObject*
11656PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11657{
11658    unsigned char *data;
11659    int kind;
11660    Py_ssize_t length;
11661
11662    if (PyUnicode_READY(self) == -1)
11663        return NULL;
11664
11665    length = PyUnicode_GET_LENGTH(self);
11666    end = Py_MIN(end, length);
11667
11668    if (start == 0 && end == length)
11669        return unicode_result_unchanged(self);
11670
11671    if (start < 0 || end < 0) {
11672        PyErr_SetString(PyExc_IndexError, "string index out of range");
11673        return NULL;
11674    }
11675    if (start >= length || end < start) {
11676        Py_INCREF(unicode_empty);
11677        return unicode_empty;
11678    }
11679
11680    length = end - start;
11681    if (PyUnicode_IS_ASCII(self)) {
11682        data = PyUnicode_1BYTE_DATA(self);
11683        return _PyUnicode_FromASCII((char*)(data + start), length);
11684    }
11685    else {
11686        kind = PyUnicode_KIND(self);
11687        data = PyUnicode_1BYTE_DATA(self);
11688        return PyUnicode_FromKindAndData(kind,
11689                                         data + kind * start,
11690                                         length);
11691    }
11692}
11693
11694static PyObject *
11695do_strip(PyObject *self, int striptype)
11696{
11697    int kind;
11698    void *data;
11699    Py_ssize_t len, i, j;
11700
11701    if (PyUnicode_READY(self) == -1)
11702        return NULL;
11703
11704    kind = PyUnicode_KIND(self);
11705    data = PyUnicode_DATA(self);
11706    len = PyUnicode_GET_LENGTH(self);
11707
11708    i = 0;
11709    if (striptype != RIGHTSTRIP) {
11710        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11711            i++;
11712        }
11713    }
11714
11715    j = len;
11716    if (striptype != LEFTSTRIP) {
11717        do {
11718            j--;
11719        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11720        j++;
11721    }
11722
11723    return PyUnicode_Substring(self, i, j);
11724}
11725
11726
11727static PyObject *
11728do_argstrip(PyObject *self, int striptype, PyObject *args)
11729{
11730    PyObject *sep = NULL;
11731
11732    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11733        return NULL;
11734
11735    if (sep != NULL && sep != Py_None) {
11736        if (PyUnicode_Check(sep))
11737            return _PyUnicode_XStrip(self, striptype, sep);
11738        else {
11739            PyErr_Format(PyExc_TypeError,
11740                         "%s arg must be None or str",
11741                         STRIPNAME(striptype));
11742            return NULL;
11743        }
11744    }
11745
11746    return do_strip(self, striptype);
11747}
11748
11749
11750PyDoc_STRVAR(strip__doc__,
11751             "S.strip([chars]) -> str\n\
11752\n\
11753Return a copy of the string S with leading and trailing\n\
11754whitespace removed.\n\
11755If chars is given and not None, remove characters in chars instead.");
11756
11757static PyObject *
11758unicode_strip(PyObject *self, PyObject *args)
11759{
11760    if (PyTuple_GET_SIZE(args) == 0)
11761        return do_strip(self, BOTHSTRIP); /* Common case */
11762    else
11763        return do_argstrip(self, BOTHSTRIP, args);
11764}
11765
11766
11767PyDoc_STRVAR(lstrip__doc__,
11768             "S.lstrip([chars]) -> str\n\
11769\n\
11770Return a copy of the string S with leading whitespace removed.\n\
11771If chars is given and not None, remove characters in chars instead.");
11772
11773static PyObject *
11774unicode_lstrip(PyObject *self, PyObject *args)
11775{
11776    if (PyTuple_GET_SIZE(args) == 0)
11777        return do_strip(self, LEFTSTRIP); /* Common case */
11778    else
11779        return do_argstrip(self, LEFTSTRIP, args);
11780}
11781
11782
11783PyDoc_STRVAR(rstrip__doc__,
11784             "S.rstrip([chars]) -> str\n\
11785\n\
11786Return a copy of the string S with trailing whitespace removed.\n\
11787If chars is given and not None, remove characters in chars instead.");
11788
11789static PyObject *
11790unicode_rstrip(PyObject *self, PyObject *args)
11791{
11792    if (PyTuple_GET_SIZE(args) == 0)
11793        return do_strip(self, RIGHTSTRIP); /* Common case */
11794    else
11795        return do_argstrip(self, RIGHTSTRIP, args);
11796}
11797
11798
11799static PyObject*
11800unicode_repeat(PyObject *str, Py_ssize_t len)
11801{
11802    PyObject *u;
11803    Py_ssize_t nchars, n;
11804
11805    if (len < 1) {
11806        Py_INCREF(unicode_empty);
11807        return unicode_empty;
11808    }
11809
11810    /* no repeat, return original string */
11811    if (len == 1)
11812        return unicode_result_unchanged(str);
11813
11814    if (PyUnicode_READY(str) == -1)
11815        return NULL;
11816
11817    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11818        PyErr_SetString(PyExc_OverflowError,
11819                        "repeated string is too long");
11820        return NULL;
11821    }
11822    nchars = len * PyUnicode_GET_LENGTH(str);
11823
11824    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11825    if (!u)
11826        return NULL;
11827    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11828
11829    if (PyUnicode_GET_LENGTH(str) == 1) {
11830        const int kind = PyUnicode_KIND(str);
11831        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11832        if (kind == PyUnicode_1BYTE_KIND) {
11833            void *to = PyUnicode_DATA(u);
11834            memset(to, (unsigned char)fill_char, len);
11835        }
11836        else if (kind == PyUnicode_2BYTE_KIND) {
11837            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
11838            for (n = 0; n < len; ++n)
11839                ucs2[n] = fill_char;
11840        } else {
11841            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11842            assert(kind == PyUnicode_4BYTE_KIND);
11843            for (n = 0; n < len; ++n)
11844                ucs4[n] = fill_char;
11845        }
11846    }
11847    else {
11848        /* number of characters copied this far */
11849        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11850        const Py_ssize_t char_size = PyUnicode_KIND(str);
11851        char *to = (char *) PyUnicode_DATA(u);
11852        Py_MEMCPY(to, PyUnicode_DATA(str),
11853                  PyUnicode_GET_LENGTH(str) * char_size);
11854        while (done < nchars) {
11855            n = (done <= nchars-done) ? done : nchars-done;
11856            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11857            done += n;
11858        }
11859    }
11860
11861    assert(_PyUnicode_CheckConsistency(u, 1));
11862    return u;
11863}
11864
11865PyObject *
11866PyUnicode_Replace(PyObject *obj,
11867                  PyObject *subobj,
11868                  PyObject *replobj,
11869                  Py_ssize_t maxcount)
11870{
11871    PyObject *self;
11872    PyObject *str1;
11873    PyObject *str2;
11874    PyObject *result;
11875
11876    self = PyUnicode_FromObject(obj);
11877    if (self == NULL)
11878        return NULL;
11879    str1 = PyUnicode_FromObject(subobj);
11880    if (str1 == NULL) {
11881        Py_DECREF(self);
11882        return NULL;
11883    }
11884    str2 = PyUnicode_FromObject(replobj);
11885    if (str2 == NULL) {
11886        Py_DECREF(self);
11887        Py_DECREF(str1);
11888        return NULL;
11889    }
11890    if (PyUnicode_READY(self) == -1 ||
11891        PyUnicode_READY(str1) == -1 ||
11892        PyUnicode_READY(str2) == -1)
11893        result = NULL;
11894    else
11895        result = replace(self, str1, str2, maxcount);
11896    Py_DECREF(self);
11897    Py_DECREF(str1);
11898    Py_DECREF(str2);
11899    return result;
11900}
11901
11902PyDoc_STRVAR(replace__doc__,
11903             "S.replace(old, new[, count]) -> str\n\
11904\n\
11905Return a copy of S with all occurrences of substring\n\
11906old replaced by new.  If the optional argument count is\n\
11907given, only the first count occurrences are replaced.");
11908
11909static PyObject*
11910unicode_replace(PyObject *self, PyObject *args)
11911{
11912    PyObject *str1;
11913    PyObject *str2;
11914    Py_ssize_t maxcount = -1;
11915    PyObject *result;
11916
11917    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11918        return NULL;
11919    if (PyUnicode_READY(self) == -1)
11920        return NULL;
11921    str1 = PyUnicode_FromObject(str1);
11922    if (str1 == NULL)
11923        return NULL;
11924    str2 = PyUnicode_FromObject(str2);
11925    if (str2 == NULL) {
11926        Py_DECREF(str1);
11927        return NULL;
11928    }
11929    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11930        result = NULL;
11931    else
11932        result = replace(self, str1, str2, maxcount);
11933
11934    Py_DECREF(str1);
11935    Py_DECREF(str2);
11936    return result;
11937}
11938
11939static PyObject *
11940unicode_repr(PyObject *unicode)
11941{
11942    PyObject *repr;
11943    Py_ssize_t isize;
11944    Py_ssize_t osize, squote, dquote, i, o;
11945    Py_UCS4 max, quote;
11946    int ikind, okind;
11947    void *idata, *odata;
11948
11949    if (PyUnicode_READY(unicode) == -1)
11950        return NULL;
11951
11952    isize = PyUnicode_GET_LENGTH(unicode);
11953    idata = PyUnicode_DATA(unicode);
11954
11955    /* Compute length of output, quote characters, and
11956       maximum character */
11957    osize = 2; /* quotes */
11958    max = 127;
11959    squote = dquote = 0;
11960    ikind = PyUnicode_KIND(unicode);
11961    for (i = 0; i < isize; i++) {
11962        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11963        switch (ch) {
11964        case '\'': squote++; osize++; break;
11965        case '"':  dquote++; osize++; break;
11966        case '\\': case '\t': case '\r': case '\n':
11967            osize += 2; break;
11968        default:
11969            /* Fast-path ASCII */
11970            if (ch < ' ' || ch == 0x7f)
11971                osize += 4; /* \xHH */
11972            else if (ch < 0x7f)
11973                osize++;
11974            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11975                osize++;
11976                max = ch > max ? ch : max;
11977            }
11978            else if (ch < 0x100)
11979                osize += 4; /* \xHH */
11980            else if (ch < 0x10000)
11981                osize += 6; /* \uHHHH */
11982            else
11983                osize += 10; /* \uHHHHHHHH */
11984        }
11985    }
11986
11987    quote = '\'';
11988    if (squote) {
11989        if (dquote)
11990            /* Both squote and dquote present. Use squote,
11991               and escape them */
11992            osize += squote;
11993        else
11994            quote = '"';
11995    }
11996
11997    repr = PyUnicode_New(osize, max);
11998    if (repr == NULL)
11999        return NULL;
12000    okind = PyUnicode_KIND(repr);
12001    odata = PyUnicode_DATA(repr);
12002
12003    PyUnicode_WRITE(okind, odata, 0, quote);
12004    PyUnicode_WRITE(okind, odata, osize-1, quote);
12005
12006    for (i = 0, o = 1; i < isize; i++) {
12007        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12008
12009        /* Escape quotes and backslashes */
12010        if ((ch == quote) || (ch == '\\')) {
12011            PyUnicode_WRITE(okind, odata, o++, '\\');
12012            PyUnicode_WRITE(okind, odata, o++, ch);
12013            continue;
12014        }
12015
12016        /* Map special whitespace to '\t', \n', '\r' */
12017        if (ch == '\t') {
12018            PyUnicode_WRITE(okind, odata, o++, '\\');
12019            PyUnicode_WRITE(okind, odata, o++, 't');
12020        }
12021        else if (ch == '\n') {
12022            PyUnicode_WRITE(okind, odata, o++, '\\');
12023            PyUnicode_WRITE(okind, odata, o++, 'n');
12024        }
12025        else if (ch == '\r') {
12026            PyUnicode_WRITE(okind, odata, o++, '\\');
12027            PyUnicode_WRITE(okind, odata, o++, 'r');
12028        }
12029
12030        /* Map non-printable US ASCII to '\xhh' */
12031        else if (ch < ' ' || ch == 0x7F) {
12032            PyUnicode_WRITE(okind, odata, o++, '\\');
12033            PyUnicode_WRITE(okind, odata, o++, 'x');
12034            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12035            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12036        }
12037
12038        /* Copy ASCII characters as-is */
12039        else if (ch < 0x7F) {
12040            PyUnicode_WRITE(okind, odata, o++, ch);
12041        }
12042
12043        /* Non-ASCII characters */
12044        else {
12045            /* Map Unicode whitespace and control characters
12046               (categories Z* and C* except ASCII space)
12047            */
12048            if (!Py_UNICODE_ISPRINTABLE(ch)) {
12049                PyUnicode_WRITE(okind, odata, o++, '\\');
12050                /* Map 8-bit characters to '\xhh' */
12051                if (ch <= 0xff) {
12052                    PyUnicode_WRITE(okind, odata, o++, 'x');
12053                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12054                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12055                }
12056                /* Map 16-bit characters to '\uxxxx' */
12057                else if (ch <= 0xffff) {
12058                    PyUnicode_WRITE(okind, odata, o++, 'u');
12059                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12060                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12061                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12062                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12063                }
12064                /* Map 21-bit characters to '\U00xxxxxx' */
12065                else {
12066                    PyUnicode_WRITE(okind, odata, o++, 'U');
12067                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12068                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12069                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12070                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12071                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12072                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12073                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12074                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12075                }
12076            }
12077            /* Copy characters as-is */
12078            else {
12079                PyUnicode_WRITE(okind, odata, o++, ch);
12080            }
12081        }
12082    }
12083    /* Closing quote already added at the beginning */
12084    assert(_PyUnicode_CheckConsistency(repr, 1));
12085    return repr;
12086}
12087
12088PyDoc_STRVAR(rfind__doc__,
12089             "S.rfind(sub[, start[, end]]) -> int\n\
12090\n\
12091Return the highest index in S where substring sub is found,\n\
12092such that sub is contained within S[start:end].  Optional\n\
12093arguments start and end are interpreted as in slice notation.\n\
12094\n\
12095Return -1 on failure.");
12096
12097static PyObject *
12098unicode_rfind(PyObject *self, PyObject *args)
12099{
12100    PyObject *substring;
12101    Py_ssize_t start;
12102    Py_ssize_t end;
12103    Py_ssize_t result;
12104
12105    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12106                                            &start, &end))
12107        return NULL;
12108
12109    if (PyUnicode_READY(self) == -1)
12110        return NULL;
12111    if (PyUnicode_READY(substring) == -1)
12112        return NULL;
12113
12114    result = any_find_slice(-1, self, substring, start, end);
12115
12116    Py_DECREF(substring);
12117
12118    if (result == -2)
12119        return NULL;
12120
12121    return PyLong_FromSsize_t(result);
12122}
12123
12124PyDoc_STRVAR(rindex__doc__,
12125             "S.rindex(sub[, start[, end]]) -> int\n\
12126\n\
12127Like S.rfind() but raise ValueError when the substring is not found.");
12128
12129static PyObject *
12130unicode_rindex(PyObject *self, PyObject *args)
12131{
12132    PyObject *substring;
12133    Py_ssize_t start;
12134    Py_ssize_t end;
12135    Py_ssize_t result;
12136
12137    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12138                                            &start, &end))
12139        return NULL;
12140
12141    if (PyUnicode_READY(self) == -1)
12142        return NULL;
12143    if (PyUnicode_READY(substring) == -1)
12144        return NULL;
12145
12146    result = any_find_slice(-1, self, substring, start, end);
12147
12148    Py_DECREF(substring);
12149
12150    if (result == -2)
12151        return NULL;
12152
12153    if (result < 0) {
12154        PyErr_SetString(PyExc_ValueError, "substring not found");
12155        return NULL;
12156    }
12157
12158    return PyLong_FromSsize_t(result);
12159}
12160
12161PyDoc_STRVAR(rjust__doc__,
12162             "S.rjust(width[, fillchar]) -> str\n\
12163\n\
12164Return S right-justified in a string of length width. Padding is\n\
12165done using the specified fill character (default is a space).");
12166
12167static PyObject *
12168unicode_rjust(PyObject *self, PyObject *args)
12169{
12170    Py_ssize_t width;
12171    Py_UCS4 fillchar = ' ';
12172
12173    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12174        return NULL;
12175
12176    if (PyUnicode_READY(self) == -1)
12177        return NULL;
12178
12179    if (PyUnicode_GET_LENGTH(self) >= width)
12180        return unicode_result_unchanged(self);
12181
12182    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12183}
12184
12185PyObject *
12186PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12187{
12188    PyObject *result;
12189
12190    s = PyUnicode_FromObject(s);
12191    if (s == NULL)
12192        return NULL;
12193    if (sep != NULL) {
12194        sep = PyUnicode_FromObject(sep);
12195        if (sep == NULL) {
12196            Py_DECREF(s);
12197            return NULL;
12198        }
12199    }
12200
12201    result = split(s, sep, maxsplit);
12202
12203    Py_DECREF(s);
12204    Py_XDECREF(sep);
12205    return result;
12206}
12207
12208PyDoc_STRVAR(split__doc__,
12209             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12210\n\
12211Return a list of the words in S, using sep as the\n\
12212delimiter string.  If maxsplit is given, at most maxsplit\n\
12213splits are done. If sep is not specified or is None, any\n\
12214whitespace string is a separator and empty strings are\n\
12215removed from the result.");
12216
12217static PyObject*
12218unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12219{
12220    static char *kwlist[] = {"sep", "maxsplit", 0};
12221    PyObject *substring = Py_None;
12222    Py_ssize_t maxcount = -1;
12223
12224    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12225                                     kwlist, &substring, &maxcount))
12226        return NULL;
12227
12228    if (substring == Py_None)
12229        return split(self, NULL, maxcount);
12230    else if (PyUnicode_Check(substring))
12231        return split(self, substring, maxcount);
12232    else
12233        return PyUnicode_Split(self, substring, maxcount);
12234}
12235
12236PyObject *
12237PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12238{
12239    PyObject* str_obj;
12240    PyObject* sep_obj;
12241    PyObject* out;
12242    int kind1, kind2, kind;
12243    void *buf1 = NULL, *buf2 = NULL;
12244    Py_ssize_t len1, len2;
12245
12246    str_obj = PyUnicode_FromObject(str_in);
12247    if (!str_obj)
12248        return NULL;
12249    sep_obj = PyUnicode_FromObject(sep_in);
12250    if (!sep_obj) {
12251        Py_DECREF(str_obj);
12252        return NULL;
12253    }
12254    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12255        Py_DECREF(sep_obj);
12256        Py_DECREF(str_obj);
12257        return NULL;
12258    }
12259
12260    kind1 = PyUnicode_KIND(str_obj);
12261    kind2 = PyUnicode_KIND(sep_obj);
12262    kind = Py_MAX(kind1, kind2);
12263    buf1 = PyUnicode_DATA(str_obj);
12264    if (kind1 != kind)
12265        buf1 = _PyUnicode_AsKind(str_obj, kind);
12266    if (!buf1)
12267        goto onError;
12268    buf2 = PyUnicode_DATA(sep_obj);
12269    if (kind2 != kind)
12270        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12271    if (!buf2)
12272        goto onError;
12273    len1 = PyUnicode_GET_LENGTH(str_obj);
12274    len2 = PyUnicode_GET_LENGTH(sep_obj);
12275
12276    switch (PyUnicode_KIND(str_obj)) {
12277    case PyUnicode_1BYTE_KIND:
12278        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12279            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12280        else
12281            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12282        break;
12283    case PyUnicode_2BYTE_KIND:
12284        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12285        break;
12286    case PyUnicode_4BYTE_KIND:
12287        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12288        break;
12289    default:
12290        assert(0);
12291        out = 0;
12292    }
12293
12294    Py_DECREF(sep_obj);
12295    Py_DECREF(str_obj);
12296    if (kind1 != kind)
12297        PyMem_Free(buf1);
12298    if (kind2 != kind)
12299        PyMem_Free(buf2);
12300
12301    return out;
12302  onError:
12303    Py_DECREF(sep_obj);
12304    Py_DECREF(str_obj);
12305    if (kind1 != kind && buf1)
12306        PyMem_Free(buf1);
12307    if (kind2 != kind && buf2)
12308        PyMem_Free(buf2);
12309    return NULL;
12310}
12311
12312
12313PyObject *
12314PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12315{
12316    PyObject* str_obj;
12317    PyObject* sep_obj;
12318    PyObject* out;
12319    int kind1, kind2, kind;
12320    void *buf1 = NULL, *buf2 = NULL;
12321    Py_ssize_t len1, len2;
12322
12323    str_obj = PyUnicode_FromObject(str_in);
12324    if (!str_obj)
12325        return NULL;
12326    sep_obj = PyUnicode_FromObject(sep_in);
12327    if (!sep_obj) {
12328        Py_DECREF(str_obj);
12329        return NULL;
12330    }
12331
12332    kind1 = PyUnicode_KIND(str_in);
12333    kind2 = PyUnicode_KIND(sep_obj);
12334    kind = Py_MAX(kind1, kind2);
12335    buf1 = PyUnicode_DATA(str_in);
12336    if (kind1 != kind)
12337        buf1 = _PyUnicode_AsKind(str_in, kind);
12338    if (!buf1)
12339        goto onError;
12340    buf2 = PyUnicode_DATA(sep_obj);
12341    if (kind2 != kind)
12342        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12343    if (!buf2)
12344        goto onError;
12345    len1 = PyUnicode_GET_LENGTH(str_obj);
12346    len2 = PyUnicode_GET_LENGTH(sep_obj);
12347
12348    switch (PyUnicode_KIND(str_in)) {
12349    case PyUnicode_1BYTE_KIND:
12350        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12351            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12352        else
12353            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12354        break;
12355    case PyUnicode_2BYTE_KIND:
12356        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12357        break;
12358    case PyUnicode_4BYTE_KIND:
12359        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12360        break;
12361    default:
12362        assert(0);
12363        out = 0;
12364    }
12365
12366    Py_DECREF(sep_obj);
12367    Py_DECREF(str_obj);
12368    if (kind1 != kind)
12369        PyMem_Free(buf1);
12370    if (kind2 != kind)
12371        PyMem_Free(buf2);
12372
12373    return out;
12374  onError:
12375    Py_DECREF(sep_obj);
12376    Py_DECREF(str_obj);
12377    if (kind1 != kind && buf1)
12378        PyMem_Free(buf1);
12379    if (kind2 != kind && buf2)
12380        PyMem_Free(buf2);
12381    return NULL;
12382}
12383
12384PyDoc_STRVAR(partition__doc__,
12385             "S.partition(sep) -> (head, sep, tail)\n\
12386\n\
12387Search for the separator sep in S, and return the part before it,\n\
12388the separator itself, and the part after it.  If the separator is not\n\
12389found, return S and two empty strings.");
12390
12391static PyObject*
12392unicode_partition(PyObject *self, PyObject *separator)
12393{
12394    return PyUnicode_Partition(self, separator);
12395}
12396
12397PyDoc_STRVAR(rpartition__doc__,
12398             "S.rpartition(sep) -> (head, sep, tail)\n\
12399\n\
12400Search for the separator sep in S, starting at the end of S, and return\n\
12401the part before it, the separator itself, and the part after it.  If the\n\
12402separator is not found, return two empty strings and S.");
12403
12404static PyObject*
12405unicode_rpartition(PyObject *self, PyObject *separator)
12406{
12407    return PyUnicode_RPartition(self, separator);
12408}
12409
12410PyObject *
12411PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12412{
12413    PyObject *result;
12414
12415    s = PyUnicode_FromObject(s);
12416    if (s == NULL)
12417        return NULL;
12418    if (sep != NULL) {
12419        sep = PyUnicode_FromObject(sep);
12420        if (sep == NULL) {
12421            Py_DECREF(s);
12422            return NULL;
12423        }
12424    }
12425
12426    result = rsplit(s, sep, maxsplit);
12427
12428    Py_DECREF(s);
12429    Py_XDECREF(sep);
12430    return result;
12431}
12432
12433PyDoc_STRVAR(rsplit__doc__,
12434             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12435\n\
12436Return a list of the words in S, using sep as the\n\
12437delimiter string, starting at the end of the string and\n\
12438working to the front.  If maxsplit is given, at most maxsplit\n\
12439splits are done. If sep is not specified, any whitespace string\n\
12440is a separator.");
12441
12442static PyObject*
12443unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12444{
12445    static char *kwlist[] = {"sep", "maxsplit", 0};
12446    PyObject *substring = Py_None;
12447    Py_ssize_t maxcount = -1;
12448
12449    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12450                                     kwlist, &substring, &maxcount))
12451        return NULL;
12452
12453    if (substring == Py_None)
12454        return rsplit(self, NULL, maxcount);
12455    else if (PyUnicode_Check(substring))
12456        return rsplit(self, substring, maxcount);
12457    else
12458        return PyUnicode_RSplit(self, substring, maxcount);
12459}
12460
12461PyDoc_STRVAR(splitlines__doc__,
12462             "S.splitlines([keepends]) -> list of strings\n\
12463\n\
12464Return a list of the lines in S, breaking at line boundaries.\n\
12465Line breaks are not included in the resulting list unless keepends\n\
12466is given and true.");
12467
12468static PyObject*
12469unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12470{
12471    static char *kwlist[] = {"keepends", 0};
12472    int keepends = 0;
12473
12474    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12475                                     kwlist, &keepends))
12476        return NULL;
12477
12478    return PyUnicode_Splitlines(self, keepends);
12479}
12480
12481static
12482PyObject *unicode_str(PyObject *self)
12483{
12484    return unicode_result_unchanged(self);
12485}
12486
12487PyDoc_STRVAR(swapcase__doc__,
12488             "S.swapcase() -> str\n\
12489\n\
12490Return a copy of S with uppercase characters converted to lowercase\n\
12491and vice versa.");
12492
12493static PyObject*
12494unicode_swapcase(PyObject *self)
12495{
12496    if (PyUnicode_READY(self) == -1)
12497        return NULL;
12498    return case_operation(self, do_swapcase);
12499}
12500
12501PyDoc_STRVAR(maketrans__doc__,
12502             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12503\n\
12504Return a translation table usable for str.translate().\n\
12505If there is only one argument, it must be a dictionary mapping Unicode\n\
12506ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12507Character keys will be then converted to ordinals.\n\
12508If there are two arguments, they must be strings of equal length, and\n\
12509in the resulting dictionary, each character in x will be mapped to the\n\
12510character at the same position in y. If there is a third argument, it\n\
12511must be a string, whose characters will be mapped to None in the result.");
12512
12513static PyObject*
12514unicode_maketrans(PyObject *null, PyObject *args)
12515{
12516    PyObject *x, *y = NULL, *z = NULL;
12517    PyObject *new = NULL, *key, *value;
12518    Py_ssize_t i = 0;
12519    int res;
12520
12521    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12522        return NULL;
12523    new = PyDict_New();
12524    if (!new)
12525        return NULL;
12526    if (y != NULL) {
12527        int x_kind, y_kind, z_kind;
12528        void *x_data, *y_data, *z_data;
12529
12530        /* x must be a string too, of equal length */
12531        if (!PyUnicode_Check(x)) {
12532            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12533                            "be a string if there is a second argument");
12534            goto err;
12535        }
12536        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12537            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12538                            "arguments must have equal length");
12539            goto err;
12540        }
12541        /* create entries for translating chars in x to those in y */
12542        x_kind = PyUnicode_KIND(x);
12543        y_kind = PyUnicode_KIND(y);
12544        x_data = PyUnicode_DATA(x);
12545        y_data = PyUnicode_DATA(y);
12546        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12547            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12548            if (!key)
12549                goto err;
12550            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12551            if (!value) {
12552                Py_DECREF(key);
12553                goto err;
12554            }
12555            res = PyDict_SetItem(new, key, value);
12556            Py_DECREF(key);
12557            Py_DECREF(value);
12558            if (res < 0)
12559                goto err;
12560        }
12561        /* create entries for deleting chars in z */
12562        if (z != NULL) {
12563            z_kind = PyUnicode_KIND(z);
12564            z_data = PyUnicode_DATA(z);
12565            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12566                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12567                if (!key)
12568                    goto err;
12569                res = PyDict_SetItem(new, key, Py_None);
12570                Py_DECREF(key);
12571                if (res < 0)
12572                    goto err;
12573            }
12574        }
12575    } else {
12576        int kind;
12577        void *data;
12578
12579        /* x must be a dict */
12580        if (!PyDict_CheckExact(x)) {
12581            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12582                            "to maketrans it must be a dict");
12583            goto err;
12584        }
12585        /* copy entries into the new dict, converting string keys to int keys */
12586        while (PyDict_Next(x, &i, &key, &value)) {
12587            if (PyUnicode_Check(key)) {
12588                /* convert string keys to integer keys */
12589                PyObject *newkey;
12590                if (PyUnicode_GET_LENGTH(key) != 1) {
12591                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12592                                    "table must be of length 1");
12593                    goto err;
12594                }
12595                kind = PyUnicode_KIND(key);
12596                data = PyUnicode_DATA(key);
12597                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12598                if (!newkey)
12599                    goto err;
12600                res = PyDict_SetItem(new, newkey, value);
12601                Py_DECREF(newkey);
12602                if (res < 0)
12603                    goto err;
12604            } else if (PyLong_Check(key)) {
12605                /* just keep integer keys */
12606                if (PyDict_SetItem(new, key, value) < 0)
12607                    goto err;
12608            } else {
12609                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12610                                "be strings or integers");
12611                goto err;
12612            }
12613        }
12614    }
12615    return new;
12616  err:
12617    Py_DECREF(new);
12618    return NULL;
12619}
12620
12621PyDoc_STRVAR(translate__doc__,
12622             "S.translate(table) -> str\n\
12623\n\
12624Return a copy of the string S, where all characters have been mapped\n\
12625through the given translation table, which must be a mapping of\n\
12626Unicode ordinals to Unicode ordinals, strings, or None.\n\
12627Unmapped characters are left untouched. Characters mapped to None\n\
12628are deleted.");
12629
12630static PyObject*
12631unicode_translate(PyObject *self, PyObject *table)
12632{
12633    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12634}
12635
12636PyDoc_STRVAR(upper__doc__,
12637             "S.upper() -> str\n\
12638\n\
12639Return a copy of S converted to uppercase.");
12640
12641static PyObject*
12642unicode_upper(PyObject *self)
12643{
12644    if (PyUnicode_READY(self) == -1)
12645        return NULL;
12646    if (PyUnicode_IS_ASCII(self))
12647        return ascii_upper_or_lower(self, 0);
12648    return case_operation(self, do_upper);
12649}
12650
12651PyDoc_STRVAR(zfill__doc__,
12652             "S.zfill(width) -> str\n\
12653\n\
12654Pad a numeric string S with zeros on the left, to fill a field\n\
12655of the specified width. The string S is never truncated.");
12656
12657static PyObject *
12658unicode_zfill(PyObject *self, PyObject *args)
12659{
12660    Py_ssize_t fill;
12661    PyObject *u;
12662    Py_ssize_t width;
12663    int kind;
12664    void *data;
12665    Py_UCS4 chr;
12666
12667    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12668        return NULL;
12669
12670    if (PyUnicode_READY(self) == -1)
12671        return NULL;
12672
12673    if (PyUnicode_GET_LENGTH(self) >= width)
12674        return unicode_result_unchanged(self);
12675
12676    fill = width - PyUnicode_GET_LENGTH(self);
12677
12678    u = pad(self, fill, 0, '0');
12679
12680    if (u == NULL)
12681        return NULL;
12682
12683    kind = PyUnicode_KIND(u);
12684    data = PyUnicode_DATA(u);
12685    chr = PyUnicode_READ(kind, data, fill);
12686
12687    if (chr == '+' || chr == '-') {
12688        /* move sign to beginning of string */
12689        PyUnicode_WRITE(kind, data, 0, chr);
12690        PyUnicode_WRITE(kind, data, fill, '0');
12691    }
12692
12693    assert(_PyUnicode_CheckConsistency(u, 1));
12694    return u;
12695}
12696
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12704
12705PyDoc_STRVAR(startswith__doc__,
12706             "S.startswith(prefix[, start[, end]]) -> bool\n\
12707\n\
12708Return True if S starts with the specified prefix, False otherwise.\n\
12709With optional start, test S beginning at that position.\n\
12710With optional end, stop comparing S at that position.\n\
12711prefix can also be a tuple of strings to try.");
12712
12713static PyObject *
12714unicode_startswith(PyObject *self,
12715                   PyObject *args)
12716{
12717    PyObject *subobj;
12718    PyObject *substring;
12719    Py_ssize_t start = 0;
12720    Py_ssize_t end = PY_SSIZE_T_MAX;
12721    int result;
12722
12723    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12724        return NULL;
12725    if (PyTuple_Check(subobj)) {
12726        Py_ssize_t i;
12727        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12728            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12729            if (substring == NULL)
12730                return NULL;
12731            result = tailmatch(self, substring, start, end, -1);
12732            Py_DECREF(substring);
12733            if (result) {
12734                Py_RETURN_TRUE;
12735            }
12736        }
12737        /* nothing matched */
12738        Py_RETURN_FALSE;
12739    }
12740    substring = PyUnicode_FromObject(subobj);
12741    if (substring == NULL) {
12742        if (PyErr_ExceptionMatches(PyExc_TypeError))
12743            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12744                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12745        return NULL;
12746    }
12747    result = tailmatch(self, substring, start, end, -1);
12748    Py_DECREF(substring);
12749    return PyBool_FromLong(result);
12750}
12751
12752
12753PyDoc_STRVAR(endswith__doc__,
12754             "S.endswith(suffix[, start[, end]]) -> bool\n\
12755\n\
12756Return True if S ends with the specified suffix, False otherwise.\n\
12757With optional start, test S beginning at that position.\n\
12758With optional end, stop comparing S at that position.\n\
12759suffix can also be a tuple of strings to try.");
12760
12761static PyObject *
12762unicode_endswith(PyObject *self,
12763                 PyObject *args)
12764{
12765    PyObject *subobj;
12766    PyObject *substring;
12767    Py_ssize_t start = 0;
12768    Py_ssize_t end = PY_SSIZE_T_MAX;
12769    int result;
12770
12771    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12772        return NULL;
12773    if (PyTuple_Check(subobj)) {
12774        Py_ssize_t i;
12775        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12776            substring = PyUnicode_FromObject(
12777                PyTuple_GET_ITEM(subobj, i));
12778            if (substring == NULL)
12779                return NULL;
12780            result = tailmatch(self, substring, start, end, +1);
12781            Py_DECREF(substring);
12782            if (result) {
12783                Py_RETURN_TRUE;
12784            }
12785        }
12786        Py_RETURN_FALSE;
12787    }
12788    substring = PyUnicode_FromObject(subobj);
12789    if (substring == NULL) {
12790        if (PyErr_ExceptionMatches(PyExc_TypeError))
12791            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12792                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12793        return NULL;
12794    }
12795    result = tailmatch(self, substring, start, end, +1);
12796    Py_DECREF(substring);
12797    return PyBool_FromLong(result);
12798}
12799
12800Py_LOCAL_INLINE(void)
12801_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
12802{
12803    writer->size = PyUnicode_GET_LENGTH(writer->buffer);
12804    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12805    writer->data = PyUnicode_DATA(writer->buffer);
12806    writer->kind = PyUnicode_KIND(writer->buffer);
12807}
12808
12809void
12810_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
12811{
12812    memset(writer, 0, sizeof(*writer));
12813#ifdef Py_DEBUG
12814    writer->kind = 5;    /* invalid kind */
12815#endif
12816    writer->min_length = Py_MAX(min_length, 100);
12817    writer->overallocate = (min_length > 0);
12818}
12819
12820int
12821_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12822                                 Py_ssize_t length, Py_UCS4 maxchar)
12823{
12824    Py_ssize_t newlen;
12825    PyObject *newbuffer;
12826
12827    assert(length > 0);
12828
12829    if (length > PY_SSIZE_T_MAX - writer->pos) {
12830        PyErr_NoMemory();
12831        return -1;
12832    }
12833    newlen = writer->pos + length;
12834
12835    if (writer->buffer == NULL) {
12836        if (writer->overallocate) {
12837            /* overallocate 25% to limit the number of resize */
12838            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12839                newlen += newlen / 4;
12840            if (newlen < writer->min_length)
12841                newlen = writer->min_length;
12842        }
12843        writer->buffer = PyUnicode_New(newlen, maxchar);
12844        if (writer->buffer == NULL)
12845            return -1;
12846        _PyUnicodeWriter_Update(writer);
12847        return 0;
12848    }
12849
12850    if (newlen > writer->size) {
12851        if (writer->overallocate) {
12852            /* overallocate 25% to limit the number of resize */
12853            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12854                newlen += newlen / 4;
12855            if (newlen < writer->min_length)
12856                newlen = writer->min_length;
12857        }
12858
12859        if (maxchar > writer->maxchar || writer->readonly) {
12860            /* resize + widen */
12861            newbuffer = PyUnicode_New(newlen, maxchar);
12862            if (newbuffer == NULL)
12863                return -1;
12864            _PyUnicode_FastCopyCharacters(newbuffer, 0,
12865                                          writer->buffer, 0, writer->pos);
12866            Py_DECREF(writer->buffer);
12867            writer->readonly = 0;
12868        }
12869        else {
12870            newbuffer = resize_compact(writer->buffer, newlen);
12871            if (newbuffer == NULL)
12872                return -1;
12873        }
12874        writer->buffer = newbuffer;
12875        _PyUnicodeWriter_Update(writer);
12876    }
12877    else if (maxchar > writer->maxchar) {
12878        assert(!writer->readonly);
12879        newbuffer = PyUnicode_New(writer->size, maxchar);
12880        if (newbuffer == NULL)
12881            return -1;
12882        _PyUnicode_FastCopyCharacters(newbuffer, 0,
12883                                      writer->buffer, 0, writer->pos);
12884        Py_DECREF(writer->buffer);
12885        writer->buffer = newbuffer;
12886        _PyUnicodeWriter_Update(writer);
12887    }
12888    return 0;
12889}
12890
12891int
12892_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12893{
12894    Py_UCS4 maxchar;
12895    Py_ssize_t len;
12896
12897    if (PyUnicode_READY(str) == -1)
12898        return -1;
12899    len = PyUnicode_GET_LENGTH(str);
12900    if (len == 0)
12901        return 0;
12902    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12903    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
12904        if (writer->buffer == NULL && !writer->overallocate) {
12905            Py_INCREF(str);
12906            writer->buffer = str;
12907            _PyUnicodeWriter_Update(writer);
12908            writer->readonly = 1;
12909            writer->size = 0;
12910            writer->pos += len;
12911            return 0;
12912        }
12913        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12914            return -1;
12915    }
12916    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12917                                  str, 0, len);
12918    writer->pos += len;
12919    return 0;
12920}
12921
12922PyObject *
12923_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
12924{
12925    if (writer->pos == 0) {
12926        Py_XDECREF(writer->buffer);
12927        Py_INCREF(unicode_empty);
12928        return unicode_empty;
12929    }
12930    if (writer->readonly) {
12931        assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12932        return writer->buffer;
12933    }
12934    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12935        PyObject *newbuffer;
12936        newbuffer = resize_compact(writer->buffer, writer->pos);
12937        if (newbuffer == NULL) {
12938            Py_DECREF(writer->buffer);
12939            return NULL;
12940        }
12941        writer->buffer = newbuffer;
12942    }
12943    assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
12944    return writer->buffer;
12945}
12946
12947void
12948_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
12949{
12950    Py_CLEAR(writer->buffer);
12951}
12952
12953#include "stringlib/unicode_format.h"
12954
12955PyDoc_STRVAR(format__doc__,
12956             "S.format(*args, **kwargs) -> str\n\
12957\n\
12958Return a formatted version of S, using substitutions from args and kwargs.\n\
12959The substitutions are identified by braces ('{' and '}').");
12960
12961PyDoc_STRVAR(format_map__doc__,
12962             "S.format_map(mapping) -> str\n\
12963\n\
12964Return a formatted version of S, using substitutions from mapping.\n\
12965The substitutions are identified by braces ('{' and '}').");
12966
12967static PyObject *
12968unicode__format__(PyObject* self, PyObject* args)
12969{
12970    PyObject *format_spec;
12971    _PyUnicodeWriter writer;
12972    int ret;
12973
12974    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12975        return NULL;
12976
12977    if (PyUnicode_READY(self) == -1)
12978        return NULL;
12979    _PyUnicodeWriter_Init(&writer, 0);
12980    ret = _PyUnicode_FormatAdvancedWriter(&writer,
12981                                          self, format_spec, 0,
12982                                          PyUnicode_GET_LENGTH(format_spec));
12983    if (ret == -1) {
12984        _PyUnicodeWriter_Dealloc(&writer);
12985        return NULL;
12986    }
12987    return _PyUnicodeWriter_Finish(&writer);
12988}
12989
12990PyDoc_STRVAR(p_format__doc__,
12991             "S.__format__(format_spec) -> str\n\
12992\n\
12993Return a formatted version of S as described by format_spec.");
12994
12995static PyObject *
12996unicode__sizeof__(PyObject *v)
12997{
12998    Py_ssize_t size;
12999
13000    /* If it's a compact object, account for base structure +
13001       character data. */
13002    if (PyUnicode_IS_COMPACT_ASCII(v))
13003        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13004    else if (PyUnicode_IS_COMPACT(v))
13005        size = sizeof(PyCompactUnicodeObject) +
13006            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13007    else {
13008        /* If it is a two-block object, account for base object, and
13009           for character block if present. */
13010        size = sizeof(PyUnicodeObject);
13011        if (_PyUnicode_DATA_ANY(v))
13012            size += (PyUnicode_GET_LENGTH(v) + 1) *
13013                PyUnicode_KIND(v);
13014    }
13015    /* If the wstr pointer is present, account for it unless it is shared
13016       with the data pointer. Check if the data is not shared. */
13017    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13018        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13019    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13020        size += PyUnicode_UTF8_LENGTH(v) + 1;
13021
13022    return PyLong_FromSsize_t(size);
13023}
13024
13025PyDoc_STRVAR(sizeof__doc__,
13026             "S.__sizeof__() -> size of S in memory, in bytes");
13027
13028static PyObject *
13029unicode_getnewargs(PyObject *v)
13030{
13031    PyObject *copy = _PyUnicode_Copy(v);
13032    if (!copy)
13033        return NULL;
13034    return Py_BuildValue("(N)", copy);
13035}
13036
13037static PyMethodDef unicode_methods[] = {
13038    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
13039    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13040    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13041    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
13042    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13043    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
13044    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
13045    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13046    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13047    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13048    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13049    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13050    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
13051    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13052    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13053    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
13054    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
13055    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13056    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13057    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
13058    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
13059    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
13060    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
13061    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
13062    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13063    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13064    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13065    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13066    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13067    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13068    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13069    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13070    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13071    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13072    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13073    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13074    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13075    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
13076    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
13077    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
13078    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
13079    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13080    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13081    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
13082    {"maketrans", (PyCFunction) unicode_maketrans,
13083     METH_VARARGS | METH_STATIC, maketrans__doc__},
13084    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
13085#if 0
13086    /* These methods are just used for debugging the implementation. */
13087    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13088#endif
13089
13090    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13091    {NULL, NULL}
13092};
13093
13094static PyObject *
13095unicode_mod(PyObject *v, PyObject *w)
13096{
13097    if (!PyUnicode_Check(v))
13098        Py_RETURN_NOTIMPLEMENTED;
13099    return PyUnicode_Format(v, w);
13100}
13101
13102static PyNumberMethods unicode_as_number = {
13103    0,              /*nb_add*/
13104    0,              /*nb_subtract*/
13105    0,              /*nb_multiply*/
13106    unicode_mod,            /*nb_remainder*/
13107};
13108
13109static PySequenceMethods unicode_as_sequence = {
13110    (lenfunc) unicode_length,       /* sq_length */
13111    PyUnicode_Concat,           /* sq_concat */
13112    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13113    (ssizeargfunc) unicode_getitem,     /* sq_item */
13114    0,                  /* sq_slice */
13115    0,                  /* sq_ass_item */
13116    0,                  /* sq_ass_slice */
13117    PyUnicode_Contains,         /* sq_contains */
13118};
13119
13120static PyObject*
13121unicode_subscript(PyObject* self, PyObject* item)
13122{
13123    if (PyUnicode_READY(self) == -1)
13124        return NULL;
13125
13126    if (PyIndex_Check(item)) {
13127        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13128        if (i == -1 && PyErr_Occurred())
13129            return NULL;
13130        if (i < 0)
13131            i += PyUnicode_GET_LENGTH(self);
13132        return unicode_getitem(self, i);
13133    } else if (PySlice_Check(item)) {
13134        Py_ssize_t start, stop, step, slicelength, cur, i;
13135        PyObject *result;
13136        void *src_data, *dest_data;
13137        int src_kind, dest_kind;
13138        Py_UCS4 ch, max_char, kind_limit;
13139
13140        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13141                                 &start, &stop, &step, &slicelength) < 0) {
13142            return NULL;
13143        }
13144
13145        if (slicelength <= 0) {
13146            Py_INCREF(unicode_empty);
13147            return unicode_empty;
13148        } else if (start == 0 && step == 1 &&
13149                   slicelength == PyUnicode_GET_LENGTH(self)) {
13150            return unicode_result_unchanged(self);
13151        } else if (step == 1) {
13152            return PyUnicode_Substring(self,
13153                                       start, start + slicelength);
13154        }
13155        /* General case */
13156        src_kind = PyUnicode_KIND(self);
13157        src_data = PyUnicode_DATA(self);
13158        if (!PyUnicode_IS_ASCII(self)) {
13159            kind_limit = kind_maxchar_limit(src_kind);
13160            max_char = 0;
13161            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13162                ch = PyUnicode_READ(src_kind, src_data, cur);
13163                if (ch > max_char) {
13164                    max_char = ch;
13165                    if (max_char >= kind_limit)
13166                        break;
13167                }
13168            }
13169        }
13170        else
13171            max_char = 127;
13172        result = PyUnicode_New(slicelength, max_char);
13173        if (result == NULL)
13174            return NULL;
13175        dest_kind = PyUnicode_KIND(result);
13176        dest_data = PyUnicode_DATA(result);
13177
13178        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13179            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13180            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13181        }
13182        assert(_PyUnicode_CheckConsistency(result, 1));
13183        return result;
13184    } else {
13185        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13186        return NULL;
13187    }
13188}
13189
13190static PyMappingMethods unicode_as_mapping = {
13191    (lenfunc)unicode_length,        /* mp_length */
13192    (binaryfunc)unicode_subscript,  /* mp_subscript */
13193    (objobjargproc)0,           /* mp_ass_subscript */
13194};
13195
13196
13197/* Helpers for PyUnicode_Format() */
13198
13199static PyObject *
13200getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
13201{
13202    Py_ssize_t argidx = *p_argidx;
13203    if (argidx < arglen) {
13204        (*p_argidx)++;
13205        if (arglen < 0)
13206            return args;
13207        else
13208            return PyTuple_GetItem(args, argidx);
13209    }
13210    PyErr_SetString(PyExc_TypeError,
13211                    "not enough arguments for format string");
13212    return NULL;
13213}
13214
13215/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13216
13217static int
13218formatfloat(PyObject *v, int flags, int prec, int type,
13219            PyObject **p_output, _PyUnicodeWriter *writer)
13220{
13221    char *p;
13222    double x;
13223    Py_ssize_t len;
13224
13225    x = PyFloat_AsDouble(v);
13226    if (x == -1.0 && PyErr_Occurred())
13227        return -1;
13228
13229    if (prec < 0)
13230        prec = 6;
13231
13232    p = PyOS_double_to_string(x, type, prec,
13233                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
13234    if (p == NULL)
13235        return -1;
13236    len = strlen(p);
13237    if (writer) {
13238        if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13239            PyMem_Free(p);
13240            return -1;
13241        }
13242        unicode_write_cstr(writer->buffer, writer->pos, p, len);
13243        writer->pos += len;
13244    }
13245    else
13246        *p_output = _PyUnicode_FromASCII(p, len);
13247    PyMem_Free(p);
13248    return 0;
13249}
13250
13251/* formatlong() emulates the format codes d, u, o, x and X, and
13252 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13253 * Python's regular ints.
13254 * Return value:  a new PyUnicodeObject*, or NULL if error.
13255 *     The output string is of the form
13256 *         "-"? ("0x" | "0X")? digit+
13257 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13258 *         set in flags.  The case of hex digits will be correct,
13259 *     There will be at least prec digits, zero-filled on the left if
13260 *         necessary to get that many.
13261 * val          object to be converted
13262 * flags        bitmask of format flags; only F_ALT is looked at
13263 * prec         minimum number of digits; 0-fill on left if needed
13264 * type         a character in [duoxX]; u acts the same as d
13265 *
13266 * CAUTION:  o, x and X conversions on regular ints can never
13267 * produce a '-' sign, but can for Python's unbounded ints.
13268 */
13269static PyObject*
13270formatlong(PyObject *val, int flags, int prec, int type)
13271{
13272    PyObject *result = NULL;
13273    char *buf;
13274    Py_ssize_t i;
13275    int sign;           /* 1 if '-', else 0 */
13276    int len;            /* number of characters */
13277    Py_ssize_t llen;
13278    int numdigits;      /* len == numnondigits + numdigits */
13279    int numnondigits = 0;
13280
13281    /* Avoid exceeding SSIZE_T_MAX */
13282    if (prec > INT_MAX-3) {
13283        PyErr_SetString(PyExc_OverflowError,
13284                        "precision too large");
13285        return NULL;
13286    }
13287
13288    assert(PyLong_Check(val));
13289
13290    switch (type) {
13291    case 'd':
13292    case 'u':
13293        /* Special-case boolean: we want 0/1 */
13294        if (PyBool_Check(val))
13295            result = PyNumber_ToBase(val, 10);
13296        else
13297            result = Py_TYPE(val)->tp_str(val);
13298        break;
13299    case 'o':
13300        numnondigits = 2;
13301        result = PyNumber_ToBase(val, 8);
13302        break;
13303    case 'x':
13304    case 'X':
13305        numnondigits = 2;
13306        result = PyNumber_ToBase(val, 16);
13307        break;
13308    default:
13309        assert(!"'type' not in [duoxX]");
13310    }
13311    if (!result)
13312        return NULL;
13313
13314    assert(unicode_modifiable(result));
13315    assert(PyUnicode_IS_READY(result));
13316    assert(PyUnicode_IS_ASCII(result));
13317
13318    /* To modify the string in-place, there can only be one reference. */
13319    if (Py_REFCNT(result) != 1) {
13320        PyErr_BadInternalCall();
13321        return NULL;
13322    }
13323    buf = PyUnicode_DATA(result);
13324    llen = PyUnicode_GET_LENGTH(result);
13325    if (llen > INT_MAX) {
13326        PyErr_SetString(PyExc_ValueError,
13327                        "string too large in _PyBytes_FormatLong");
13328        return NULL;
13329    }
13330    len = (int)llen;
13331    sign = buf[0] == '-';
13332    numnondigits += sign;
13333    numdigits = len - numnondigits;
13334    assert(numdigits > 0);
13335
13336    /* Get rid of base marker unless F_ALT */
13337    if (((flags & F_ALT) == 0 &&
13338        (type == 'o' || type == 'x' || type == 'X'))) {
13339        assert(buf[sign] == '0');
13340        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13341               buf[sign+1] == 'o');
13342        numnondigits -= 2;
13343        buf += 2;
13344        len -= 2;
13345        if (sign)
13346            buf[0] = '-';
13347        assert(len == numnondigits + numdigits);
13348        assert(numdigits > 0);
13349    }
13350
13351    /* Fill with leading zeroes to meet minimum width. */
13352    if (prec > numdigits) {
13353        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13354                                numnondigits + prec);
13355        char *b1;
13356        if (!r1) {
13357            Py_DECREF(result);
13358            return NULL;
13359        }
13360        b1 = PyBytes_AS_STRING(r1);
13361        for (i = 0; i < numnondigits; ++i)
13362            *b1++ = *buf++;
13363        for (i = 0; i < prec - numdigits; i++)
13364            *b1++ = '0';
13365        for (i = 0; i < numdigits; i++)
13366            *b1++ = *buf++;
13367        *b1 = '\0';
13368        Py_DECREF(result);
13369        result = r1;
13370        buf = PyBytes_AS_STRING(result);
13371        len = numnondigits + prec;
13372    }
13373
13374    /* Fix up case for hex conversions. */
13375    if (type == 'X') {
13376        /* Need to convert all lower case letters to upper case.
13377           and need to convert 0x to 0X (and -0x to -0X). */
13378        for (i = 0; i < len; i++)
13379            if (buf[i] >= 'a' && buf[i] <= 'x')
13380                buf[i] -= 'a'-'A';
13381    }
13382    if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13383        PyObject *unicode;
13384        unicode = _PyUnicode_FromASCII(buf, len);
13385        Py_DECREF(result);
13386        result = unicode;
13387    }
13388    return result;
13389}
13390
13391static Py_UCS4
13392formatchar(PyObject *v)
13393{
13394    /* presume that the buffer is at least 3 characters long */
13395    if (PyUnicode_Check(v)) {
13396        if (PyUnicode_GET_LENGTH(v) == 1) {
13397            return PyUnicode_READ_CHAR(v, 0);
13398        }
13399        goto onError;
13400    }
13401    else {
13402        /* Integer input truncated to a character */
13403        long x;
13404        x = PyLong_AsLong(v);
13405        if (x == -1 && PyErr_Occurred())
13406            goto onError;
13407
13408        if (x < 0 || x > MAX_UNICODE) {
13409            PyErr_SetString(PyExc_OverflowError,
13410                            "%c arg not in range(0x110000)");
13411            return (Py_UCS4) -1;
13412        }
13413
13414        return (Py_UCS4) x;
13415    }
13416
13417  onError:
13418    PyErr_SetString(PyExc_TypeError,
13419                    "%c requires int or char");
13420    return (Py_UCS4) -1;
13421}
13422
13423PyObject *
13424PyUnicode_Format(PyObject *format, PyObject *args)
13425{
13426    Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
13427    int args_owned = 0;
13428    PyObject *dict = NULL;
13429    PyObject *temp = NULL;
13430    PyObject *second = NULL;
13431    PyObject *uformat;
13432    void *fmt;
13433    enum PyUnicode_Kind kind, fmtkind;
13434    _PyUnicodeWriter writer;
13435    Py_ssize_t sublen;
13436    Py_UCS4 maxchar;
13437
13438    if (format == NULL || args == NULL) {
13439        PyErr_BadInternalCall();
13440        return NULL;
13441    }
13442    uformat = PyUnicode_FromObject(format);
13443    if (uformat == NULL)
13444        return NULL;
13445    if (PyUnicode_READY(uformat) == -1) {
13446        Py_DECREF(uformat);
13447        return NULL;
13448    }
13449
13450    fmt = PyUnicode_DATA(uformat);
13451    fmtkind = PyUnicode_KIND(uformat);
13452    fmtcnt = PyUnicode_GET_LENGTH(uformat);
13453    fmtpos = 0;
13454
13455    _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
13456
13457    if (PyTuple_Check(args)) {
13458        arglen = PyTuple_Size(args);
13459        argidx = 0;
13460    }
13461    else {
13462        arglen = -1;
13463        argidx = -2;
13464    }
13465    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
13466        dict = args;
13467
13468    while (--fmtcnt >= 0) {
13469        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13470            Py_ssize_t nonfmtpos;
13471            nonfmtpos = fmtpos++;
13472            while (fmtcnt >= 0 &&
13473                   PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13474                fmtpos++;
13475                fmtcnt--;
13476            }
13477            if (fmtcnt < 0)
13478                fmtpos--;
13479            sublen = fmtpos - nonfmtpos;
13480            maxchar = _PyUnicode_FindMaxChar(uformat,
13481                                             nonfmtpos, nonfmtpos + sublen);
13482            if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
13483                goto onError;
13484
13485            _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13486                                          uformat, nonfmtpos, sublen);
13487            writer.pos += sublen;
13488        }
13489        else {
13490            /* Got a format specifier */
13491            int flags = 0;
13492            Py_ssize_t width = -1;
13493            int prec = -1;
13494            Py_UCS4 c = '\0';
13495            Py_UCS4 fill;
13496            int sign;
13497            Py_UCS4 signchar;
13498            int isnumok;
13499            PyObject *v = NULL;
13500            void *pbuf = NULL;
13501            Py_ssize_t pindex, len;
13502            Py_UCS4 bufmaxchar;
13503            Py_ssize_t buflen;
13504
13505            fmtpos++;
13506            c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13507            if (c == '(') {
13508                Py_ssize_t keystart;
13509                Py_ssize_t keylen;
13510                PyObject *key;
13511                int pcount = 1;
13512
13513                if (dict == NULL) {
13514                    PyErr_SetString(PyExc_TypeError,
13515                                    "format requires a mapping");
13516                    goto onError;
13517                }
13518                ++fmtpos;
13519                --fmtcnt;
13520                keystart = fmtpos;
13521                /* Skip over balanced parentheses */
13522                while (pcount > 0 && --fmtcnt >= 0) {
13523                    c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13524                    if (c == ')')
13525                        --pcount;
13526                    else if (c == '(')
13527                        ++pcount;
13528                    fmtpos++;
13529                }
13530                keylen = fmtpos - keystart - 1;
13531                if (fmtcnt < 0 || pcount > 0) {
13532                    PyErr_SetString(PyExc_ValueError,
13533                                    "incomplete format key");
13534                    goto onError;
13535                }
13536                key = PyUnicode_Substring(uformat,
13537                                          keystart, keystart + keylen);
13538                if (key == NULL)
13539                    goto onError;
13540                if (args_owned) {
13541                    Py_DECREF(args);
13542                    args_owned = 0;
13543                }
13544                args = PyObject_GetItem(dict, key);
13545                Py_DECREF(key);
13546                if (args == NULL) {
13547                    goto onError;
13548                }
13549                args_owned = 1;
13550                arglen = -1;
13551                argidx = -2;
13552            }
13553            while (--fmtcnt >= 0) {
13554                c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13555                switch (c) {
13556                case '-': flags |= F_LJUST; continue;
13557                case '+': flags |= F_SIGN; continue;
13558                case ' ': flags |= F_BLANK; continue;
13559                case '#': flags |= F_ALT; continue;
13560                case '0': flags |= F_ZERO; continue;
13561                }
13562                break;
13563            }
13564            if (c == '*') {
13565                v = getnextarg(args, arglen, &argidx);
13566                if (v == NULL)
13567                    goto onError;
13568                if (!PyLong_Check(v)) {
13569                    PyErr_SetString(PyExc_TypeError,
13570                                    "* wants int");
13571                    goto onError;
13572                }
13573                width = PyLong_AsLong(v);
13574                if (width == -1 && PyErr_Occurred())
13575                    goto onError;
13576                if (width < 0) {
13577                    flags |= F_LJUST;
13578                    width = -width;
13579                }
13580                if (--fmtcnt >= 0)
13581                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13582            }
13583            else if (c >= '0' && c <= '9') {
13584                width = c - '0';
13585                while (--fmtcnt >= 0) {
13586                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13587                    if (c < '0' || c > '9')
13588                        break;
13589                    /* Since c is unsigned, the RHS would end up as unsigned,
13590                       mixing signed and unsigned comparison. Since c is between
13591                       '0' and '9', casting to int is safe. */
13592                    if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
13593                        PyErr_SetString(PyExc_ValueError,
13594                                        "width too big");
13595                        goto onError;
13596                    }
13597                    width = width*10 + (c - '0');
13598                }
13599            }
13600            if (c == '.') {
13601                prec = 0;
13602                if (--fmtcnt >= 0)
13603                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13604                if (c == '*') {
13605                    v = getnextarg(args, arglen, &argidx);
13606                    if (v == NULL)
13607                        goto onError;
13608                    if (!PyLong_Check(v)) {
13609                        PyErr_SetString(PyExc_TypeError,
13610                                        "* wants int");
13611                        goto onError;
13612                    }
13613                    prec = PyLong_AsLong(v);
13614                    if (prec == -1 && PyErr_Occurred())
13615                        goto onError;
13616                    if (prec < 0)
13617                        prec = 0;
13618                    if (--fmtcnt >= 0)
13619                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13620                }
13621                else if (c >= '0' && c <= '9') {
13622                    prec = c - '0';
13623                    while (--fmtcnt >= 0) {
13624                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13625                        if (c < '0' || c > '9')
13626                            break;
13627                        if (prec > (INT_MAX - ((int)c - '0')) / 10) {
13628                            PyErr_SetString(PyExc_ValueError,
13629                                            "prec too big");
13630                            goto onError;
13631                        }
13632                        prec = prec*10 + (c - '0');
13633                    }
13634                }
13635            } /* prec */
13636            if (fmtcnt >= 0) {
13637                if (c == 'h' || c == 'l' || c == 'L') {
13638                    if (--fmtcnt >= 0)
13639                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13640                }
13641            }
13642            if (fmtcnt < 0) {
13643                PyErr_SetString(PyExc_ValueError,
13644                                "incomplete format");
13645                goto onError;
13646            }
13647            if (fmtcnt == 0)
13648                writer.overallocate = 0;
13649
13650            if (c == '%') {
13651                if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
13652                    goto onError;
13653                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13654                writer.pos += 1;
13655                continue;
13656            }
13657
13658            v = getnextarg(args, arglen, &argidx);
13659            if (v == NULL)
13660                goto onError;
13661
13662            sign = 0;
13663            signchar = '\0';
13664            fill = ' ';
13665            switch (c) {
13666
13667            case 's':
13668            case 'r':
13669            case 'a':
13670                if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13671                    /* Fast path */
13672                    if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13673                        goto onError;
13674                    goto nextarg;
13675                }
13676
13677                if (PyUnicode_CheckExact(v) && c == 's') {
13678                    temp = v;
13679                    Py_INCREF(temp);
13680                }
13681                else {
13682                    if (c == 's')
13683                        temp = PyObject_Str(v);
13684                    else if (c == 'r')
13685                        temp = PyObject_Repr(v);
13686                    else
13687                        temp = PyObject_ASCII(v);
13688                }
13689                break;
13690
13691            case 'i':
13692            case 'd':
13693            case 'u':
13694            case 'o':
13695            case 'x':
13696            case 'X':
13697                if (PyLong_CheckExact(v)
13698                    && width == -1 && prec == -1
13699                    && !(flags & (F_SIGN | F_BLANK)))
13700                {
13701                    /* Fast path */
13702                    switch(c)
13703                    {
13704                    case 'd':
13705                    case 'i':
13706                    case 'u':
13707                        if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13708                            goto onError;
13709                        goto nextarg;
13710                    case 'x':
13711                        if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13712                            goto onError;
13713                        goto nextarg;
13714                    case 'o':
13715                        if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13716                            goto onError;
13717                        goto nextarg;
13718                    default:
13719                        break;
13720                    }
13721                }
13722
13723                isnumok = 0;
13724                if (PyNumber_Check(v)) {
13725                    PyObject *iobj=NULL;
13726
13727                    if (PyLong_Check(v)) {
13728                        iobj = v;
13729                        Py_INCREF(iobj);
13730                    }
13731                    else {
13732                        iobj = PyNumber_Long(v);
13733                    }
13734                    if (iobj!=NULL) {
13735                        if (PyLong_Check(iobj)) {
13736                            isnumok = 1;
13737                            sign = 1;
13738                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
13739                            Py_DECREF(iobj);
13740                        }
13741                        else {
13742                            Py_DECREF(iobj);
13743                        }
13744                    }
13745                }
13746                if (!isnumok) {
13747                    PyErr_Format(PyExc_TypeError,
13748                                 "%%%c format: a number is required, "
13749                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13750                    goto onError;
13751                }
13752                if (flags & F_ZERO)
13753                    fill = '0';
13754                break;
13755
13756            case 'e':
13757            case 'E':
13758            case 'f':
13759            case 'F':
13760            case 'g':
13761            case 'G':
13762                if (width == -1 && prec == -1
13763                    && !(flags & (F_SIGN | F_BLANK)))
13764                {
13765                    /* Fast path */
13766                    if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13767                        goto onError;
13768                    goto nextarg;
13769                }
13770
13771                sign = 1;
13772                if (flags & F_ZERO)
13773                    fill = '0';
13774                if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13775                    temp = NULL;
13776                break;
13777
13778            case 'c':
13779            {
13780                Py_UCS4 ch = formatchar(v);
13781                if (ch == (Py_UCS4) -1)
13782                    goto onError;
13783                if (width == -1 && prec == -1) {
13784                    /* Fast path */
13785                    if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13786                        goto onError;
13787                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13788                    writer.pos += 1;
13789                    goto nextarg;
13790                }
13791                temp = PyUnicode_FromOrdinal(ch);
13792                break;
13793            }
13794
13795            default:
13796                PyErr_Format(PyExc_ValueError,
13797                             "unsupported format character '%c' (0x%x) "
13798                             "at index %zd",
13799                             (31<=c && c<=126) ? (char)c : '?',
13800                             (int)c,
13801                             fmtpos - 1);
13802                goto onError;
13803            }
13804            if (temp == NULL)
13805                goto onError;
13806            assert (PyUnicode_Check(temp));
13807
13808            if (width == -1 && prec == -1
13809                && !(flags & (F_SIGN | F_BLANK)))
13810            {
13811                /* Fast path */
13812                if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13813                    goto onError;
13814                goto nextarg;
13815            }
13816
13817            if (PyUnicode_READY(temp) == -1) {
13818                Py_CLEAR(temp);
13819                goto onError;
13820            }
13821            kind = PyUnicode_KIND(temp);
13822            pbuf = PyUnicode_DATA(temp);
13823            len = PyUnicode_GET_LENGTH(temp);
13824
13825            if (c == 's' || c == 'r' || c == 'a') {
13826                if (prec >= 0 && len > prec)
13827                    len = prec;
13828            }
13829
13830            /* pbuf is initialized here. */
13831            pindex = 0;
13832            if (sign) {
13833                Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13834                if (ch == '-' || ch == '+') {
13835                    signchar = ch;
13836                    len--;
13837                    pindex++;
13838                }
13839                else if (flags & F_SIGN)
13840                    signchar = '+';
13841                else if (flags & F_BLANK)
13842                    signchar = ' ';
13843                else
13844                    sign = 0;
13845            }
13846            if (width < len)
13847                width = len;
13848
13849            /* Compute the length and maximum character of the
13850               written characters */
13851            bufmaxchar = 127;
13852            if (!(flags & F_LJUST)) {
13853                if (sign) {
13854                    if ((width-1) > len)
13855                        bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13856                }
13857                else {
13858                    if (width > len)
13859                        bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13860                }
13861            }
13862            maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
13863            bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13864
13865            buflen = width;
13866            if (sign && len == width)
13867                buflen++;
13868
13869            if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
13870                goto onError;
13871
13872            /* Write characters */
13873            if (sign) {
13874                if (fill != ' ') {
13875                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13876                    writer.pos += 1;
13877                }
13878                if (width > len)
13879                    width--;
13880            }
13881            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13882                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13883                assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
13884                if (fill != ' ') {
13885                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13886                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13887                    writer.pos += 2;
13888                    pindex += 2;
13889                }
13890                width -= 2;
13891                if (width < 0)
13892                    width = 0;
13893                len -= 2;
13894            }
13895            if (width > len && !(flags & F_LJUST)) {
13896                sublen = width - len;
13897                FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13898                writer.pos += sublen;
13899                width = len;
13900            }
13901            if (fill == ' ') {
13902                if (sign) {
13903                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13904                    writer.pos += 1;
13905                }
13906                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13907                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13908                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13909                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13910                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13911                    writer.pos += 2;
13912                    pindex += 2;
13913                }
13914            }
13915
13916            if (len) {
13917                _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13918                                              temp, pindex, len);
13919                writer.pos += len;
13920            }
13921            if (width > len) {
13922                sublen = width - len;
13923                FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13924                writer.pos += sublen;
13925            }
13926
13927nextarg:
13928            if (dict && (argidx < arglen) && c != '%') {
13929                PyErr_SetString(PyExc_TypeError,
13930                                "not all arguments converted during string formatting");
13931                goto onError;
13932            }
13933            Py_CLEAR(temp);
13934        } /* '%' */
13935    } /* until end */
13936    if (argidx < arglen && !dict) {
13937        PyErr_SetString(PyExc_TypeError,
13938                        "not all arguments converted during string formatting");
13939        goto onError;
13940    }
13941
13942    if (args_owned) {
13943        Py_DECREF(args);
13944    }
13945    Py_DECREF(uformat);
13946    Py_XDECREF(temp);
13947    Py_XDECREF(second);
13948    return _PyUnicodeWriter_Finish(&writer);
13949
13950  onError:
13951    Py_DECREF(uformat);
13952    Py_XDECREF(temp);
13953    Py_XDECREF(second);
13954    _PyUnicodeWriter_Dealloc(&writer);
13955    if (args_owned) {
13956        Py_DECREF(args);
13957    }
13958    return NULL;
13959}
13960
13961static PyObject *
13962unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13963
13964static PyObject *
13965unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13966{
13967    PyObject *x = NULL;
13968    static char *kwlist[] = {"object", "encoding", "errors", 0};
13969    char *encoding = NULL;
13970    char *errors = NULL;
13971
13972    if (type != &PyUnicode_Type)
13973        return unicode_subtype_new(type, args, kwds);
13974    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
13975                                     kwlist, &x, &encoding, &errors))
13976        return NULL;
13977    if (x == NULL) {
13978        Py_INCREF(unicode_empty);
13979        return unicode_empty;
13980    }
13981    if (encoding == NULL && errors == NULL)
13982        return PyObject_Str(x);
13983    else
13984        return PyUnicode_FromEncodedObject(x, encoding, errors);
13985}
13986
13987static PyObject *
13988unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13989{
13990    PyObject *unicode, *self;
13991    Py_ssize_t length, char_size;
13992    int share_wstr, share_utf8;
13993    unsigned int kind;
13994    void *data;
13995
13996    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13997
13998    unicode = unicode_new(&PyUnicode_Type, args, kwds);
13999    if (unicode == NULL)
14000        return NULL;
14001    assert(_PyUnicode_CHECK(unicode));
14002    if (PyUnicode_READY(unicode) == -1) {
14003        Py_DECREF(unicode);
14004        return NULL;
14005    }
14006
14007    self = type->tp_alloc(type, 0);
14008    if (self == NULL) {
14009        Py_DECREF(unicode);
14010        return NULL;
14011    }
14012    kind = PyUnicode_KIND(unicode);
14013    length = PyUnicode_GET_LENGTH(unicode);
14014
14015    _PyUnicode_LENGTH(self) = length;
14016#ifdef Py_DEBUG
14017    _PyUnicode_HASH(self) = -1;
14018#else
14019    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14020#endif
14021    _PyUnicode_STATE(self).interned = 0;
14022    _PyUnicode_STATE(self).kind = kind;
14023    _PyUnicode_STATE(self).compact = 0;
14024    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14025    _PyUnicode_STATE(self).ready = 1;
14026    _PyUnicode_WSTR(self) = NULL;
14027    _PyUnicode_UTF8_LENGTH(self) = 0;
14028    _PyUnicode_UTF8(self) = NULL;
14029    _PyUnicode_WSTR_LENGTH(self) = 0;
14030    _PyUnicode_DATA_ANY(self) = NULL;
14031
14032    share_utf8 = 0;
14033    share_wstr = 0;
14034    if (kind == PyUnicode_1BYTE_KIND) {
14035        char_size = 1;
14036        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14037            share_utf8 = 1;
14038    }
14039    else if (kind == PyUnicode_2BYTE_KIND) {
14040        char_size = 2;
14041        if (sizeof(wchar_t) == 2)
14042            share_wstr = 1;
14043    }
14044    else {
14045        assert(kind == PyUnicode_4BYTE_KIND);
14046        char_size = 4;
14047        if (sizeof(wchar_t) == 4)
14048            share_wstr = 1;
14049    }
14050
14051    /* Ensure we won't overflow the length. */
14052    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14053        PyErr_NoMemory();
14054        goto onError;
14055    }
14056    data = PyObject_MALLOC((length + 1) * char_size);
14057    if (data == NULL) {
14058        PyErr_NoMemory();
14059        goto onError;
14060    }
14061
14062    _PyUnicode_DATA_ANY(self) = data;
14063    if (share_utf8) {
14064        _PyUnicode_UTF8_LENGTH(self) = length;
14065        _PyUnicode_UTF8(self) = data;
14066    }
14067    if (share_wstr) {
14068        _PyUnicode_WSTR_LENGTH(self) = length;
14069        _PyUnicode_WSTR(self) = (wchar_t *)data;
14070    }
14071
14072    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14073              kind * (length + 1));
14074    assert(_PyUnicode_CheckConsistency(self, 1));
14075#ifdef Py_DEBUG
14076    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14077#endif
14078    Py_DECREF(unicode);
14079    return self;
14080
14081onError:
14082    Py_DECREF(unicode);
14083    Py_DECREF(self);
14084    return NULL;
14085}
14086
/* Docstring of the str type; installed in the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(object[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");

/* Defined after the iterator type; declared here for the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Instances are created through unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14143
14144/* Initialize the Unicode implementation */
14145
14146int _PyUnicode_Init(void)
14147{
14148    int i;
14149
14150    /* XXX - move this array to unicodectype.c ? */
14151    Py_UCS2 linebreak[] = {
14152        0x000A, /* LINE FEED */
14153        0x000D, /* CARRIAGE RETURN */
14154        0x001C, /* FILE SEPARATOR */
14155        0x001D, /* GROUP SEPARATOR */
14156        0x001E, /* RECORD SEPARATOR */
14157        0x0085, /* NEXT LINE */
14158        0x2028, /* LINE SEPARATOR */
14159        0x2029, /* PARAGRAPH SEPARATOR */
14160    };
14161
14162    /* Init the implementation */
14163    unicode_empty = PyUnicode_New(0, 0);
14164    if (!unicode_empty)
14165        Py_FatalError("Can't create empty string");
14166    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
14167
14168    for (i = 0; i < 256; i++)
14169        unicode_latin1[i] = NULL;
14170    if (PyType_Ready(&PyUnicode_Type) < 0)
14171        Py_FatalError("Can't initialize 'unicode'");
14172
14173    /* initialize the linebreak bloom filter */
14174    bloom_linebreak = make_bloom_mask(
14175        PyUnicode_2BYTE_KIND, linebreak,
14176        Py_ARRAY_LENGTH(linebreak));
14177
14178    PyType_Ready(&EncodingMapType);
14179
14180#ifdef HAVE_MBCS
14181    winver.dwOSVersionInfoSize = sizeof(winver);
14182    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14183        PyErr_SetFromWindowsErr(0);
14184        return -1;
14185    }
14186#endif
14187    return 0;
14188}
14189
14190/* Finalize the Unicode implementation */
14191
/* This implementation does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
14197
14198void
14199_PyUnicode_Fini(void)
14200{
14201    int i;
14202
14203    Py_XDECREF(unicode_empty);
14204    unicode_empty = NULL;
14205
14206    for (i = 0; i < 256; i++) {
14207        if (unicode_latin1[i]) {
14208            Py_DECREF(unicode_latin1[i]);
14209            unicode_latin1[i] = NULL;
14210        }
14211    }
14212    _PyUnicode_ClearStaticStrings();
14213    (void)PyUnicode_ClearFreeList();
14214}
14215
14216void
14217PyUnicode_InternInPlace(PyObject **p)
14218{
14219    register PyObject *s = *p;
14220    PyObject *t;
14221#ifdef Py_DEBUG
14222    assert(s != NULL);
14223    assert(_PyUnicode_CHECK(s));
14224#else
14225    if (s == NULL || !PyUnicode_Check(s))
14226        return;
14227#endif
14228    /* If it's a subclass, we don't really know what putting
14229       it in the interned dict might do. */
14230    if (!PyUnicode_CheckExact(s))
14231        return;
14232    if (PyUnicode_CHECK_INTERNED(s))
14233        return;
14234    if (interned == NULL) {
14235        interned = PyDict_New();
14236        if (interned == NULL) {
14237            PyErr_Clear(); /* Don't leave an exception */
14238            return;
14239        }
14240    }
14241    /* It might be that the GetItem call fails even
14242       though the key is present in the dictionary,
14243       namely when this happens during a stack overflow. */
14244    Py_ALLOW_RECURSION
14245    t = PyDict_GetItem(interned, s);
14246    Py_END_ALLOW_RECURSION
14247
14248        if (t) {
14249            Py_INCREF(t);
14250            Py_DECREF(*p);
14251            *p = t;
14252            return;
14253        }
14254
14255    PyThreadState_GET()->recursion_critical = 1;
14256    if (PyDict_SetItem(interned, s, s) < 0) {
14257        PyErr_Clear();
14258        PyThreadState_GET()->recursion_critical = 0;
14259        return;
14260    }
14261    PyThreadState_GET()->recursion_critical = 0;
14262    /* The two references in interned are not counted by refcnt.
14263       The deallocator will take care of this */
14264    Py_REFCNT(s) -= 2;
14265    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
14266}
14267
14268void
14269PyUnicode_InternImmortal(PyObject **p)
14270{
14271    PyUnicode_InternInPlace(p);
14272    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
14273        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
14274        Py_INCREF(*p);
14275    }
14276}
14277
14278PyObject *
14279PyUnicode_InternFromString(const char *cp)
14280{
14281    PyObject *s = PyUnicode_FromString(cp);
14282    if (s == NULL)
14283        return NULL;
14284    PyUnicode_InternInPlace(&s);
14285    return s;
14286}
14287
14288void
14289_Py_ReleaseInternedUnicodeStrings(void)
14290{
14291    PyObject *keys;
14292    PyObject *s;
14293    Py_ssize_t i, n;
14294    Py_ssize_t immortal_size = 0, mortal_size = 0;
14295
14296    if (interned == NULL || !PyDict_Check(interned))
14297        return;
14298    keys = PyDict_Keys(interned);
14299    if (keys == NULL || !PyList_Check(keys)) {
14300        PyErr_Clear();
14301        return;
14302    }
14303
14304    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14305       detector, interned unicode strings are not forcibly deallocated;
14306       rather, we give them their stolen references back, and then clear
14307       and DECREF the interned dict. */
14308
14309    n = PyList_GET_SIZE(keys);
14310    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
14311            n);
14312    for (i = 0; i < n; i++) {
14313        s = PyList_GET_ITEM(keys, i);
14314        if (PyUnicode_READY(s) == -1) {
14315            assert(0 && "could not ready string");
14316            fprintf(stderr, "could not ready string\n");
14317        }
14318        switch (PyUnicode_CHECK_INTERNED(s)) {
14319        case SSTATE_NOT_INTERNED:
14320            /* XXX Shouldn't happen */
14321            break;
14322        case SSTATE_INTERNED_IMMORTAL:
14323            Py_REFCNT(s) += 1;
14324            immortal_size += PyUnicode_GET_LENGTH(s);
14325            break;
14326        case SSTATE_INTERNED_MORTAL:
14327            Py_REFCNT(s) += 2;
14328            mortal_size += PyUnicode_GET_LENGTH(s);
14329            break;
14330        default:
14331            Py_FatalError("Inconsistent interned string state.");
14332        }
14333        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
14334    }
14335    fprintf(stderr, "total size of all interned strings: "
14336            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14337            "mortal/immortal\n", mortal_size, immortal_size);
14338    Py_DECREF(keys);
14339    PyDict_Clear(interned);
14340    Py_DECREF(interned);
14341    interned = NULL;
14342}
14343
14344
14345/********************* Unicode Iterator **************************/
14346
/* State of a str iterator: the string being iterated over and the index
   of the next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14352
14353static void
14354unicodeiter_dealloc(unicodeiterobject *it)
14355{
14356    _PyObject_GC_UNTRACK(it);
14357    Py_XDECREF(it->it_seq);
14358    PyObject_GC_Del(it);
14359}
14360
/* tp_traverse of the str iterator: the iterated string is the only object
   reference held.  Py_VISIT relies on the parameter names "visit" and
   "arg". */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
14367
14368static PyObject *
14369unicodeiter_next(unicodeiterobject *it)
14370{
14371    PyObject *seq, *item;
14372
14373    assert(it != NULL);
14374    seq = it->it_seq;
14375    if (seq == NULL)
14376        return NULL;
14377    assert(_PyUnicode_CHECK(seq));
14378
14379    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14380        int kind = PyUnicode_KIND(seq);
14381        void *data = PyUnicode_DATA(seq);
14382        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14383        item = PyUnicode_FromOrdinal(chr);
14384        if (item != NULL)
14385            ++it->it_index;
14386        return item;
14387    }
14388
14389    Py_DECREF(seq);
14390    it->it_seq = NULL;
14391    return NULL;
14392}
14393
14394static PyObject *
14395unicodeiter_len(unicodeiterobject *it)
14396{
14397    Py_ssize_t len = 0;
14398    if (it->it_seq)
14399        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14400    return PyLong_FromSsize_t(len);
14401}
14402
14403PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14404
14405static PyObject *
14406unicodeiter_reduce(unicodeiterobject *it)
14407{
14408    if (it->it_seq != NULL) {
14409        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
14410                             it->it_seq, it->it_index);
14411    } else {
14412        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14413        if (u == NULL)
14414            return NULL;
14415        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
14416    }
14417}
14418
14419PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14420
14421static PyObject *
14422unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14423{
14424    Py_ssize_t index = PyLong_AsSsize_t(state);
14425    if (index == -1 && PyErr_Occurred())
14426        return NULL;
14427    if (index < 0)
14428        index = 0;
14429    it->it_index = index;
14430    Py_RETURN_NONE;
14431}
14432
14433PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14434
/* Methods of the str iterator: length hint plus the pickling protocol
   (__reduce__ / __setstate__). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
14444
/* Type object of the iterator returned by unicode_iter(). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
14477
14478static PyObject *
14479unicode_iter(PyObject *seq)
14480{
14481    unicodeiterobject *it;
14482
14483    if (!PyUnicode_Check(seq)) {
14484        PyErr_BadInternalCall();
14485        return NULL;
14486    }
14487    if (PyUnicode_READY(seq) == -1)
14488        return NULL;
14489    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14490    if (it == NULL)
14491        return NULL;
14492    it->it_index = 0;
14493    Py_INCREF(seq);
14494    it->it_seq = seq;
14495    _PyObject_GC_TRACK(it);
14496    return (PyObject *)it;
14497}
14498
14499
/* Return the number of Py_UNICODE units that precede the terminating null
   character of u. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    /* Count in size_t, the return type: an int counter overflows
       (undefined behaviour) on strings longer than INT_MAX units. */
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
14508
/* Copy the null-terminated string s2, terminator included, to s1 and
   return s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
14516
/* Copy at most n units of the null-terminated string s2 to s1 and return
   s1.

   Copying stops right after a null character has been copied.  The rest of
   s1 is not padded, and if s2 has no null character within its first n
   units the result is not null-terminated.  Never writes more than n
   units; nothing is written when n is 0. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    /* Test the remaining budget before each store, so the n-th unit is the
       last one that can be written. */
    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14526
/* Append the null-terminated string s2 to the null-terminated string s1
   and return s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* The copy starts on s1's terminating null character. */
    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);

    Py_UNICODE_strcpy(tail, s2);
    return s1;
}
14535
/* Compare two null-terminated strings.  Returns -1, 0 or +1: the sign of
   the comparison of the first differing units, with a string that ends
   first ordering before the longer one. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (; *s1 && *s2; s1++, s2++) {
        if (*s1 != *s2)
            return (*s1 < *s2) ? -1 : +1;
    }
    /* At least one of the strings has ended. */
    if (*s1)
        return 1;
    if (*s2)
        return -1;
    return 0;
}
14549
/* Compare at most the first n units of two null-terminated strings.
   Returns -1 or +1 at the first differing unit, and 0 if the strings are
   equal up to a null character or over the first n units. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE u1 = s1[i];
        Py_UNICODE u2 = s2[i];

        if (u1 != u2)
            return (u1 < u2) ? -1 : +1;
        if (u1 == '\0')
            return 0;
    }
    return 0;
}
14566
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if there is none.  The terminating null character is
   not searched, so looking for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14576
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if there is none.  The terminating null character is
   not searched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t i = Py_UNICODE_strlen(s);

    /* Walk backwards from the last unit before the terminator. */
    while (i > 0) {
        i--;
        if (s[i] == c)
            return (Py_UNICODE*)(s + i);
    }
    return NULL;
}
14589
14590Py_UNICODE*
14591PyUnicode_AsUnicodeCopy(PyObject *unicode)
14592{
14593    Py_UNICODE *u, *copy;
14594    Py_ssize_t len, size;
14595
14596    if (!PyUnicode_Check(unicode)) {
14597        PyErr_BadArgument();
14598        return NULL;
14599    }
14600    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14601    if (u == NULL)
14602        return NULL;
14603    /* Ensure we won't overflow the size. */
14604    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14605        PyErr_NoMemory();
14606        return NULL;
14607    }
14608    size = len + 1; /* copy the null character */
14609    size *= sizeof(Py_UNICODE);
14610    copy = PyMem_Malloc(size);
14611    if (copy == NULL) {
14612        PyErr_NoMemory();
14613        return NULL;
14614    }
14615    memcpy(copy, u, size);
14616    return copy;
14617}
14618
14619/* A _string module, to export formatter_parser and formatter_field_name_split
14620   to the string.Formatter class implemented in Python. */
14621
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}
};
14629
/* Definition of the _string module: a name, a docstring and the method
   table above.  NOTE(review): the trailing NULL entries are presumably the
   optional PyModuleDef hooks, all unused -- confirm against the
   PyModuleDef declaration. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",
    PyDoc_STR("string helper module"),
    0,
    _string_methods,
    NULL,
    NULL,
    NULL,
    NULL
};
14641
14642PyMODINIT_FUNC
14643PyInit__string(void)
14644{
14645    return PyModule_Create(&_string_module);
14646}
14647
14648
14649#ifdef __cplusplus
14650}
14651#endif
14652