unicodeobject.c revision c379ead9afe114e1023ad64a9dea9a3a9a869ecf
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44
45#ifdef MS_WINDOWS
46#include <windows.h>
47#endif
48
/* Limit for the Unicode object free list: the maximum number of Unicode
   objects that may be parked on the free list at any one time.

   NOTE(review): none of the allocators in the visible part of this file
   consult this limit -- confirm it is still used before relying on it. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.

   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined: the former when the build defines WORDS_BIGENDIAN, the latter
   in every other case. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
80/* --- Globals ------------------------------------------------------------
81
82   The globals are initialized by the _PyUnicode_Init() API and should
83   not be used before calling that API.
84
85*/
86
87
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Sanity check used inside assert() by the macros below.  Debug builds
   validate the whole state of the object through
   _PyUnicode_CheckConsistency(); other builds only check the type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the utf8 field of a PyCompactUnicodeObject.  No checks are
   performed and the expansion is an lvalue, so it can be assigned to. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked read of the UTF-8 representation of a ready string.  For compact
   ASCII strings this is the memory that immediately follows the
   PyASCIIObject header; otherwise it is whatever the utf8 field holds. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access (lvalue) to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked read of the UTF-8 length of a ready string.  For compact ASCII
   strings this is the length field of the PyASCIIObject header; otherwise
   it is the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked lvalue accessors for individual header fields.  Note which
   struct each field is reached through: wstr, length, state and hash are
   read via PyASCIIObject, wstr_length via PyCompactUnicodeObject. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Read state.kind after asserting the object passes _PyUnicode_CHECK().
   Unlike the PyUnicode_UTF8* macros above, readiness is not asserted. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Read the length field after asserting the object passes
   _PyUnicode_CHECK(); readiness is not asserted. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked lvalue access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY: evaluates to 0 without a
   function call when the string is already ready, otherwise to the result
   of _PyUnicode_Ready().  The argument is evaluated more than once. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* True if the utf8 pointer of op is the very same address as its character
   data (PyUnicode_DATA), i.e. the UTF-8 form is not a separate buffer.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the very same address as its character
   data (PyUnicode_DATA), i.e. the wchar_t form is not a separate buffer.
   Only the macro argument is referenced, so the result does not depend on
   what the caller's variable happens to be called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
140
/* Evaluates to 1 when op owns a UTF-8 buffer of its own and to 0 otherwise.
   "Owning" means all of: op is not a compact ASCII string, its utf8 pointer
   is set, and that pointer is not merely an alias of the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     !(PyUnicode_IS_COMPACT_ASCII(op)                   \
       || _PyUnicode_UTF8(op) == NULL                   \
       || _PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
148
/* Element-wise copy between character buffers of different widths.

   from_type / to_type   valid type names of the source and target elements
   begin, end            "from_type *" pointers delimiting the source range
                         [begin, end)
   to                    start of the destination buffer; it is cast to
                         "to_type *" and must have room for (end - begin)
                         elements

   Every source element is converted with a plain (to_type) cast.  The
   `end` expression is re-evaluated on each iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *src_ = (begin);                \
        to_type *dst_ = (to_type *)(to);                \
        while (src_ < (end)) {                          \
            *dst_++ = (to_type)*src_++;                 \
        }                                               \
    } while (0)
163
/* Call after modifying the contents of a Unicode string: the cached hash
   is no longer valid, so it is reset to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
166
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance.  Once it has
   been created, PyUnicode_New() and _PyUnicode_New() hand out a new
   reference to it for every zero-length request instead of allocating. */
static PyObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table has one slot per code point 0..255. */
static PyObject *unicode_latin1[256];
183
/* Whitespace lookup table for the ASCII range (128 entries, one row per
   16 code points).  Entry c is 1 when code point c is one of the frequent
   whitespace characters listed next to its row, and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
214
215static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
216
217static PyObject *
218unicode_encode_call_errorhandler(const char *errors,
219       PyObject **errorHandler,const char *encoding, const char *reason,
220       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
221       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
222
223static void
224raise_encode_exception(PyObject **exceptionObject,
225                       const char *encoding,
226                       const Py_UNICODE *unicode, Py_ssize_t size,
227                       Py_ssize_t startpos, Py_ssize_t endpos,
228                       const char *reason);
229
/* Line break lookup table for the ASCII range (128 entries, one row per
   16 code points).  Entry c is 1 when code point c is one of the line
   break characters listed next to its row, and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
257
258/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
259   This function is kept for backward compatibility with the old API. */
260Py_UNICODE
261PyUnicode_GetMax(void)
262{
263#ifdef Py_UNICODE_WIDE
264    return 0x10FFFF;
265#else
266    /* This is actually an illegal character, so it should
267       not be passed to unichr. */
268    return 0xFFFF;
269#endif
270}
271
#ifdef Py_DEBUG
/* Debug-build validation of the state flags of a Unicode object.

   Every invariant is enforced with assert(), so a violation aborts; when
   the object is consistent the function returns 1, which lets it be used
   as the argument of an enclosing assert().  Three layouts are recognised:

     - ASCII:            1-byte kind, compact, ready
     - compact non-ASCII: 1/2/4-byte kind, ready, ascii flag clear
     - legacy:           not compact; either an unready wchar_t string
                         (wstr set, no data, no utf8, not interned) or a
                         ready 1/2/4-byte string with a data pointer */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *base;
    PyCompactUnicodeObject *compact;
    PyUnicodeObject *legacy;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.compact == 1);
        assert(base->state.ready == 1);
        return 1;
    }

    if (base->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(base->state.ascii == 0);
        assert(base->state.ready == 1);
        return 1;
    }

    /* Legacy (non-compact) object from here on. */
    compact = (PyCompactUnicodeObject *)op;
    legacy = (PyUnicodeObject *)op;

    if (kind == PyUnicode_WCHAR_KIND) {
        assert(base->state.compact == 0);
        assert(base->state.ascii == 0);
        assert(base->state.ready == 0);
        assert(base->wstr != NULL);
        assert(legacy->data.any == NULL);
        assert(compact->utf8 == NULL);
        assert(base->state.interned == SSTATE_NOT_INTERNED);
    }
    else {
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(base->state.compact == 0);
        assert(base->state.ready == 1);
        assert(legacy->data.any != NULL);
        assert(base->state.ascii == 0);
    }
    return 1;
}
#endif
322
/* --- Bloom Filters ----------------------------------------------------- */

/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used; the low-order bits of
   each character (selected with BLOOM_WIDTH - 1, i.e. 5, 6 or 7 bits
   depending on BLOOM_WIDTH) serve as the bit index. */

/* the linebreak mask is set up by Unicode_Init below
   (NOTE(review): the initialisation is outside the visible part of this
   file -- confirm the function name) */

/* BLOOM_WIDTH is the number of mask bits used: the largest of 128, 64 and
   32 that does not exceed LONG_BIT.  Builds with a narrower long are
   rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK() for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;
344
/* BLOOM_ADD(mask, ch): set, in the lvalue `mask`, the bit selected by the
   low-order bits of `ch` (ch & (BLOOM_WIDTH - 1)); evaluates to the
   updated mask.

   BLOOM(mask, ch): non-zero if that bit is set in `mask`.  A zero result
   means the character is definitely absent; a non-zero result only means
   it may be present.

   Both arguments are parenthesised in the expansion so that compound
   expressions such as `a | b` can be passed as the mask. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak; all others are pre-filtered with the bloom_linebreak
   mask before Py_UNICODE_ISLINEBREAK() is consulted.  `ch` is evaluated
   more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
351
352Py_LOCAL_INLINE(BLOOM_MASK)
353make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
354{
355    /* calculate simple bloom-style bitmask for a given unicode string */
356
357    BLOOM_MASK mask;
358    Py_ssize_t i;
359
360    mask = 0;
361    for (i = 0; i < len; i++)
362        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
363
364    return mask;
365}
366
/* 1 if `chr` occurs in the string `str`, else 0.  The bloom `mask` is
   checked first so that PyUnicode_FindChar() only runs for characters the
   mask cannot rule out. */
#define BLOOM_MEMBER(mask, chr, str) \
    ((BLOOM(mask, chr) != 0) \
     && (0 <= PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1)))
370
371/* --- Unicode Object ----------------------------------------------------- */
372
373static PyObject *
374fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
375
376Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
377                                 Py_ssize_t size, Py_UCS4 ch,
378                                 int direction)
379{
380    /* like wcschr, but doesn't stop at NULL characters */
381    Py_ssize_t i;
382    if (direction == 1) {
383        for(i = 0; i < size; i++)
384            if (PyUnicode_READ(kind, s, i) == ch)
385                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
386    }
387    else {
388        for(i = size-1; i >= 0; i--)
389            if (PyUnicode_READ(kind, s, i) == ch)
390                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
391    }
392    return NULL;
393}
394
395static PyObject*
396resize_compact(PyObject *unicode, Py_ssize_t length)
397{
398    Py_ssize_t char_size;
399    Py_ssize_t struct_size;
400    Py_ssize_t new_size;
401    int share_wstr;
402
403    assert(PyUnicode_IS_READY(unicode));
404    char_size = PyUnicode_CHARACTER_SIZE(unicode);
405    if (PyUnicode_IS_COMPACT_ASCII(unicode))
406        struct_size = sizeof(PyASCIIObject);
407    else
408        struct_size = sizeof(PyCompactUnicodeObject);
409    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
410
411    _Py_DEC_REFTOTAL;
412    _Py_ForgetReference(unicode);
413
414    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
415        PyErr_NoMemory();
416        return NULL;
417    }
418    new_size = (struct_size + (length + 1) * char_size);
419
420    unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
421    if (unicode == NULL) {
422        PyObject_Del(unicode);
423        PyErr_NoMemory();
424        return NULL;
425    }
426    _Py_NewReference(unicode);
427    _PyUnicode_LENGTH(unicode) = length;
428    if (share_wstr) {
429        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
430        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
431            _PyUnicode_WSTR_LENGTH(unicode) = length;
432    }
433    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
434                    length, 0);
435    return unicode;
436}
437
438static int
439resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
440{
441    void *oldstr;
442
443    assert(!PyUnicode_IS_COMPACT(unicode));
444
445    assert(Py_REFCNT(unicode) == 1);
446    _PyUnicode_DIRTY(unicode);
447
448    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
449    {
450        PyObject_DEL(_PyUnicode_UTF8(unicode));
451        _PyUnicode_UTF8(unicode) = NULL;
452    }
453
454    if (PyUnicode_IS_READY(unicode)) {
455        Py_ssize_t char_size;
456        Py_ssize_t new_size;
457        int share_wstr, share_utf8;
458        void *data;
459
460        data = _PyUnicode_DATA_ANY(unicode);
461        assert(data != NULL);
462        char_size = PyUnicode_CHARACTER_SIZE(unicode);
463        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
464        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
465
466        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
467            PyErr_NoMemory();
468            return -1;
469        }
470        new_size = (length + 1) * char_size;
471
472        data = (PyObject *)PyObject_REALLOC(data, new_size);
473        if (data == NULL) {
474            PyErr_NoMemory();
475            return -1;
476        }
477        _PyUnicode_DATA_ANY(unicode) = data;
478        if (share_wstr) {
479            _PyUnicode_WSTR(unicode) = data;
480            _PyUnicode_WSTR_LENGTH(unicode) = length;
481        }
482        if (share_utf8) {
483            _PyUnicode_UTF8(unicode) = data;
484            _PyUnicode_UTF8_LENGTH(unicode) = length;
485        }
486        _PyUnicode_LENGTH(unicode) = length;
487        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
488        if (share_wstr)
489            return 0;
490    }
491    if (_PyUnicode_WSTR(unicode) != NULL) {
492        assert(_PyUnicode_WSTR(unicode) != NULL);
493
494        oldstr = _PyUnicode_WSTR(unicode);
495        _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
496                                         sizeof(Py_UNICODE) * (length + 1));
497        if (!_PyUnicode_WSTR(unicode)) {
498            _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
499            PyErr_NoMemory();
500            return -1;
501        }
502        _PyUnicode_WSTR(unicode)[length] = 0;
503        _PyUnicode_WSTR_LENGTH(unicode) = length;
504    }
505    return 0;
506}
507
508static PyObject*
509resize_copy(PyObject *unicode, Py_ssize_t length)
510{
511    Py_ssize_t copy_length;
512    if (PyUnicode_IS_COMPACT(unicode)) {
513        PyObject *copy;
514        assert(PyUnicode_IS_READY(unicode));
515
516        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
517        if (copy == NULL)
518            return NULL;
519
520        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
521        if (PyUnicode_CopyCharacters(copy, 0,
522                                     unicode, 0,
523                                     copy_length) < 0)
524        {
525            Py_DECREF(copy);
526            return NULL;
527        }
528        return copy;
529    } else {
530        PyUnicodeObject *w;
531        assert(_PyUnicode_WSTR(unicode) != NULL);
532        assert(_PyUnicode_DATA_ANY(unicode) == NULL);
533        w = _PyUnicode_New(length);
534        if (w == NULL)
535            return NULL;
536        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
537        copy_length = Py_MIN(copy_length, length);
538        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
539                        copy_length);
540        return (PyObject*)w;
541    }
542}
543
/* We allocate one more character than requested to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
552
553#ifdef Py_DEBUG
554int unicode_old_new_calls = 0;
555#endif
556
557static PyUnicodeObject *
558_PyUnicode_New(Py_ssize_t length)
559{
560    register PyUnicodeObject *unicode;
561    size_t new_size;
562
563    /* Optimization for empty strings */
564    if (length == 0 && unicode_empty != NULL) {
565        Py_INCREF(unicode_empty);
566        return (PyUnicodeObject*)unicode_empty;
567    }
568
569    /* Ensure we won't overflow the size. */
570    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
571        return (PyUnicodeObject *)PyErr_NoMemory();
572    }
573    if (length < 0) {
574        PyErr_SetString(PyExc_SystemError,
575                        "Negative size passed to _PyUnicode_New");
576        return NULL;
577    }
578
579#ifdef Py_DEBUG
580    ++unicode_old_new_calls;
581#endif
582
583    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
584    if (unicode == NULL)
585        return NULL;
586    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
587    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
588    if (!_PyUnicode_WSTR(unicode)) {
589        PyErr_NoMemory();
590        goto onError;
591    }
592
593    /* Initialize the first element to guard against cases where
594     * the caller fails before initializing str -- unicode_resize()
595     * reads str[0], and the Keep-Alive optimization can keep memory
596     * allocated for str alive across a call to unicode_dealloc(unicode).
597     * We don't want unicode_resize to read uninitialized memory in
598     * that case.
599     */
600    _PyUnicode_WSTR(unicode)[0] = 0;
601    _PyUnicode_WSTR(unicode)[length] = 0;
602    _PyUnicode_WSTR_LENGTH(unicode) = length;
603    _PyUnicode_HASH(unicode) = -1;
604    _PyUnicode_STATE(unicode).interned = 0;
605    _PyUnicode_STATE(unicode).kind = 0;
606    _PyUnicode_STATE(unicode).compact = 0;
607    _PyUnicode_STATE(unicode).ready = 0;
608    _PyUnicode_STATE(unicode).ascii = 0;
609    _PyUnicode_DATA_ANY(unicode) = NULL;
610    _PyUnicode_LENGTH(unicode) = 0;
611    _PyUnicode_UTF8(unicode) = NULL;
612    _PyUnicode_UTF8_LENGTH(unicode) = 0;
613    return unicode;
614
615  onError:
616    /* XXX UNREF/NEWREF interface should be more symmetrical */
617    _Py_DEC_REFTOTAL;
618    _Py_ForgetReference((PyObject *)unicode);
619    PyObject_Del(unicode);
620    return NULL;
621}
622
623static const char*
624unicode_kind_name(PyObject *unicode)
625{
626    assert(_PyUnicode_CHECK(unicode));
627    if (!PyUnicode_IS_COMPACT(unicode))
628    {
629        if (!PyUnicode_IS_READY(unicode))
630            return "wstr";
631        switch(PyUnicode_KIND(unicode))
632        {
633        case PyUnicode_1BYTE_KIND:
634            if (PyUnicode_IS_COMPACT_ASCII(unicode))
635                return "legacy ascii";
636            else
637                return "legacy latin1";
638        case PyUnicode_2BYTE_KIND:
639            return "legacy UCS2";
640        case PyUnicode_4BYTE_KIND:
641            return "legacy UCS4";
642        default:
643            return "<legacy invalid kind>";
644        }
645    }
646    assert(PyUnicode_IS_READY(unicode));
647    switch(PyUnicode_KIND(unicode))
648    {
649    case PyUnicode_1BYTE_KIND:
650        if (PyUnicode_IS_COMPACT_ASCII(unicode))
651            return "ascii";
652        else
653            return "compact latin1";
654    case PyUnicode_2BYTE_KIND:
655        return "compact UCS2";
656    case PyUnicode_4BYTE_KIND:
657        return "compact UCS4";
658    default:
659        return "<invalid compact kind>";
660    }
661}
662
663#ifdef Py_DEBUG
664int unicode_new_new_calls = 0;
665
666/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);

    return utf8;
}
670
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);

    return data;
}
674void *_PyUnicode_data(void *unicode){
675    printf("obj %p\n", unicode);
676    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
677    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
678    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
679    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
680    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
681    return PyUnicode_DATA(unicode);
682}
683
684void
685_PyUnicode_Dump(PyObject *op)
686{
687    PyASCIIObject *ascii = (PyASCIIObject *)op;
688    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
689    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
690    void *data;
691    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
692    if (ascii->state.compact)
693        data = (compact + 1);
694    else
695        data = unicode->data.any;
696    if (ascii->wstr == data)
697        printf("shared ");
698    printf("wstr=%p", ascii->wstr);
699    if (!ascii->state.ascii) {
700        printf(" (%zu), ", compact->wstr_length);
701        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
702            printf("shared ");
703        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
704    }
705    printf(", data=%p\n", data);
706}
707#endif
708
709PyObject *
710PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
711{
712    PyObject *obj;
713    PyCompactUnicodeObject *unicode;
714    void *data;
715    int kind_state;
716    int is_sharing = 0, is_ascii = 0;
717    Py_ssize_t char_size;
718    Py_ssize_t struct_size;
719
720    /* Optimization for empty strings */
721    if (size == 0 && unicode_empty != NULL) {
722        Py_INCREF(unicode_empty);
723        return unicode_empty;
724    }
725
726#ifdef Py_DEBUG
727    ++unicode_new_new_calls;
728#endif
729
730    struct_size = sizeof(PyCompactUnicodeObject);
731    if (maxchar < 128) {
732        kind_state = PyUnicode_1BYTE_KIND;
733        char_size = 1;
734        is_ascii = 1;
735        struct_size = sizeof(PyASCIIObject);
736    }
737    else if (maxchar < 256) {
738        kind_state = PyUnicode_1BYTE_KIND;
739        char_size = 1;
740    }
741    else if (maxchar < 65536) {
742        kind_state = PyUnicode_2BYTE_KIND;
743        char_size = 2;
744        if (sizeof(wchar_t) == 2)
745            is_sharing = 1;
746    }
747    else {
748        kind_state = PyUnicode_4BYTE_KIND;
749        char_size = 4;
750        if (sizeof(wchar_t) == 4)
751            is_sharing = 1;
752    }
753
754    /* Ensure we won't overflow the size. */
755    if (size < 0) {
756        PyErr_SetString(PyExc_SystemError,
757                        "Negative size passed to PyUnicode_New");
758        return NULL;
759    }
760    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
761        return PyErr_NoMemory();
762
763    /* Duplicated allocation code from _PyObject_New() instead of a call to
764     * PyObject_New() so we are able to allocate space for the object and
765     * it's data buffer.
766     */
767    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
768    if (obj == NULL)
769        return PyErr_NoMemory();
770    obj = PyObject_INIT(obj, &PyUnicode_Type);
771    if (obj == NULL)
772        return NULL;
773
774    unicode = (PyCompactUnicodeObject *)obj;
775    if (is_ascii)
776        data = ((PyASCIIObject*)obj) + 1;
777    else
778        data = unicode + 1;
779    _PyUnicode_LENGTH(unicode) = size;
780    _PyUnicode_HASH(unicode) = -1;
781    _PyUnicode_STATE(unicode).interned = 0;
782    _PyUnicode_STATE(unicode).kind = kind_state;
783    _PyUnicode_STATE(unicode).compact = 1;
784    _PyUnicode_STATE(unicode).ready = 1;
785    _PyUnicode_STATE(unicode).ascii = is_ascii;
786    if (is_ascii) {
787        ((char*)data)[size] = 0;
788        _PyUnicode_WSTR(unicode) = NULL;
789    }
790    else if (kind_state == PyUnicode_1BYTE_KIND) {
791        ((char*)data)[size] = 0;
792        _PyUnicode_WSTR(unicode) = NULL;
793        _PyUnicode_WSTR_LENGTH(unicode) = 0;
794        unicode->utf8_length = 0;
795        unicode->utf8 = NULL;
796        }
797    else {
798        unicode->utf8 = NULL;
799        if (kind_state == PyUnicode_2BYTE_KIND)
800            ((Py_UCS2*)data)[size] = 0;
801        else /* kind_state == PyUnicode_4BYTE_KIND */
802            ((Py_UCS4*)data)[size] = 0;
803        if (is_sharing) {
804            _PyUnicode_WSTR_LENGTH(unicode) = size;
805            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
806        }
807        else {
808            _PyUnicode_WSTR_LENGTH(unicode) = 0;
809            _PyUnicode_WSTR(unicode) = NULL;
810        }
811    }
812    return obj;
813}
814
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   A high surrogate (0xD800-0xDBFF) directly followed by a low surrogate
   (0xDC00-0xDFFF) is combined into one code point; every other unit,
   including an unpaired surrogate, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   The output is written to the 4-byte data of `unicode`; the asserts
   require that the number of code points produced equals the length
   already stored in `unicode`. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (is_pair) {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
853
854static int
855_PyUnicode_Dirty(PyObject *unicode)
856{
857    assert(_PyUnicode_CHECK(unicode));
858    if (Py_REFCNT(unicode) != 1) {
859        PyErr_SetString(PyExc_ValueError,
860                        "Cannot modify a string having more than 1 reference");
861        return -1;
862    }
863    _PyUnicode_DIRTY(unicode);
864    return 0;
865}
866
867Py_ssize_t
868PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
869                         PyObject *from, Py_ssize_t from_start,
870                         Py_ssize_t how_many)
871{
872    unsigned int from_kind, to_kind;
873    void *from_data, *to_data;
874
875    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
876        PyErr_BadInternalCall();
877        return -1;
878    }
879
880    if (PyUnicode_READY(from))
881        return -1;
882    if (PyUnicode_READY(to))
883        return -1;
884
885    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
886    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
887        PyErr_Format(PyExc_ValueError,
888                     "Cannot write %zi characters at %zi "
889                     "in a string of %zi characters",
890                     how_many, to_start, PyUnicode_GET_LENGTH(to));
891        return -1;
892    }
893    if (how_many == 0)
894        return 0;
895
896    if (_PyUnicode_Dirty(to))
897        return -1;
898
899    from_kind = PyUnicode_KIND(from);
900    from_data = PyUnicode_DATA(from);
901    to_kind = PyUnicode_KIND(to);
902    to_data = PyUnicode_DATA(to);
903
904    if (from_kind == to_kind
905        /* deny latin1 => ascii */
906        && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
907    {
908        Py_MEMCPY((char*)to_data
909                      + PyUnicode_KIND_SIZE(to_kind, to_start),
910                  (char*)from_data
911                      + PyUnicode_KIND_SIZE(from_kind, from_start),
912                  PyUnicode_KIND_SIZE(to_kind, how_many));
913    }
914    else if (from_kind == PyUnicode_1BYTE_KIND
915             && to_kind == PyUnicode_2BYTE_KIND)
916    {
917        _PyUnicode_CONVERT_BYTES(
918            Py_UCS1, Py_UCS2,
919            PyUnicode_1BYTE_DATA(from) + from_start,
920            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
921            PyUnicode_2BYTE_DATA(to) + to_start
922            );
923    }
924    else if (from_kind == PyUnicode_1BYTE_KIND
925             && to_kind == PyUnicode_4BYTE_KIND)
926    {
927        _PyUnicode_CONVERT_BYTES(
928            Py_UCS1, Py_UCS4,
929            PyUnicode_1BYTE_DATA(from) + from_start,
930            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
931            PyUnicode_4BYTE_DATA(to) + to_start
932            );
933    }
934    else if (from_kind == PyUnicode_2BYTE_KIND
935             && to_kind == PyUnicode_4BYTE_KIND)
936    {
937        _PyUnicode_CONVERT_BYTES(
938            Py_UCS2, Py_UCS4,
939            PyUnicode_2BYTE_DATA(from) + from_start,
940            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
941            PyUnicode_4BYTE_DATA(to) + to_start
942            );
943    }
944    else {
945        int invalid_kinds;
946
947        /* check if max_char(from substring) <= max_char(to) */
948        if (from_kind > to_kind
949                /* latin1 => ascii */
950            || (PyUnicode_IS_COMPACT_ASCII(to)
951                && to_kind == PyUnicode_1BYTE_KIND
952                && !PyUnicode_IS_COMPACT_ASCII(from)))
953        {
954            /* slow path to check for character overflow */
955            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
956            Py_UCS4 ch, maxchar;
957            Py_ssize_t i;
958
959            maxchar = 0;
960            invalid_kinds = 0;
961            for (i=0; i < how_many; i++) {
962                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
963                if (ch > maxchar) {
964                    maxchar = ch;
965                    if (maxchar > to_maxchar) {
966                        invalid_kinds = 1;
967                        break;
968                    }
969                }
970                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
971            }
972        }
973        else
974            invalid_kinds = 1;
975        if (invalid_kinds) {
976            PyErr_Format(PyExc_ValueError,
977                         "Cannot copy %s characters "
978                         "into a string of %s characters",
979                         unicode_kind_name(from),
980                         unicode_kind_name(to));
981            return -1;
982        }
983    }
984    return how_many;
985}
986
987/* Find the maximum code point and count the number of surrogate pairs so a
988   correct string length can be computed before converting a string to UCS4.
989   This function counts single surrogates as a character and not as a pair.
990
991   Return 0 on success, or -1 on error. */
992static int
993find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
994                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
995{
996    const wchar_t *iter;
997
998    assert(num_surrogates != NULL && maxchar != NULL);
999    if (num_surrogates == NULL || maxchar == NULL) {
1000        PyErr_SetString(PyExc_SystemError,
1001                        "unexpected NULL arguments to "
1002                        "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1003        return -1;
1004    }
1005
1006    *num_surrogates = 0;
1007    *maxchar = 0;
1008
1009    for (iter = begin; iter < end; ) {
1010        if (*iter > *maxchar)
1011            *maxchar = *iter;
1012#if SIZEOF_WCHAR_T == 2
1013        if (*iter >= 0xD800 && *iter <= 0xDBFF
1014            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1015        {
1016            Py_UCS4 surrogate_val;
1017            surrogate_val = (((iter[0] & 0x3FF)<<10)
1018                             | (iter[1] & 0x3FF)) + 0x10000;
1019            ++(*num_surrogates);
1020            if (surrogate_val > *maxchar)
1021                *maxchar = surrogate_val;
1022            iter += 2;
1023        }
1024        else
1025            iter++;
1026#else
1027        iter++;
1028#endif
1029    }
1030    return 0;
1031}
1032
1033#ifdef Py_DEBUG
1034int unicode_ready_calls = 0;
1035#endif
1036
1037int
1038_PyUnicode_Ready(PyObject *obj)
1039{
1040    PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
1041    wchar_t *end;
1042    Py_UCS4 maxchar = 0;
1043    Py_ssize_t num_surrogates;
1044#if SIZEOF_WCHAR_T == 2
1045    Py_ssize_t length_wo_surrogates;
1046#endif
1047
1048    /* _PyUnicode_Ready() is only intented for old-style API usage where
1049       strings were created using _PyObject_New() and where no canonical
1050       representation (the str field) has been set yet aka strings
1051       which are not yet ready. */
1052    assert(_PyUnicode_CHECK(unicode));
1053    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1054    assert(_PyUnicode_WSTR(unicode) != NULL);
1055    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1056    assert(_PyUnicode_UTF8(unicode) == NULL);
1057    /* Actually, it should neither be interned nor be anything else: */
1058    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1059
1060#ifdef Py_DEBUG
1061    ++unicode_ready_calls;
1062#endif
1063
1064    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1065    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1066                                &maxchar, &num_surrogates) == -1)
1067        return -1;
1068
1069    if (maxchar < 256) {
1070        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1071        if (!_PyUnicode_DATA_ANY(unicode)) {
1072            PyErr_NoMemory();
1073            return -1;
1074        }
1075        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1076                                _PyUnicode_WSTR(unicode), end,
1077                                PyUnicode_1BYTE_DATA(unicode));
1078        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1079        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1080        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1081        if (maxchar < 128) {
1082            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1083            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1084        }
1085        else {
1086            _PyUnicode_UTF8(unicode) = NULL;
1087            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1088        }
1089        PyObject_FREE(_PyUnicode_WSTR(unicode));
1090        _PyUnicode_WSTR(unicode) = NULL;
1091        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1092    }
1093    /* In this case we might have to convert down from 4-byte native
1094       wchar_t to 2-byte unicode. */
1095    else if (maxchar < 65536) {
1096        assert(num_surrogates == 0 &&
1097               "FindMaxCharAndNumSurrogatePairs() messed up");
1098
1099#if SIZEOF_WCHAR_T == 2
1100        /* We can share representations and are done. */
1101        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1102        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1103        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1104        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1105        _PyUnicode_UTF8(unicode) = NULL;
1106        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1107#else
1108        /* sizeof(wchar_t) == 4 */
1109        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1110            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1111        if (!_PyUnicode_DATA_ANY(unicode)) {
1112            PyErr_NoMemory();
1113            return -1;
1114        }
1115        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1116                                _PyUnicode_WSTR(unicode), end,
1117                                PyUnicode_2BYTE_DATA(unicode));
1118        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1119        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1120        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1121        _PyUnicode_UTF8(unicode) = NULL;
1122        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1123        PyObject_FREE(_PyUnicode_WSTR(unicode));
1124        _PyUnicode_WSTR(unicode) = NULL;
1125        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1126#endif
1127    }
1128    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1129    else {
1130#if SIZEOF_WCHAR_T == 2
1131        /* in case the native representation is 2-bytes, we need to allocate a
1132           new normalized 4-byte version. */
1133        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1134        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1135        if (!_PyUnicode_DATA_ANY(unicode)) {
1136            PyErr_NoMemory();
1137            return -1;
1138        }
1139        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1140        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1141        _PyUnicode_UTF8(unicode) = NULL;
1142        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1143        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1144        _PyUnicode_STATE(unicode).ready = 1;
1145        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1146        PyObject_FREE(_PyUnicode_WSTR(unicode));
1147        _PyUnicode_WSTR(unicode) = NULL;
1148        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1149#else
1150        assert(num_surrogates == 0);
1151
1152        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1153        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1154        _PyUnicode_UTF8(unicode) = NULL;
1155        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1156        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1157#endif
1158        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1159    }
1160    _PyUnicode_STATE(unicode).ready = 1;
1161    return 0;
1162}
1163
1164static void
1165unicode_dealloc(register PyUnicodeObject *unicode)
1166{
1167    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1168    case SSTATE_NOT_INTERNED:
1169        break;
1170
1171    case SSTATE_INTERNED_MORTAL:
1172        /* revive dead object temporarily for DelItem */
1173        Py_REFCNT(unicode) = 3;
1174        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1175            Py_FatalError(
1176                "deletion of interned string failed");
1177        break;
1178
1179    case SSTATE_INTERNED_IMMORTAL:
1180        Py_FatalError("Immortal interned string died.");
1181
1182    default:
1183        Py_FatalError("Inconsistent interned string state.");
1184    }
1185
1186    if (_PyUnicode_WSTR(unicode) &&
1187        (!PyUnicode_IS_READY(unicode) ||
1188         _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1189        PyObject_DEL(_PyUnicode_WSTR(unicode));
1190    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1191        PyObject_DEL(_PyUnicode_UTF8(unicode));
1192
1193    if (PyUnicode_IS_COMPACT(unicode)) {
1194        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1195    }
1196    else {
1197        if (_PyUnicode_DATA_ANY(unicode))
1198            PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1199        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1200    }
1201}
1202
1203static int
1204unicode_resizable(PyObject *unicode)
1205{
1206    Py_ssize_t len;
1207    if (Py_REFCNT(unicode) != 1)
1208        return 0;
1209    if (PyUnicode_CHECK_INTERNED(unicode))
1210        return 0;
1211    if (unicode == unicode_empty)
1212        return 0;
1213    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1214        len = PyUnicode_WSTR_LENGTH(unicode);
1215    else
1216        len = PyUnicode_GET_LENGTH(unicode);
1217    if (len == 1) {
1218        Py_UCS4 ch;
1219        if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1220            ch = _PyUnicode_WSTR(unicode)[0];
1221        else
1222            ch = PyUnicode_READ_CHAR(unicode, 0);
1223        if (ch < 256 && unicode_latin1[ch] == unicode)
1224            return 0;
1225    }
1226    return 1;
1227}
1228
1229static int
1230unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1231{
1232    PyObject *unicode;
1233    Py_ssize_t old_length;
1234
1235    assert(p_unicode != NULL);
1236    unicode = *p_unicode;
1237
1238    assert(unicode != NULL);
1239    assert(PyUnicode_Check(unicode));
1240    assert(0 <= length);
1241
1242    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1243        old_length = PyUnicode_WSTR_LENGTH(unicode);
1244    else
1245        old_length = PyUnicode_GET_LENGTH(unicode);
1246    if (old_length == length)
1247        return 0;
1248
1249    /* FIXME: really create a new object? */
1250    if (!unicode_resizable(unicode)) {
1251        PyObject *copy = resize_copy(unicode, length);
1252        if (copy == NULL)
1253            return -1;
1254        Py_DECREF(*p_unicode);
1255        *p_unicode = copy;
1256        return 0;
1257    }
1258
1259    if (PyUnicode_IS_COMPACT(unicode)) {
1260        *p_unicode = resize_compact(unicode, length);
1261        if (*p_unicode == NULL)
1262            return -1;
1263        return 0;
1264    } else
1265        return resize_inplace((PyUnicodeObject*)unicode, length);
1266}
1267
1268int
1269PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1270{
1271    PyObject *unicode;
1272    if (p_unicode == NULL) {
1273        PyErr_BadInternalCall();
1274        return -1;
1275    }
1276    unicode = *p_unicode;
1277    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1278        || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1279    {
1280        PyErr_BadInternalCall();
1281        return -1;
1282    }
1283    return unicode_resize(p_unicode, length);
1284}
1285
1286static PyObject*
1287get_latin1_char(unsigned char ch)
1288{
1289    PyObject *unicode = unicode_latin1[ch];
1290    if (!unicode) {
1291        unicode = PyUnicode_New(1, ch);
1292        if (!unicode)
1293            return NULL;
1294        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1295        unicode_latin1[ch] = unicode;
1296    }
1297    Py_INCREF(unicode);
1298    return unicode;
1299}
1300
1301PyObject *
1302PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1303{
1304    PyUnicodeObject *unicode;
1305    Py_UCS4 maxchar = 0;
1306    Py_ssize_t num_surrogates;
1307
1308    if (u == NULL)
1309        return (PyObject*)_PyUnicode_New(size);
1310
1311    /* If the Unicode data is known at construction time, we can apply
1312       some optimizations which share commonly used objects. */
1313
1314    /* Optimization for empty strings */
1315    if (size == 0 && unicode_empty != NULL) {
1316        Py_INCREF(unicode_empty);
1317        return unicode_empty;
1318    }
1319
1320    /* Single character Unicode objects in the Latin-1 range are
1321       shared when using this constructor */
1322    if (size == 1 && *u < 256)
1323        return get_latin1_char((unsigned char)*u);
1324
1325    /* If not empty and not single character, copy the Unicode data
1326       into the new object */
1327    if (find_maxchar_surrogates(u, u + size,
1328                                &maxchar, &num_surrogates) == -1)
1329        return NULL;
1330
1331    unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1332                                                maxchar);
1333    if (!unicode)
1334        return NULL;
1335
1336    switch (PyUnicode_KIND(unicode)) {
1337    case PyUnicode_1BYTE_KIND:
1338        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1339                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1340        break;
1341    case PyUnicode_2BYTE_KIND:
1342#if Py_UNICODE_SIZE == 2
1343        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1344#else
1345        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1346                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1347#endif
1348        break;
1349    case PyUnicode_4BYTE_KIND:
1350#if SIZEOF_WCHAR_T == 2
1351        /* This is the only case which has to process surrogates, thus
1352           a simple copy loop is not enough and we need a function. */
1353        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1354#else
1355        assert(num_surrogates == 0);
1356        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1357#endif
1358        break;
1359    default:
1360        assert(0 && "Impossible state");
1361    }
1362
1363    return (PyObject *)unicode;
1364}
1365
1366PyObject *
1367PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1368{
1369    PyUnicodeObject *unicode;
1370
1371    if (size < 0) {
1372        PyErr_SetString(PyExc_SystemError,
1373                        "Negative size passed to PyUnicode_FromStringAndSize");
1374        return NULL;
1375    }
1376
1377    /* If the Unicode data is known at construction time, we can apply
1378       some optimizations which share commonly used objects.
1379       Also, this means the input must be UTF-8, so fall back to the
1380       UTF-8 decoder at the end. */
1381    if (u != NULL) {
1382
1383        /* Optimization for empty strings */
1384        if (size == 0 && unicode_empty != NULL) {
1385            Py_INCREF(unicode_empty);
1386            return unicode_empty;
1387        }
1388
1389        /* Single characters are shared when using this constructor.
1390           Restrict to ASCII, since the input must be UTF-8. */
1391        if (size == 1 && Py_CHARMASK(*u) < 128)
1392            return get_latin1_char(Py_CHARMASK(*u));
1393
1394        return PyUnicode_DecodeUTF8(u, size, NULL);
1395    }
1396
1397    unicode = _PyUnicode_New(size);
1398    if (!unicode)
1399        return NULL;
1400
1401    return (PyObject *)unicode;
1402}
1403
1404PyObject *
1405PyUnicode_FromString(const char *u)
1406{
1407    size_t size = strlen(u);
1408    if (size > PY_SSIZE_T_MAX) {
1409        PyErr_SetString(PyExc_OverflowError, "input too long");
1410        return NULL;
1411    }
1412
1413    return PyUnicode_FromStringAndSize(u, size);
1414}
1415
1416static PyObject*
1417_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1418{
1419    PyObject *res;
1420    unsigned char max = 127;
1421    Py_ssize_t i;
1422    for (i = 0; i < size; i++) {
1423        if (u[i] & 0x80) {
1424            max = 255;
1425            break;
1426        }
1427    }
1428    res = PyUnicode_New(size, max);
1429    if (!res)
1430        return NULL;
1431    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1432    return res;
1433}
1434
1435static PyObject*
1436_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1437{
1438    PyObject *res;
1439    Py_UCS2 max = 0;
1440    Py_ssize_t i;
1441    for (i = 0; i < size; i++)
1442        if (u[i] > max)
1443            max = u[i];
1444    res = PyUnicode_New(size, max);
1445    if (!res)
1446        return NULL;
1447    if (max >= 256)
1448        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1449    else
1450        for (i = 0; i < size; i++)
1451            PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1452    return res;
1453}
1454
1455static PyObject*
1456_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1457{
1458    PyObject *res;
1459    Py_UCS4 max = 0;
1460    Py_ssize_t i;
1461    for (i = 0; i < size; i++)
1462        if (u[i] > max)
1463            max = u[i];
1464    res = PyUnicode_New(size, max);
1465    if (!res)
1466        return NULL;
1467    if (max >= 0x10000)
1468        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1469    else {
1470        int kind = PyUnicode_KIND(res);
1471        void *data = PyUnicode_DATA(res);
1472        for (i = 0; i < size; i++)
1473            PyUnicode_WRITE(kind, data, i, u[i]);
1474    }
1475    return res;
1476}
1477
1478PyObject*
1479PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1480{
1481    switch(kind) {
1482    case PyUnicode_1BYTE_KIND:
1483        return _PyUnicode_FromUCS1(buffer, size);
1484    case PyUnicode_2BYTE_KIND:
1485        return _PyUnicode_FromUCS2(buffer, size);
1486    case PyUnicode_4BYTE_KIND:
1487        return _PyUnicode_FromUCS4(buffer, size);
1488    }
1489    PyErr_SetString(PyExc_ValueError, "invalid kind");
1490    return NULL;
1491}
1492
1493PyObject*
1494PyUnicode_Copy(PyObject *unicode)
1495{
1496    Py_ssize_t size;
1497    PyObject *copy;
1498    void *data;
1499
1500    if (!PyUnicode_Check(unicode)) {
1501        PyErr_BadInternalCall();
1502        return NULL;
1503    }
1504    if (PyUnicode_READY(unicode))
1505        return NULL;
1506
1507    size = PyUnicode_GET_LENGTH(unicode);
1508    copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1509    if (!copy)
1510        return NULL;
1511    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1512
1513    data = PyUnicode_DATA(unicode);
1514    switch (PyUnicode_KIND(unicode))
1515    {
1516    case PyUnicode_1BYTE_KIND:
1517        memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1518        break;
1519    case PyUnicode_2BYTE_KIND:
1520        memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1521        break;
1522    case PyUnicode_4BYTE_KIND:
1523        memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1524        break;
1525    default:
1526        assert(0);
1527        break;
1528    }
1529    return copy;
1530}
1531
1532
1533/* Widen Unicode objects to larger buffers. Don't write terminating null
1534   character. Return NULL on error. */
1535
1536void*
1537_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1538{
1539    Py_ssize_t len;
1540    void *result;
1541    unsigned int skind;
1542
1543    if (PyUnicode_READY(s))
1544        return NULL;
1545
1546    len = PyUnicode_GET_LENGTH(s);
1547    skind = PyUnicode_KIND(s);
1548    if (skind >= kind) {
1549        PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1550        return NULL;
1551    }
1552    switch(kind) {
1553    case PyUnicode_2BYTE_KIND:
1554        result = PyMem_Malloc(len * sizeof(Py_UCS2));
1555        if (!result)
1556            return PyErr_NoMemory();
1557        assert(skind == PyUnicode_1BYTE_KIND);
1558        _PyUnicode_CONVERT_BYTES(
1559            Py_UCS1, Py_UCS2,
1560            PyUnicode_1BYTE_DATA(s),
1561            PyUnicode_1BYTE_DATA(s) + len,
1562            result);
1563        return result;
1564    case PyUnicode_4BYTE_KIND:
1565        result = PyMem_Malloc(len * sizeof(Py_UCS4));
1566        if (!result)
1567            return PyErr_NoMemory();
1568        if (skind == PyUnicode_2BYTE_KIND) {
1569            _PyUnicode_CONVERT_BYTES(
1570                Py_UCS2, Py_UCS4,
1571                PyUnicode_2BYTE_DATA(s),
1572                PyUnicode_2BYTE_DATA(s) + len,
1573                result);
1574        }
1575        else {
1576            assert(skind == PyUnicode_1BYTE_KIND);
1577            _PyUnicode_CONVERT_BYTES(
1578                Py_UCS1, Py_UCS4,
1579                PyUnicode_1BYTE_DATA(s),
1580                PyUnicode_1BYTE_DATA(s) + len,
1581                result);
1582        }
1583        return result;
1584    default:
1585        break;
1586    }
1587    PyErr_SetString(PyExc_ValueError, "invalid kind");
1588    return NULL;
1589}
1590
1591static Py_UCS4*
1592as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1593        int copy_null)
1594{
1595    int kind;
1596    void *data;
1597    Py_ssize_t len, targetlen;
1598    if (PyUnicode_READY(string) == -1)
1599        return NULL;
1600    kind = PyUnicode_KIND(string);
1601    data = PyUnicode_DATA(string);
1602    len = PyUnicode_GET_LENGTH(string);
1603    targetlen = len;
1604    if (copy_null)
1605        targetlen++;
1606    if (!target) {
1607        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1608            PyErr_NoMemory();
1609            return NULL;
1610        }
1611        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1612        if (!target) {
1613            PyErr_NoMemory();
1614            return NULL;
1615        }
1616    }
1617    else {
1618        if (targetsize < targetlen) {
1619            PyErr_Format(PyExc_SystemError,
1620                         "string is longer than the buffer");
1621            if (copy_null && 0 < targetsize)
1622                target[0] = 0;
1623            return NULL;
1624        }
1625    }
1626    if (kind != PyUnicode_4BYTE_KIND) {
1627        Py_ssize_t i;
1628        for (i = 0; i < len; i++)
1629            target[i] = PyUnicode_READ(kind, data, i);
1630    }
1631    else
1632        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1633    if (copy_null)
1634        target[len] = 0;
1635    return target;
1636}
1637
1638Py_UCS4*
1639PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1640                 int copy_null)
1641{
1642    if (target == NULL || targetsize < 1) {
1643        PyErr_BadInternalCall();
1644        return NULL;
1645    }
1646    return as_ucs4(string, target, targetsize, copy_null);
1647}
1648
1649Py_UCS4*
1650PyUnicode_AsUCS4Copy(PyObject *string)
1651{
1652    return as_ucs4(string, NULL, 0, 1);
1653}
1654
1655#ifdef HAVE_WCHAR_H
1656
1657PyObject *
1658PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
1659{
1660    if (w == NULL) {
1661        if (size == 0)
1662            return PyUnicode_New(0, 0);
1663        PyErr_BadInternalCall();
1664        return NULL;
1665    }
1666
1667    if (size == -1) {
1668        size = wcslen(w);
1669    }
1670
1671    return PyUnicode_FromUnicode(w, size);
1672}
1673
1674#endif /* HAVE_WCHAR_H */
1675
1676static void
1677makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1678        int zeropad, int width, int precision, char c)
1679{
1680    *fmt++ = '%';
1681    if (width) {
1682        if (zeropad)
1683            *fmt++ = '0';
1684        fmt += sprintf(fmt, "%d", width);
1685    }
1686    if (precision)
1687        fmt += sprintf(fmt, ".%d", precision);
1688    if (longflag)
1689        *fmt++ = 'l';
1690    else if (longlongflag) {
1691        /* longlongflag should only ever be nonzero on machines with
1692           HAVE_LONG_LONG defined */
1693#ifdef HAVE_LONG_LONG
1694        char *f = PY_FORMAT_LONG_LONG;
1695        while (*f)
1696            *fmt++ = *f++;
1697#else
1698        /* we shouldn't ever get here */
1699        assert(0);
1700        *fmt++ = 'l';
1701#endif
1702    }
1703    else if (size_tflag) {
1704        char *f = PY_FORMAT_SIZE_T;
1705        while (*f)
1706            *fmt++ = *f++;
1707    }
1708    *fmt++ = c;
1709    *fmt = '\0';
1710}
1711
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows the '%' at `f`.

   The optional width, the optional ".precision" and an optional length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z" -- each only
   when followed by 'd', 'u' or 'i') are consumed.  Results are stored
   through the output pointers, any of which may be NULL.

   Return a pointer to the conversion character; for a format that ends
   prematurely the pointer is moved back by one so that it never rests on
   the terminating null. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* Step over the '%', then read "width.precision",
       e.g. "%2.5s" => width=2, precision=5. */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        field_width = (field_width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        is_size_t = 1;
        f += 1;
    }

    /* Hand the results back; every output pointer is optional. */
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
1776
1777/* maximum number of characters required for output of %ld.  21 characters
1778   allows for 64-bit integers (in decimal) and an optional sign. */
1779#define MAX_LONG_CHARS 21
1780/* maximum number of characters required for output of %lld.
1781   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1782   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
1783#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1784
1785PyObject *
1786PyUnicode_FromFormatV(const char *format, va_list vargs)
1787{
1788    va_list count;
1789    Py_ssize_t callcount = 0;
1790    PyObject **callresults = NULL;
1791    PyObject **callresult = NULL;
1792    Py_ssize_t n = 0;
1793    int width = 0;
1794    int precision = 0;
1795    int zeropad;
1796    const char* f;
1797    PyUnicodeObject *string;
1798    /* used by sprintf */
1799    char fmt[61]; /* should be enough for %0width.precisionlld */
1800    Py_UCS4 maxchar = 127; /* result is ASCII by default */
1801    Py_UCS4 argmaxchar;
1802    Py_ssize_t numbersize = 0;
1803    char *numberresults = NULL;
1804    char *numberresult = NULL;
1805    Py_ssize_t i;
1806    int kind;
1807    void *data;
1808
1809    Py_VA_COPY(count, vargs);
1810    /* step 1: count the number of %S/%R/%A/%s format specifications
1811     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1812     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
1813     * result in an array)
1814     * also esimate a upper bound for all the number formats in the string,
1815     * numbers will be formated in step 3 and be keept in a '\0'-separated
1816     * buffer before putting everything together. */
1817    for (f = format; *f; f++) {
1818        if (*f == '%') {
1819            int longlongflag;
1820            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1821            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1822            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1823                ++callcount;
1824
1825            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
1826#ifdef HAVE_LONG_LONG
1827                if (longlongflag) {
1828                    if (width < MAX_LONG_LONG_CHARS)
1829                        width = MAX_LONG_LONG_CHARS;
1830                }
1831                else
1832#endif
1833                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1834                       including sign.  Decimal takes the most space.  This
1835                       isn't enough for octal.  If a width is specified we
1836                       need more (which we allocate later). */
1837                    if (width < MAX_LONG_CHARS)
1838                        width = MAX_LONG_CHARS;
1839
1840                /* account for the size + '\0' to separate numbers
1841                   inside of the numberresults buffer */
1842                numbersize += (width + 1);
1843            }
1844        }
1845        else if ((unsigned char)*f > 127) {
1846            PyErr_Format(PyExc_ValueError,
1847                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1848                "string, got a non-ASCII byte: 0x%02x",
1849                (unsigned char)*f);
1850            return NULL;
1851        }
1852    }
1853    /* step 2: allocate memory for the results of
1854     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1855    if (callcount) {
1856        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1857        if (!callresults) {
1858            PyErr_NoMemory();
1859            return NULL;
1860        }
1861        callresult = callresults;
1862    }
1863    /* step 2.5: allocate memory for the results of formating numbers */
1864    if (numbersize) {
1865        numberresults = PyObject_Malloc(numbersize);
1866        if (!numberresults) {
1867            PyErr_NoMemory();
1868            goto fail;
1869        }
1870        numberresult = numberresults;
1871    }
1872
1873    /* step 3: format numbers and figure out how large a buffer we need */
1874    for (f = format; *f; f++) {
1875        if (*f == '%') {
1876            const char* p;
1877            int longflag;
1878            int longlongflag;
1879            int size_tflag;
1880            int numprinted;
1881
1882            p = f;
1883            zeropad = (f[1] == '0');
1884            f = parse_format_flags(f, &width, &precision,
1885                                   &longflag, &longlongflag, &size_tflag);
1886            switch (*f) {
1887            case 'c':
1888            {
1889                Py_UCS4 ordinal = va_arg(count, int);
1890                maxchar = Py_MAX(maxchar, ordinal);
1891                n++;
1892                break;
1893            }
1894            case '%':
1895                n++;
1896                break;
1897            case 'i':
1898            case 'd':
1899                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1900                        width, precision, *f);
1901                if (longflag)
1902                    numprinted = sprintf(numberresult, fmt,
1903                                         va_arg(count, long));
1904#ifdef HAVE_LONG_LONG
1905                else if (longlongflag)
1906                    numprinted = sprintf(numberresult, fmt,
1907                                         va_arg(count, PY_LONG_LONG));
1908#endif
1909                else if (size_tflag)
1910                    numprinted = sprintf(numberresult, fmt,
1911                                         va_arg(count, Py_ssize_t));
1912                else
1913                    numprinted = sprintf(numberresult, fmt,
1914                                         va_arg(count, int));
1915                n += numprinted;
1916                /* advance by +1 to skip over the '\0' */
1917                numberresult += (numprinted + 1);
1918                assert(*(numberresult - 1) == '\0');
1919                assert(*(numberresult - 2) != '\0');
1920                assert(numprinted >= 0);
1921                assert(numberresult <= numberresults + numbersize);
1922                break;
1923            case 'u':
1924                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1925                        width, precision, 'u');
1926                if (longflag)
1927                    numprinted = sprintf(numberresult, fmt,
1928                                         va_arg(count, unsigned long));
1929#ifdef HAVE_LONG_LONG
1930                else if (longlongflag)
1931                    numprinted = sprintf(numberresult, fmt,
1932                                         va_arg(count, unsigned PY_LONG_LONG));
1933#endif
1934                else if (size_tflag)
1935                    numprinted = sprintf(numberresult, fmt,
1936                                         va_arg(count, size_t));
1937                else
1938                    numprinted = sprintf(numberresult, fmt,
1939                                         va_arg(count, unsigned int));
1940                n += numprinted;
1941                numberresult += (numprinted + 1);
1942                assert(*(numberresult - 1) == '\0');
1943                assert(*(numberresult - 2) != '\0');
1944                assert(numprinted >= 0);
1945                assert(numberresult <= numberresults + numbersize);
1946                break;
1947            case 'x':
1948                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1949                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1950                n += numprinted;
1951                numberresult += (numprinted + 1);
1952                assert(*(numberresult - 1) == '\0');
1953                assert(*(numberresult - 2) != '\0');
1954                assert(numprinted >= 0);
1955                assert(numberresult <= numberresults + numbersize);
1956                break;
1957            case 'p':
1958                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1959                /* %p is ill-defined:  ensure leading 0x. */
1960                if (numberresult[1] == 'X')
1961                    numberresult[1] = 'x';
1962                else if (numberresult[1] != 'x') {
1963                    memmove(numberresult + 2, numberresult,
1964                            strlen(numberresult) + 1);
1965                    numberresult[0] = '0';
1966                    numberresult[1] = 'x';
1967                    numprinted += 2;
1968                }
1969                n += numprinted;
1970                numberresult += (numprinted + 1);
1971                assert(*(numberresult - 1) == '\0');
1972                assert(*(numberresult - 2) != '\0');
1973                assert(numprinted >= 0);
1974                assert(numberresult <= numberresults + numbersize);
1975                break;
1976            case 's':
1977            {
1978                /* UTF-8 */
1979                const char *s = va_arg(count, const char*);
1980                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1981                if (!str)
1982                    goto fail;
1983                /* since PyUnicode_DecodeUTF8 returns already flexible
1984                   unicode objects, there is no need to call ready on them */
1985                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
1986                maxchar = Py_MAX(maxchar, argmaxchar);
1987                n += PyUnicode_GET_LENGTH(str);
1988                /* Remember the str and switch to the next slot */
1989                *callresult++ = str;
1990                break;
1991            }
1992            case 'U':
1993            {
1994                PyObject *obj = va_arg(count, PyObject *);
1995                assert(obj && _PyUnicode_CHECK(obj));
1996                if (PyUnicode_READY(obj) == -1)
1997                    goto fail;
1998                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
1999                maxchar = Py_MAX(maxchar, argmaxchar);
2000                n += PyUnicode_GET_LENGTH(obj);
2001                break;
2002            }
2003            case 'V':
2004            {
2005                PyObject *obj = va_arg(count, PyObject *);
2006                const char *str = va_arg(count, const char *);
2007                PyObject *str_obj;
2008                assert(obj || str);
2009                assert(!obj || _PyUnicode_CHECK(obj));
2010                if (obj) {
2011                    if (PyUnicode_READY(obj) == -1)
2012                        goto fail;
2013                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2014                    maxchar = Py_MAX(maxchar, argmaxchar);
2015                    n += PyUnicode_GET_LENGTH(obj);
2016                    *callresult++ = NULL;
2017                }
2018                else {
2019                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2020                    if (!str_obj)
2021                        goto fail;
2022                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2023                    maxchar = Py_MAX(maxchar, argmaxchar);
2024                    n += PyUnicode_GET_LENGTH(str_obj);
2025                    *callresult++ = str_obj;
2026                }
2027                break;
2028            }
2029            case 'S':
2030            {
2031                PyObject *obj = va_arg(count, PyObject *);
2032                PyObject *str;
2033                assert(obj);
2034                str = PyObject_Str(obj);
2035                if (!str || PyUnicode_READY(str) == -1)
2036                    goto fail;
2037                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2038                maxchar = Py_MAX(maxchar, argmaxchar);
2039                n += PyUnicode_GET_LENGTH(str);
2040                /* Remember the str and switch to the next slot */
2041                *callresult++ = str;
2042                break;
2043            }
2044            case 'R':
2045            {
2046                PyObject *obj = va_arg(count, PyObject *);
2047                PyObject *repr;
2048                assert(obj);
2049                repr = PyObject_Repr(obj);
2050                if (!repr || PyUnicode_READY(repr) == -1)
2051                    goto fail;
2052                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2053                maxchar = Py_MAX(maxchar, argmaxchar);
2054                n += PyUnicode_GET_LENGTH(repr);
2055                /* Remember the repr and switch to the next slot */
2056                *callresult++ = repr;
2057                break;
2058            }
2059            case 'A':
2060            {
2061                PyObject *obj = va_arg(count, PyObject *);
2062                PyObject *ascii;
2063                assert(obj);
2064                ascii = PyObject_ASCII(obj);
2065                if (!ascii || PyUnicode_READY(ascii) == -1)
2066                    goto fail;
2067                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2068                maxchar = Py_MAX(maxchar, argmaxchar);
2069                n += PyUnicode_GET_LENGTH(ascii);
2070                /* Remember the repr and switch to the next slot */
2071                *callresult++ = ascii;
2072                break;
2073            }
2074            default:
2075                /* if we stumble upon an unknown
2076                   formatting code, copy the rest of
2077                   the format string to the output
2078                   string. (we cannot just skip the
2079                   code, since there's no way to know
2080                   what's in the argument list) */
2081                n += strlen(p);
2082                goto expand;
2083            }
2084        } else
2085            n++;
2086    }
2087  expand:
2088    /* step 4: fill the buffer */
2089    /* Since we've analyzed how much space we need,
2090       we don't have to resize the string.
2091       There can be no errors beyond this point. */
2092    string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
2093    if (!string)
2094        goto fail;
2095    kind = PyUnicode_KIND(string);
2096    data = PyUnicode_DATA(string);
2097    callresult = callresults;
2098    numberresult = numberresults;
2099
2100    for (i = 0, f = format; *f; f++) {
2101        if (*f == '%') {
2102            const char* p;
2103
2104            p = f;
2105            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2106            /* checking for == because the last argument could be a empty
2107               string, which causes i to point to end, the assert at the end of
2108               the loop */
2109            assert(i <= PyUnicode_GET_LENGTH(string));
2110
2111            switch (*f) {
2112            case 'c':
2113            {
2114                const int ordinal = va_arg(vargs, int);
2115                PyUnicode_WRITE(kind, data, i++, ordinal);
2116                break;
2117            }
2118            case 'i':
2119            case 'd':
2120            case 'u':
2121            case 'x':
2122            case 'p':
2123                /* unused, since we already have the result */
2124                if (*f == 'p')
2125                    (void) va_arg(vargs, void *);
2126                else
2127                    (void) va_arg(vargs, int);
2128                /* extract the result from numberresults and append. */
2129                for (; *numberresult; ++i, ++numberresult)
2130                    PyUnicode_WRITE(kind, data, i, *numberresult);
2131                /* skip over the separating '\0' */
2132                assert(*numberresult == '\0');
2133                numberresult++;
2134                assert(numberresult <= numberresults + numbersize);
2135                break;
2136            case 's':
2137            {
2138                /* unused, since we already have the result */
2139                Py_ssize_t size;
2140                (void) va_arg(vargs, char *);
2141                size = PyUnicode_GET_LENGTH(*callresult);
2142                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2143                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2144                                             *callresult, 0,
2145                                             size) < 0)
2146                    goto fail;
2147                i += size;
2148                /* We're done with the unicode()/repr() => forget it */
2149                Py_DECREF(*callresult);
2150                /* switch to next unicode()/repr() result */
2151                ++callresult;
2152                break;
2153            }
2154            case 'U':
2155            {
2156                PyObject *obj = va_arg(vargs, PyObject *);
2157                Py_ssize_t size;
2158                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2159                size = PyUnicode_GET_LENGTH(obj);
2160                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2161                                             obj, 0,
2162                                             size) < 0)
2163                    goto fail;
2164                i += size;
2165                break;
2166            }
2167            case 'V':
2168            {
2169                Py_ssize_t size;
2170                PyObject *obj = va_arg(vargs, PyObject *);
2171                va_arg(vargs, const char *);
2172                if (obj) {
2173                    size = PyUnicode_GET_LENGTH(obj);
2174                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2175                    if (PyUnicode_CopyCharacters((PyObject*)string, i,
2176                                                 obj, 0,
2177                                                 size) < 0)
2178                        goto fail;
2179                    i += size;
2180                } else {
2181                    size = PyUnicode_GET_LENGTH(*callresult);
2182                    assert(PyUnicode_KIND(*callresult) <=
2183                           PyUnicode_KIND(string));
2184                    if (PyUnicode_CopyCharacters((PyObject*)string, i,
2185                                                 *callresult,
2186                                                 0, size) < 0)
2187                        goto fail;
2188                    i += size;
2189                    Py_DECREF(*callresult);
2190                }
2191                ++callresult;
2192                break;
2193            }
2194            case 'S':
2195            case 'R':
2196            case 'A':
2197            {
2198                /* unused, since we already have the result */
2199                (void) va_arg(vargs, PyObject *);
2200                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2201                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2202                                             *callresult, 0,
2203                                             PyUnicode_GET_LENGTH(*callresult)) < 0)
2204                    goto fail;
2205                i += PyUnicode_GET_LENGTH(*callresult);
2206                /* We're done with the unicode()/repr() => forget it */
2207                Py_DECREF(*callresult);
2208                /* switch to next unicode()/repr() result */
2209                ++callresult;
2210                break;
2211            }
2212            case '%':
2213                PyUnicode_WRITE(kind, data, i++, '%');
2214                break;
2215            default:
2216                for (; *p; ++p, ++i)
2217                    PyUnicode_WRITE(kind, data, i, *p);
2218                assert(i == PyUnicode_GET_LENGTH(string));
2219                goto end;
2220            }
2221        }
2222        else {
2223            assert(i < PyUnicode_GET_LENGTH(string));
2224            PyUnicode_WRITE(kind, data, i++, *f);
2225        }
2226    }
2227    assert(i == PyUnicode_GET_LENGTH(string));
2228
2229  end:
2230    if (callresults)
2231        PyObject_Free(callresults);
2232    if (numberresults)
2233        PyObject_Free(numberresults);
2234    return (PyObject *)string;
2235  fail:
2236    if (callresults) {
2237        PyObject **callresult2 = callresults;
2238        while (callresult2 < callresult) {
2239            Py_XDECREF(*callresult2);
2240            ++callresult2;
2241        }
2242        PyObject_Free(callresults);
2243    }
2244    if (numberresults)
2245        PyObject_Free(numberresults);
2246    return NULL;
2247}
2248
2249PyObject *
2250PyUnicode_FromFormat(const char *format, ...)
2251{
2252    PyObject* ret;
2253    va_list vargs;
2254
2255#ifdef HAVE_STDARG_PROTOTYPES
2256    va_start(vargs, format);
2257#else
2258    va_start(vargs);
2259#endif
2260    ret = PyUnicode_FromFormatV(format, vargs);
2261    va_end(vargs);
2262    return ret;
2263}
2264
2265#ifdef HAVE_WCHAR_H
2266
2267/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2268   convert a Unicode object to a wide character string.
2269
2270   - If w is NULL: return the number of wide characters (including the null
2271     character) required to convert the unicode object. Ignore size argument.
2272
2273   - Otherwise: return the number of wide characters (excluding the null
2274     character) written into w. Write at most size wide characters (including
2275     the null character). */
2276static Py_ssize_t
2277unicode_aswidechar(PyUnicodeObject *unicode,
2278                   wchar_t *w,
2279                   Py_ssize_t size)
2280{
2281    Py_ssize_t res;
2282    const wchar_t *wstr;
2283
2284    wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2285    if (wstr == NULL)
2286        return -1;
2287
2288    if (w != NULL) {
2289        if (size > res)
2290            size = res + 1;
2291        else
2292            res = size;
2293        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2294        return res;
2295    }
2296    else
2297        return res + 1;
2298}
2299
2300Py_ssize_t
2301PyUnicode_AsWideChar(PyObject *unicode,
2302                     wchar_t *w,
2303                     Py_ssize_t size)
2304{
2305    if (unicode == NULL) {
2306        PyErr_BadInternalCall();
2307        return -1;
2308    }
2309    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
2310}
2311
2312wchar_t*
2313PyUnicode_AsWideCharString(PyObject *unicode,
2314                           Py_ssize_t *size)
2315{
2316    wchar_t* buffer;
2317    Py_ssize_t buflen;
2318
2319    if (unicode == NULL) {
2320        PyErr_BadInternalCall();
2321        return NULL;
2322    }
2323
2324    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
2325    if (buflen == -1)
2326        return NULL;
2327    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2328        PyErr_NoMemory();
2329        return NULL;
2330    }
2331
2332    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2333    if (buffer == NULL) {
2334        PyErr_NoMemory();
2335        return NULL;
2336    }
2337    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
2338    if (buflen == -1)
2339        return NULL;
2340    if (size != NULL)
2341        *size = buflen;
2342    return buffer;
2343}
2344
2345#endif /* HAVE_WCHAR_H */
2346
2347PyObject *
2348PyUnicode_FromOrdinal(int ordinal)
2349{
2350    PyObject *v;
2351    if (ordinal < 0 || ordinal > 0x10ffff) {
2352        PyErr_SetString(PyExc_ValueError,
2353                        "chr() arg not in range(0x110000)");
2354        return NULL;
2355    }
2356
2357    if (ordinal < 256)
2358        return get_latin1_char(ordinal);
2359
2360    v = PyUnicode_New(1, ordinal);
2361    if (v == NULL)
2362        return NULL;
2363    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2364    return v;
2365}
2366
2367PyObject *
2368PyUnicode_FromObject(register PyObject *obj)
2369{
2370    /* XXX Perhaps we should make this API an alias of
2371       PyObject_Str() instead ?! */
2372    if (PyUnicode_CheckExact(obj)) {
2373        if (PyUnicode_READY(obj))
2374            return NULL;
2375        Py_INCREF(obj);
2376        return obj;
2377    }
2378    if (PyUnicode_Check(obj)) {
2379        /* For a Unicode subtype that's not a Unicode object,
2380           return a true Unicode object with the same data. */
2381        return PyUnicode_Copy(obj);
2382    }
2383    PyErr_Format(PyExc_TypeError,
2384                 "Can't convert '%.100s' object to str implicitly",
2385                 Py_TYPE(obj)->tp_name);
2386    return NULL;
2387}
2388
2389PyObject *
2390PyUnicode_FromEncodedObject(register PyObject *obj,
2391                            const char *encoding,
2392                            const char *errors)
2393{
2394    Py_buffer buffer;
2395    PyObject *v;
2396
2397    if (obj == NULL) {
2398        PyErr_BadInternalCall();
2399        return NULL;
2400    }
2401
2402    /* Decoding bytes objects is the most common case and should be fast */
2403    if (PyBytes_Check(obj)) {
2404        if (PyBytes_GET_SIZE(obj) == 0) {
2405            Py_INCREF(unicode_empty);
2406            v = unicode_empty;
2407        }
2408        else {
2409            v = PyUnicode_Decode(
2410                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2411                    encoding, errors);
2412        }
2413        return v;
2414    }
2415
2416    if (PyUnicode_Check(obj)) {
2417        PyErr_SetString(PyExc_TypeError,
2418                        "decoding str is not supported");
2419        return NULL;
2420    }
2421
2422    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2423    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2424        PyErr_Format(PyExc_TypeError,
2425                     "coercing to str: need bytes, bytearray "
2426                     "or buffer-like object, %.80s found",
2427                     Py_TYPE(obj)->tp_name);
2428        return NULL;
2429    }
2430
2431    if (buffer.len == 0) {
2432        Py_INCREF(unicode_empty);
2433        v = unicode_empty;
2434    }
2435    else
2436        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2437
2438    PyBuffer_Release(&buffer);
2439    return v;
2440}
2441
/* Write a normalized copy of `encoding` into `lower`: upper-case letters are
   lowered and '_' becomes '-', so that e.g. "UTF_8" compares equal to
   "utf-8".  `lower_len` is the capacity of `lower` in bytes, terminator
   included.

   Returns 1 on success (with `lower` null-terminated) and 0 when `encoding`
   is longer than lower_len-1 characters; in the latter case `lower` is left
   unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output; reserved for the terminating '\0'. */
    char *dst_end = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (dst == dst_end)
            return 0;

        if (ch == '_')
            ch = '-';
        else if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);

        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2474
2475PyObject *
2476PyUnicode_Decode(const char *s,
2477                 Py_ssize_t size,
2478                 const char *encoding,
2479                 const char *errors)
2480{
2481    PyObject *buffer = NULL, *unicode;
2482    Py_buffer info;
2483    char lower[11];  /* Enough for any encoding shortcut */
2484
2485    if (encoding == NULL)
2486        return PyUnicode_DecodeUTF8(s, size, errors);
2487
2488    /* Shortcuts for common default encodings */
2489    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2490        if ((strcmp(lower, "utf-8") == 0) ||
2491            (strcmp(lower, "utf8") == 0))
2492            return PyUnicode_DecodeUTF8(s, size, errors);
2493        else if ((strcmp(lower, "latin-1") == 0) ||
2494                 (strcmp(lower, "latin1") == 0) ||
2495                 (strcmp(lower, "iso-8859-1") == 0))
2496            return PyUnicode_DecodeLatin1(s, size, errors);
2497#ifdef HAVE_MBCS
2498        else if (strcmp(lower, "mbcs") == 0)
2499            return PyUnicode_DecodeMBCS(s, size, errors);
2500#endif
2501        else if (strcmp(lower, "ascii") == 0)
2502            return PyUnicode_DecodeASCII(s, size, errors);
2503        else if (strcmp(lower, "utf-16") == 0)
2504            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2505        else if (strcmp(lower, "utf-32") == 0)
2506            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2507    }
2508
2509    /* Decode via the codec registry */
2510    buffer = NULL;
2511    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2512        goto onError;
2513    buffer = PyMemoryView_FromBuffer(&info);
2514    if (buffer == NULL)
2515        goto onError;
2516    unicode = PyCodec_Decode(buffer, encoding, errors);
2517    if (unicode == NULL)
2518        goto onError;
2519    if (!PyUnicode_Check(unicode)) {
2520        PyErr_Format(PyExc_TypeError,
2521                     "decoder did not return a str object (type=%.400s)",
2522                     Py_TYPE(unicode)->tp_name);
2523        Py_DECREF(unicode);
2524        goto onError;
2525    }
2526    Py_DECREF(buffer);
2527    if (PyUnicode_READY(unicode)) {
2528        Py_DECREF(unicode);
2529        return NULL;
2530    }
2531    return unicode;
2532
2533  onError:
2534    Py_XDECREF(buffer);
2535    return NULL;
2536}
2537
2538PyObject *
2539PyUnicode_AsDecodedObject(PyObject *unicode,
2540                          const char *encoding,
2541                          const char *errors)
2542{
2543    PyObject *v;
2544
2545    if (!PyUnicode_Check(unicode)) {
2546        PyErr_BadArgument();
2547        goto onError;
2548    }
2549
2550    if (encoding == NULL)
2551        encoding = PyUnicode_GetDefaultEncoding();
2552
2553    /* Decode via the codec registry */
2554    v = PyCodec_Decode(unicode, encoding, errors);
2555    if (v == NULL)
2556        goto onError;
2557    return v;
2558
2559  onError:
2560    return NULL;
2561}
2562
2563PyObject *
2564PyUnicode_AsDecodedUnicode(PyObject *unicode,
2565                           const char *encoding,
2566                           const char *errors)
2567{
2568    PyObject *v;
2569
2570    if (!PyUnicode_Check(unicode)) {
2571        PyErr_BadArgument();
2572        goto onError;
2573    }
2574
2575    if (encoding == NULL)
2576        encoding = PyUnicode_GetDefaultEncoding();
2577
2578    /* Decode via the codec registry */
2579    v = PyCodec_Decode(unicode, encoding, errors);
2580    if (v == NULL)
2581        goto onError;
2582    if (!PyUnicode_Check(v)) {
2583        PyErr_Format(PyExc_TypeError,
2584                     "decoder did not return a str object (type=%.400s)",
2585                     Py_TYPE(v)->tp_name);
2586        Py_DECREF(v);
2587        goto onError;
2588    }
2589    return v;
2590
2591  onError:
2592    return NULL;
2593}
2594
2595PyObject *
2596PyUnicode_Encode(const Py_UNICODE *s,
2597                 Py_ssize_t size,
2598                 const char *encoding,
2599                 const char *errors)
2600{
2601    PyObject *v, *unicode;
2602
2603    unicode = PyUnicode_FromUnicode(s, size);
2604    if (unicode == NULL)
2605        return NULL;
2606    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2607    Py_DECREF(unicode);
2608    return v;
2609}
2610
2611PyObject *
2612PyUnicode_AsEncodedObject(PyObject *unicode,
2613                          const char *encoding,
2614                          const char *errors)
2615{
2616    PyObject *v;
2617
2618    if (!PyUnicode_Check(unicode)) {
2619        PyErr_BadArgument();
2620        goto onError;
2621    }
2622
2623    if (encoding == NULL)
2624        encoding = PyUnicode_GetDefaultEncoding();
2625
2626    /* Encode via the codec registry */
2627    v = PyCodec_Encode(unicode, encoding, errors);
2628    if (v == NULL)
2629        goto onError;
2630    return v;
2631
2632  onError:
2633    return NULL;
2634}
2635
2636PyObject *
2637PyUnicode_EncodeFSDefault(PyObject *unicode)
2638{
2639#ifdef HAVE_MBCS
2640    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2641                                PyUnicode_GET_SIZE(unicode),
2642                                NULL);
2643#elif defined(__APPLE__)
2644    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
2645#else
2646    PyInterpreterState *interp = PyThreadState_GET()->interp;
2647    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2648       cannot use it to encode and decode filenames before it is loaded. Load
2649       the Python codec requires to encode at least its own filename. Use the C
2650       version of the locale codec until the codec registry is initialized and
2651       the Python codec is loaded.
2652
2653       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2654       cannot only rely on it: check also interp->fscodec_initialized for
2655       subinterpreters. */
2656    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2657        return PyUnicode_AsEncodedString(unicode,
2658                                         Py_FileSystemDefaultEncoding,
2659                                         "surrogateescape");
2660    }
2661    else {
2662        /* locale encoding with surrogateescape */
2663        wchar_t *wchar;
2664        char *bytes;
2665        PyObject *bytes_obj;
2666        size_t error_pos;
2667
2668        wchar = PyUnicode_AsWideCharString(unicode, NULL);
2669        if (wchar == NULL)
2670            return NULL;
2671        bytes = _Py_wchar2char(wchar, &error_pos);
2672        if (bytes == NULL) {
2673            if (error_pos != (size_t)-1) {
2674                char *errmsg = strerror(errno);
2675                PyObject *exc = NULL;
2676                if (errmsg == NULL)
2677                    errmsg = "Py_wchar2char() failed";
2678                raise_encode_exception(&exc,
2679                    "filesystemencoding",
2680                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2681                    error_pos, error_pos+1,
2682                    errmsg);
2683                Py_XDECREF(exc);
2684            }
2685            else
2686                PyErr_NoMemory();
2687            PyMem_Free(wchar);
2688            return NULL;
2689        }
2690        PyMem_Free(wchar);
2691
2692        bytes_obj = PyBytes_FromString(bytes);
2693        PyMem_Free(bytes);
2694        return bytes_obj;
2695    }
2696#endif
2697}
2698
2699PyObject *
2700PyUnicode_AsEncodedString(PyObject *unicode,
2701                          const char *encoding,
2702                          const char *errors)
2703{
2704    PyObject *v;
2705    char lower[11];  /* Enough for any encoding shortcut */
2706
2707    if (!PyUnicode_Check(unicode)) {
2708        PyErr_BadArgument();
2709        return NULL;
2710    }
2711
2712    if (encoding == NULL) {
2713        if (errors == NULL || strcmp(errors, "strict") == 0)
2714            return _PyUnicode_AsUTF8String(unicode, NULL);
2715        else
2716            return _PyUnicode_AsUTF8String(unicode, errors);
2717    }
2718
2719    /* Shortcuts for common default encodings */
2720    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2721        if ((strcmp(lower, "utf-8") == 0) ||
2722            (strcmp(lower, "utf8") == 0))
2723        {
2724            if (errors == NULL || strcmp(errors, "strict") == 0)
2725                return _PyUnicode_AsUTF8String(unicode, NULL);
2726            else
2727                return _PyUnicode_AsUTF8String(unicode, errors);
2728        }
2729        else if ((strcmp(lower, "latin-1") == 0) ||
2730                 (strcmp(lower, "latin1") == 0) ||
2731                 (strcmp(lower, "iso-8859-1") == 0))
2732            return _PyUnicode_AsLatin1String(unicode, errors);
2733#ifdef HAVE_MBCS
2734        else if (strcmp(lower, "mbcs") == 0)
2735            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2736                                        PyUnicode_GET_SIZE(unicode),
2737                                        errors);
2738#endif
2739        else if (strcmp(lower, "ascii") == 0)
2740            return _PyUnicode_AsASCIIString(unicode, errors);
2741    }
2742
2743    /* Encode via the codec registry */
2744    v = PyCodec_Encode(unicode, encoding, errors);
2745    if (v == NULL)
2746        return NULL;
2747
2748    /* The normal path */
2749    if (PyBytes_Check(v))
2750        return v;
2751
2752    /* If the codec returns a buffer, raise a warning and convert to bytes */
2753    if (PyByteArray_Check(v)) {
2754        int error;
2755        PyObject *b;
2756
2757        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2758            "encoder %s returned bytearray instead of bytes",
2759            encoding);
2760        if (error) {
2761            Py_DECREF(v);
2762            return NULL;
2763        }
2764
2765        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2766        Py_DECREF(v);
2767        return b;
2768    }
2769
2770    PyErr_Format(PyExc_TypeError,
2771                 "encoder did not return a bytes object (type=%.400s)",
2772                 Py_TYPE(v)->tp_name);
2773    Py_DECREF(v);
2774    return NULL;
2775}
2776
2777PyObject *
2778PyUnicode_AsEncodedUnicode(PyObject *unicode,
2779                           const char *encoding,
2780                           const char *errors)
2781{
2782    PyObject *v;
2783
2784    if (!PyUnicode_Check(unicode)) {
2785        PyErr_BadArgument();
2786        goto onError;
2787    }
2788
2789    if (encoding == NULL)
2790        encoding = PyUnicode_GetDefaultEncoding();
2791
2792    /* Encode via the codec registry */
2793    v = PyCodec_Encode(unicode, encoding, errors);
2794    if (v == NULL)
2795        goto onError;
2796    if (!PyUnicode_Check(v)) {
2797        PyErr_Format(PyExc_TypeError,
2798                     "encoder did not return an str object (type=%.400s)",
2799                     Py_TYPE(v)->tp_name);
2800        Py_DECREF(v);
2801        goto onError;
2802    }
2803    return v;
2804
2805  onError:
2806    return NULL;
2807}
2808
2809PyObject*
2810PyUnicode_DecodeFSDefault(const char *s) {
2811    Py_ssize_t size = (Py_ssize_t)strlen(s);
2812    return PyUnicode_DecodeFSDefaultAndSize(s, size);
2813}
2814
2815PyObject*
2816PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2817{
2818#ifdef HAVE_MBCS
2819    return PyUnicode_DecodeMBCS(s, size, NULL);
2820#elif defined(__APPLE__)
2821    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2822#else
2823    PyInterpreterState *interp = PyThreadState_GET()->interp;
2824    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2825       cannot use it to encode and decode filenames before it is loaded. Load
2826       the Python codec requires to encode at least its own filename. Use the C
2827       version of the locale codec until the codec registry is initialized and
2828       the Python codec is loaded.
2829
2830       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2831       cannot only rely on it: check also interp->fscodec_initialized for
2832       subinterpreters. */
2833    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2834        return PyUnicode_Decode(s, size,
2835                                Py_FileSystemDefaultEncoding,
2836                                "surrogateescape");
2837    }
2838    else {
2839        /* locale encoding with surrogateescape */
2840        wchar_t *wchar;
2841        PyObject *unicode;
2842        size_t len;
2843
2844        if (s[size] != '\0' || size != strlen(s)) {
2845            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2846            return NULL;
2847        }
2848
2849        wchar = _Py_char2wchar(s, &len);
2850        if (wchar == NULL)
2851            return PyErr_NoMemory();
2852
2853        unicode = PyUnicode_FromWideChar(wchar, len);
2854        PyMem_Free(wchar);
2855        return unicode;
2856    }
2857#endif
2858}
2859
2860
2861int
2862PyUnicode_FSConverter(PyObject* arg, void* addr)
2863{
2864    PyObject *output = NULL;
2865    Py_ssize_t size;
2866    void *data;
2867    if (arg == NULL) {
2868        Py_DECREF(*(PyObject**)addr);
2869        return 1;
2870    }
2871    if (PyBytes_Check(arg)) {
2872        output = arg;
2873        Py_INCREF(output);
2874    }
2875    else {
2876        arg = PyUnicode_FromObject(arg);
2877        if (!arg)
2878            return 0;
2879        output = PyUnicode_EncodeFSDefault(arg);
2880        Py_DECREF(arg);
2881        if (!output)
2882            return 0;
2883        if (!PyBytes_Check(output)) {
2884            Py_DECREF(output);
2885            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2886            return 0;
2887        }
2888    }
2889    size = PyBytes_GET_SIZE(output);
2890    data = PyBytes_AS_STRING(output);
2891    if (size != strlen(data)) {
2892        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2893        Py_DECREF(output);
2894        return 0;
2895    }
2896    *(PyObject**)addr = output;
2897    return Py_CLEANUP_SUPPORTED;
2898}
2899
2900
2901int
2902PyUnicode_FSDecoder(PyObject* arg, void* addr)
2903{
2904    PyObject *output = NULL;
2905    if (arg == NULL) {
2906        Py_DECREF(*(PyObject**)addr);
2907        return 1;
2908    }
2909    if (PyUnicode_Check(arg)) {
2910        if (PyUnicode_READY(arg))
2911            return 0;
2912        output = arg;
2913        Py_INCREF(output);
2914    }
2915    else {
2916        arg = PyBytes_FromObject(arg);
2917        if (!arg)
2918            return 0;
2919        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2920                                                  PyBytes_GET_SIZE(arg));
2921        Py_DECREF(arg);
2922        if (!output)
2923            return 0;
2924        if (!PyUnicode_Check(output)) {
2925            Py_DECREF(output);
2926            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2927            return 0;
2928        }
2929    }
2930    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2931                 PyUnicode_GET_LENGTH(output), 0, 1)) {
2932        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2933        Py_DECREF(output);
2934        return 0;
2935    }
2936    *(PyObject**)addr = output;
2937    return Py_CLEANUP_SUPPORTED;
2938}
2939
2940
2941char*
2942PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
2943{
2944    PyObject *bytes;
2945    PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2946
2947    if (!PyUnicode_Check(unicode)) {
2948        PyErr_BadArgument();
2949        return NULL;
2950    }
2951    if (PyUnicode_READY(u) == -1)
2952        return NULL;
2953
2954    if (PyUnicode_UTF8(unicode) == NULL) {
2955        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
2956        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2957        if (bytes == NULL)
2958            return NULL;
2959        _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2960        if (_PyUnicode_UTF8(u) == NULL) {
2961            Py_DECREF(bytes);
2962            return NULL;
2963        }
2964        _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2965        Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
2966        Py_DECREF(bytes);
2967    }
2968
2969    if (psize)
2970        *psize = PyUnicode_UTF8_LENGTH(unicode);
2971    return PyUnicode_UTF8(unicode);
2972}
2973
2974char*
2975PyUnicode_AsUTF8(PyObject *unicode)
2976{
2977    return PyUnicode_AsUTF8AndSize(unicode, NULL);
2978}
2979
#ifdef Py_DEBUG
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to build the wchar_t representation of a string. */
int unicode_as_unicode_calls = 0;
#endif
2983
2984
/* Return the wchar_t (Py_UNICODE) representation of `unicode`.

   If the object has no wstr buffer yet, one is allocated with
   PyObject_MALLOC(), filled from the 1-, 2- or 4-byte data and stored on
   the object; later calls return the stored buffer.  When wchar_t is two
   bytes wide, characters above 0xFFFF are written as surrogate pairs.

   If size is not NULL, *size receives PyUnicode_WSTR_LENGTH(u).
   Returns NULL with an exception set if `unicode` is not a str object or
   if the allocation fails. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    PyUnicodeObject *u;
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    u = (PyUnicodeObject*)unicode;
    if (_PyUnicode_WSTR(u) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(u) != 0);
        assert(PyUnicode_IS_READY(u));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the characters that need two wchar_t units */
            four_bytes = PyUnicode_4BYTE_DATA(u);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One unit per character, one more per surrogate pair, plus
               the terminating null */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;

            /* Second pass: copy, splitting large characters into pairs */
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
            four_bytes = PyUnicode_4BYTE_DATA(u);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* Sanity check only; active when assertions are enabled */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1- or 2-byte data: one wchar_t per character plus the
               terminating null */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(u) + 1));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only written for objects that are not
               compact ASCII */
            if (!PyUnicode_IS_COMPACT_ASCII(u))
                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_LENGTH(u);

            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
                /* widen each byte to a wchar_t */
                one_byte = PyUnicode_1BYTE_DATA(u);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* widen each 2-byte unit to a wchar_t */
                two_bytes = PyUnicode_2BYTE_DATA(u);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before aborting */
                PyObject_FREE(_PyUnicode_WSTR(u));
                _PyUnicode_WSTR(u) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(u);
    return _PyUnicode_WSTR(u);
}
3101
3102Py_UNICODE *
3103PyUnicode_AsUnicode(PyObject *unicode)
3104{
3105    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3106}
3107
3108
3109Py_ssize_t
3110PyUnicode_GetSize(PyObject *unicode)
3111{
3112    if (!PyUnicode_Check(unicode)) {
3113        PyErr_BadArgument();
3114        goto onError;
3115    }
3116    return PyUnicode_GET_SIZE(unicode);
3117
3118  onError:
3119    return -1;
3120}
3121
3122Py_ssize_t
3123PyUnicode_GetLength(PyObject *unicode)
3124{
3125    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3126        PyErr_BadArgument();
3127        return -1;
3128    }
3129
3130    return PyUnicode_GET_LENGTH(unicode);
3131}
3132
3133Py_UCS4
3134PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3135{
3136    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3137        PyErr_BadArgument();
3138        return (Py_UCS4)-1;
3139    }
3140    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3141        PyErr_SetString(PyExc_IndexError, "string index out of range");
3142        return (Py_UCS4)-1;
3143    }
3144    return PyUnicode_READ_CHAR(unicode, index);
3145}
3146
3147int
3148PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3149{
3150    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3151        PyErr_BadArgument();
3152        return -1;
3153    }
3154    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3155        PyErr_SetString(PyExc_IndexError, "string index out of range");
3156        return -1;
3157    }
3158    if (_PyUnicode_Dirty(unicode))
3159        return -1;
3160    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3161                    index, ch);
3162    return 0;
3163}
3164
/* Return the name of the default encoding, the constant string "utf-8".
   The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3170
3171/* create or adjust a UnicodeDecodeError */
3172static void
3173make_decode_exception(PyObject **exceptionObject,
3174                      const char *encoding,
3175                      const char *input, Py_ssize_t length,
3176                      Py_ssize_t startpos, Py_ssize_t endpos,
3177                      const char *reason)
3178{
3179    if (*exceptionObject == NULL) {
3180        *exceptionObject = PyUnicodeDecodeError_Create(
3181            encoding, input, length, startpos, endpos, reason);
3182    }
3183    else {
3184        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3185            goto onError;
3186        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3187            goto onError;
3188        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3189            goto onError;
3190    }
3191    return;
3192
3193onError:
3194    Py_DECREF(*exceptionObject);
3195    *exceptionObject = NULL;
3196}
3197
3198/* error handling callback helper:
3199   build arguments, call the callback and check the arguments,
3200   if no exception occurred, copy the replacement to the output
3201   and adjust various state variables.
3202   return 0 on success, -1 on error
3203*/
3204
3205static int
3206unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3207                                 const char *encoding, const char *reason,
3208                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3209                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3210                                 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
3211{
3212    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3213
3214    PyObject *restuple = NULL;
3215    PyObject *repunicode = NULL;
3216    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
3217    Py_ssize_t insize;
3218    Py_ssize_t requiredsize;
3219    Py_ssize_t newpos;
3220    const Py_UNICODE *repptr;
3221    PyObject *inputobj = NULL;
3222    Py_ssize_t repsize;
3223    int res = -1;
3224
3225    if (*errorHandler == NULL) {
3226        *errorHandler = PyCodec_LookupError(errors);
3227        if (*errorHandler == NULL)
3228            goto onError;
3229    }
3230
3231    make_decode_exception(exceptionObject,
3232        encoding,
3233        *input, *inend - *input,
3234        *startinpos, *endinpos,
3235        reason);
3236    if (*exceptionObject == NULL)
3237        goto onError;
3238
3239    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3240    if (restuple == NULL)
3241        goto onError;
3242    if (!PyTuple_Check(restuple)) {
3243        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3244        goto onError;
3245    }
3246    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3247        goto onError;
3248
3249    /* Copy back the bytes variables, which might have been modified by the
3250       callback */
3251    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3252    if (!inputobj)
3253        goto onError;
3254    if (!PyBytes_Check(inputobj)) {
3255        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3256    }
3257    *input = PyBytes_AS_STRING(inputobj);
3258    insize = PyBytes_GET_SIZE(inputobj);
3259    *inend = *input + insize;
3260    /* we can DECREF safely, as the exception has another reference,
3261       so the object won't go away. */
3262    Py_DECREF(inputobj);
3263
3264    if (newpos<0)
3265        newpos = insize+newpos;
3266    if (newpos<0 || newpos>insize) {
3267        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3268        goto onError;
3269    }
3270
3271    /* need more space? (at least enough for what we
3272       have+the replacement+the rest of the string (starting
3273       at the new input position), so we won't have to check space
3274       when there are no errors in the rest of the string) */
3275    repptr = PyUnicode_AS_UNICODE(repunicode);
3276    repsize = PyUnicode_GET_SIZE(repunicode);
3277    requiredsize = *outpos + repsize + insize-newpos;
3278    if (requiredsize > outsize) {
3279        if (requiredsize<2*outsize)
3280            requiredsize = 2*outsize;
3281        if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
3282            goto onError;
3283        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
3284    }
3285    *endinpos = newpos;
3286    *inptr = *input + newpos;
3287    Py_UNICODE_COPY(*outptr, repptr, repsize);
3288    *outptr += repsize;
3289    *outpos += repsize;
3290
3291    /* we made it! */
3292    res = 0;
3293
3294  onError:
3295    Py_XDECREF(restuple);
3296    return res;
3297}
3298
3299/* --- UTF-7 Codec -------------------------------------------------------- */
3300
3301/* See RFC2152 for details.  We encode conservatively and decode liberally. */
3302
/* Three simple macros defining base-64.
   IS_BASE64 and FROM_BASE64 expand their argument several times, so it
   must not have side effects. */

/* Is c a base-64 character?  (A-Z, a-z, 0-9, '+' or '/') */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; every other
   input yields 63, so the result is only meaningful when IS_BASE64(c)
   holds. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   (n is masked with 0x3f before indexing the alphabet) */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  The argument is expanded twice. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3333
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets; it is indexed by the ASCII code (0-127) and is never modified,
 * hence const:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3367
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in the range 1..127 for the macro to be true (NUL and
 * non-ASCII values are never direct); the range test comes first so
 * that utf7_category is only indexed with valid values.  Category 0
 * is always direct, category 2 only if directWS is true, category 1
 * only if directO is true.  c is expanded several times and must not
 * have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
3379
3380PyObject *
3381PyUnicode_DecodeUTF7(const char *s,
3382                     Py_ssize_t size,
3383                     const char *errors)
3384{
3385    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3386}
3387
3388/* The decoder.  The only state we preserve is our read position,
3389 * i.e. how many characters we have consumed.  So if we end in the
3390 * middle of a shift sequence we have to back off the read position
3391 * and the output to the beginning of the sequence, otherwise we lose
3392 * all the shift state (seen bits, number of bits seen, high
3393 * surrogate). */
3394
3395PyObject *
3396PyUnicode_DecodeUTF7Stateful(const char *s,
3397                             Py_ssize_t size,
3398                             const char *errors,
3399                             Py_ssize_t *consumed)
3400{
3401    const char *starts = s;
3402    Py_ssize_t startinpos;
3403    Py_ssize_t endinpos;
3404    Py_ssize_t outpos;
3405    const char *e;
3406    PyUnicodeObject *unicode;
3407    Py_UNICODE *p;
3408    const char *errmsg = "";
3409    int inShift = 0;
3410    Py_UNICODE *shiftOutStart;
3411    unsigned int base64bits = 0;
3412    unsigned long base64buffer = 0;
3413    Py_UNICODE surrogate = 0;
3414    PyObject *errorHandler = NULL;
3415    PyObject *exc = NULL;
3416
3417    unicode = _PyUnicode_New(size);
3418    if (!unicode)
3419        return NULL;
3420    if (size == 0) {
3421        if (consumed)
3422            *consumed = 0;
3423        return (PyObject *)unicode;
3424    }
3425
3426    p = PyUnicode_AS_UNICODE(unicode);
3427    shiftOutStart = p;
3428    e = s + size;
3429
3430    while (s < e) {
3431        Py_UNICODE ch;
3432      restart:
3433        ch = (unsigned char) *s;
3434
3435        if (inShift) { /* in a base-64 section */
3436            if (IS_BASE64(ch)) { /* consume a base-64 character */
3437                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3438                base64bits += 6;
3439                s++;
3440                if (base64bits >= 16) {
3441                    /* we have enough bits for a UTF-16 value */
3442                    Py_UNICODE outCh = (Py_UNICODE)
3443                                       (base64buffer >> (base64bits-16));
3444                    base64bits -= 16;
3445                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3446                    if (surrogate) {
3447                        /* expecting a second surrogate */
3448                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3449#ifdef Py_UNICODE_WIDE
3450                            *p++ = (((surrogate & 0x3FF)<<10)
3451                                    | (outCh & 0x3FF)) + 0x10000;
3452#else
3453                            *p++ = surrogate;
3454                            *p++ = outCh;
3455#endif
3456                            surrogate = 0;
3457                        }
3458                        else {
3459                            surrogate = 0;
3460                            errmsg = "second surrogate missing";
3461                            goto utf7Error;
3462                        }
3463                    }
3464                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3465                        /* first surrogate */
3466                        surrogate = outCh;
3467                    }
3468                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3469                        errmsg = "unexpected second surrogate";
3470                        goto utf7Error;
3471                    }
3472                    else {
3473                        *p++ = outCh;
3474                    }
3475                }
3476            }
3477            else { /* now leaving a base-64 section */
3478                inShift = 0;
3479                s++;
3480                if (surrogate) {
3481                    errmsg = "second surrogate missing at end of shift sequence";
3482                    goto utf7Error;
3483                }
3484                if (base64bits > 0) { /* left-over bits */
3485                    if (base64bits >= 6) {
3486                        /* We've seen at least one base-64 character */
3487                        errmsg = "partial character in shift sequence";
3488                        goto utf7Error;
3489                    }
3490                    else {
3491                        /* Some bits remain; they should be zero */
3492                        if (base64buffer != 0) {
3493                            errmsg = "non-zero padding bits in shift sequence";
3494                            goto utf7Error;
3495                        }
3496                    }
3497                }
3498                if (ch != '-') {
3499                    /* '-' is absorbed; other terminating
3500                       characters are preserved */
3501                    *p++ = ch;
3502                }
3503            }
3504        }
3505        else if ( ch == '+' ) {
3506            startinpos = s-starts;
3507            s++; /* consume '+' */
3508            if (s < e && *s == '-') { /* '+-' encodes '+' */
3509                s++;
3510                *p++ = '+';
3511            }
3512            else { /* begin base64-encoded section */
3513                inShift = 1;
3514                shiftOutStart = p;
3515                base64bits = 0;
3516            }
3517        }
3518        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
3519            *p++ = ch;
3520            s++;
3521        }
3522        else {
3523            startinpos = s-starts;
3524            s++;
3525            errmsg = "unexpected special character";
3526            goto utf7Error;
3527        }
3528        continue;
3529utf7Error:
3530        outpos = p-PyUnicode_AS_UNICODE(unicode);
3531        endinpos = s-starts;
3532        if (unicode_decode_call_errorhandler(
3533                errors, &errorHandler,
3534                "utf7", errmsg,
3535                &starts, &e, &startinpos, &endinpos, &exc, &s,
3536                &unicode, &outpos, &p))
3537            goto onError;
3538    }
3539
3540    /* end of string */
3541
3542    if (inShift && !consumed) { /* in shift sequence, no more to follow */
3543        /* if we're in an inconsistent state, that's an error */
3544        if (surrogate ||
3545                (base64bits >= 6) ||
3546                (base64bits > 0 && base64buffer != 0)) {
3547            outpos = p-PyUnicode_AS_UNICODE(unicode);
3548            endinpos = size;
3549            if (unicode_decode_call_errorhandler(
3550                    errors, &errorHandler,
3551                    "utf7", "unterminated shift sequence",
3552                    &starts, &e, &startinpos, &endinpos, &exc, &s,
3553                    &unicode, &outpos, &p))
3554                goto onError;
3555            if (s < e)
3556                goto restart;
3557        }
3558    }
3559
3560    /* return state */
3561    if (consumed) {
3562        if (inShift) {
3563            p = shiftOutStart; /* back off output */
3564            *consumed = startinpos;
3565        }
3566        else {
3567            *consumed = s-starts;
3568        }
3569    }
3570
3571    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
3572        goto onError;
3573
3574    Py_XDECREF(errorHandler);
3575    Py_XDECREF(exc);
3576    if (PyUnicode_READY(unicode) == -1) {
3577        Py_DECREF(unicode);
3578        return NULL;
3579    }
3580    return (PyObject *)unicode;
3581
3582  onError:
3583    Py_XDECREF(errorHandler);
3584    Py_XDECREF(exc);
3585    Py_DECREF(unicode);
3586    return NULL;
3587}
3588
3589
3590PyObject *
3591PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3592                     Py_ssize_t size,
3593                     int base64SetO,
3594                     int base64WhiteSpace,
3595                     const char *errors)
3596{
3597    PyObject *v;
3598    /* It might be possible to tighten this worst case */
3599    Py_ssize_t allocated = 8 * size;
3600    int inShift = 0;
3601    Py_ssize_t i = 0;
3602    unsigned int base64bits = 0;
3603    unsigned long base64buffer = 0;
3604    char * out;
3605    char * start;
3606
3607    if (size == 0)
3608        return PyBytes_FromStringAndSize(NULL, 0);
3609
3610    if (allocated / 8 != size)
3611        return PyErr_NoMemory();
3612
3613    v = PyBytes_FromStringAndSize(NULL, allocated);
3614    if (v == NULL)
3615        return NULL;
3616
3617    start = out = PyBytes_AS_STRING(v);
3618    for (;i < size; ++i) {
3619        Py_UNICODE ch = s[i];
3620
3621        if (inShift) {
3622            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3623                /* shifting out */
3624                if (base64bits) { /* output remaining bits */
3625                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
3626                    base64buffer = 0;
3627                    base64bits = 0;
3628                }
3629                inShift = 0;
3630                /* Characters not in the BASE64 set implicitly unshift the sequence
3631                   so no '-' is required, except if the character is itself a '-' */
3632                if (IS_BASE64(ch) || ch == '-') {
3633                    *out++ = '-';
3634                }
3635                *out++ = (char) ch;
3636            }
3637            else {
3638                goto encode_char;
3639            }
3640        }
3641        else { /* not in a shift sequence */
3642            if (ch == '+') {
3643                *out++ = '+';
3644                        *out++ = '-';
3645            }
3646            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3647                *out++ = (char) ch;
3648            }
3649            else {
3650                *out++ = '+';
3651                inShift = 1;
3652                goto encode_char;
3653            }
3654        }
3655        continue;
3656encode_char:
3657#ifdef Py_UNICODE_WIDE
3658        if (ch >= 0x10000) {
3659            /* code first surrogate */
3660            base64bits += 16;
3661            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3662            while (base64bits >= 6) {
3663                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3664                base64bits -= 6;
3665            }
3666            /* prepare second surrogate */
3667            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
3668        }
3669#endif
3670        base64bits += 16;
3671        base64buffer = (base64buffer << 16) | ch;
3672        while (base64bits >= 6) {
3673            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3674            base64bits -= 6;
3675        }
3676    }
3677    if (base64bits)
3678        *out++= TO_BASE64(base64buffer << (6-base64bits) );
3679    if (inShift)
3680        *out++ = '-';
3681    if (_PyBytes_Resize(&v, out - start) < 0)
3682        return NULL;
3683    return v;
3684}
3685
3686#undef IS_BASE64
3687#undef FROM_BASE64
3688#undef TO_BASE64
3689#undef DECODE_DIRECT
3690#undef ENCODE_DIRECT
3691
3692/* --- UTF-8 Codec -------------------------------------------------------- */
3693
/* Total length in bytes of a UTF-8 sequence, indexed by its first byte.
   A value of 0 marks a byte that cannot start a sequence.  See RFC 3629
   for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: never a valid first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-FF */
};
3715
3716PyObject *
3717PyUnicode_DecodeUTF8(const char *s,
3718                     Py_ssize_t size,
3719                     const char *errors)
3720{
3721    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3722}
3723
3724/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3725#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3726
3727/* Mask to quickly check whether a C 'long' contains a
3728   non-ASCII, UTF8-encoded char. */
3729#if (SIZEOF_LONG == 8)
3730# define ASCII_CHAR_MASK 0x8080808080808080L
3731#elif (SIZEOF_LONG == 4)
3732# define ASCII_CHAR_MASK 0x80808080L
3733#else
3734# error C 'long' size should be either 4 or 8!
3735#endif
3736
3737/* Scans a UTF-8 string and returns the maximum character to be expected,
3738   the size of the decoded unicode string and if any major errors were
3739   encountered.
3740
3741   This function does check basic UTF-8 sanity, it does however NOT CHECK
3742   if the string contains surrogates, and if all continuation bytes are
3743   within the correct ranges, these checks are performed in
3744   PyUnicode_DecodeUTF8Stateful.
3745
3746   If it sets has_errors to 1, it means the value of unicode_size and max_char
3747   will be bogus and you should not rely on useful information in them.
3748   */
3749static Py_UCS4
3750utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3751                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3752                                  int *has_errors)
3753{
3754    Py_ssize_t n;
3755    Py_ssize_t char_count = 0;
3756    Py_UCS4 max_char = 127, new_max;
3757    Py_UCS4 upper_bound;
3758    const unsigned char *p = (const unsigned char *)s;
3759    const unsigned char *end = p + string_size;
3760    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3761    int err = 0;
3762
3763    for (; p < end && !err; ++p, ++char_count) {
3764        /* Only check value if it's not a ASCII char... */
3765        if (*p < 0x80) {
3766            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3767               an explanation. */
3768            if (!((size_t) p & LONG_PTR_MASK)) {
3769                /* Help register allocation */
3770                register const unsigned char *_p = p;
3771                while (_p < aligned_end) {
3772                    unsigned long value = *(unsigned long *) _p;
3773                    if (value & ASCII_CHAR_MASK)
3774                        break;
3775                    _p += SIZEOF_LONG;
3776                    char_count += SIZEOF_LONG;
3777                }
3778                p = _p;
3779                if (p == end)
3780                    break;
3781            }
3782        }
3783        if (*p >= 0x80) {
3784            n = utf8_code_length[*p];
3785            new_max = max_char;
3786            switch (n) {
3787            /* invalid start byte */
3788            case 0:
3789                err = 1;
3790                break;
3791            case 2:
3792                /* Code points between 0x00FF and 0x07FF inclusive.
3793                   Approximate the upper bound of the code point,
3794                   if this flips over 255 we can be sure it will be more
3795                   than 255 and the string will need 2 bytes per code coint,
3796                   if it stays under or equal to 255, we can be sure 1 byte
3797                   is enough.
3798                   ((*p & 0b00011111) << 6) | 0b00111111 */
3799                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3800                if (max_char < upper_bound)
3801                    new_max = upper_bound;
3802                /* Ensure we track at least that we left ASCII space. */
3803                if (new_max < 128)
3804                    new_max = 128;
3805                break;
3806            case 3:
3807                /* Between 0x0FFF and 0xFFFF inclusive, so values are
3808                   always > 255 and <= 65535 and will always need 2 bytes. */
3809                if (max_char < 65535)
3810                    new_max = 65535;
3811                break;
3812            case 4:
3813                /* Code point will be above 0xFFFF for sure in this case. */
3814                new_max = 65537;
3815                break;
3816            /* Internal error, this should be caught by the first if */
3817            case 1:
3818            default:
3819                assert(0 && "Impossible case in utf8_max_char_and_size");
3820                err = 1;
3821            }
3822            /* Instead of number of overall bytes for this code point,
3823               n containts the number of following bytes: */
3824            --n;
3825            /* Check if the follow up chars are all valid continuation bytes */
3826            if (n >= 1) {
3827                const unsigned char *cont;
3828                if ((p + n) >= end) {
3829                    if (consumed == 0)
3830                        /* incomplete data, non-incremental decoding */
3831                        err = 1;
3832                    break;
3833                }
3834                for (cont = p + 1; cont < (p + n); ++cont) {
3835                    if ((*cont & 0xc0) != 0x80) {
3836                        err = 1;
3837                        break;
3838                    }
3839                }
3840                p += n;
3841            }
3842            else
3843                err = 1;
3844            max_char = new_max;
3845        }
3846    }
3847
3848    if (unicode_size)
3849        *unicode_size = char_count;
3850    if (has_errors)
3851        *has_errors = err;
3852    return max_char;
3853}
3854
/* Store `value` at position `index` of the character buffer `buf`, using
   the element type selected by `kind`.  Like PyUnicode_WRITE, but
   PyUnicode_WCHAR_KIND is accepted as well, in which case `buf` is treated
   as the Py_UNICODE (wstr) array of the legacy representation.  Any kind
   other than the three named ones is written as Py_UCS4.  Each argument
   is evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        switch ((int)(kind)) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
3869
3870PyObject *
3871PyUnicode_DecodeUTF8Stateful(const char *s,
3872                             Py_ssize_t size,
3873                             const char *errors,
3874                             Py_ssize_t *consumed)
3875{
3876    const char *starts = s;
3877    int n;
3878    int k;
3879    Py_ssize_t startinpos;
3880    Py_ssize_t endinpos;
3881    const char *e, *aligned_end;
3882    PyUnicodeObject *unicode;
3883    const char *errmsg = "";
3884    PyObject *errorHandler = NULL;
3885    PyObject *exc = NULL;
3886    Py_UCS4 maxchar = 0;
3887    Py_ssize_t unicode_size;
3888    Py_ssize_t i;
3889    int kind;
3890    void *data;
3891    int has_errors;
3892    Py_UNICODE *error_outptr;
3893#if SIZEOF_WCHAR_T == 2
3894    Py_ssize_t wchar_offset = 0;
3895#endif
3896
3897    if (size == 0) {
3898        if (consumed)
3899            *consumed = 0;
3900        return (PyObject *)PyUnicode_New(0, 0);
3901    }
3902    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3903                                                consumed, &has_errors);
3904    if (has_errors) {
3905        unicode = _PyUnicode_New(size);
3906        if (!unicode)
3907            return NULL;
3908        kind = PyUnicode_WCHAR_KIND;
3909        data = PyUnicode_AS_UNICODE(unicode);
3910        assert(data != NULL);
3911    }
3912    else {
3913        unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3914        if (!unicode)
3915            return NULL;
3916        /* When the string is ASCII only, just use memcpy and return.
3917           unicode_size may be != size if there is an incomplete UTF-8
3918           sequence at the end of the ASCII block.  */
3919        if (maxchar < 128 && size == unicode_size) {
3920            Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3921            return (PyObject *)unicode;
3922        }
3923        kind = PyUnicode_KIND(unicode);
3924        data = PyUnicode_DATA(unicode);
3925    }
3926    /* Unpack UTF-8 encoded data */
3927    i = 0;
3928    e = s + size;
3929    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
3930
3931    while (s < e) {
3932        Py_UCS4 ch = (unsigned char)*s;
3933
3934        if (ch < 0x80) {
3935            /* Fast path for runs of ASCII characters. Given that common UTF-8
3936               input will consist of an overwhelming majority of ASCII
3937               characters, we try to optimize for this case by checking
3938               as many characters as a C 'long' can contain.
3939               First, check if we can do an aligned read, as most CPUs have
3940               a penalty for unaligned reads.
3941            */
3942            if (!((size_t) s & LONG_PTR_MASK)) {
3943                /* Help register allocation */
3944                register const char *_s = s;
3945                register Py_ssize_t _i = i;
3946                while (_s < aligned_end) {
3947                    /* Read a whole long at a time (either 4 or 8 bytes),
3948                       and do a fast unrolled copy if it only contains ASCII
3949                       characters. */
3950                    unsigned long value = *(unsigned long *) _s;
3951                    if (value & ASCII_CHAR_MASK)
3952                        break;
3953                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3954                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3955                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3956                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
3957#if (SIZEOF_LONG == 8)
3958                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3959                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3960                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3961                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
3962#endif
3963                    _s += SIZEOF_LONG;
3964                    _i += SIZEOF_LONG;
3965                }
3966                s = _s;
3967                i = _i;
3968                if (s == e)
3969                    break;
3970                ch = (unsigned char)*s;
3971            }
3972        }
3973
3974        if (ch < 0x80) {
3975            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
3976            s++;
3977            continue;
3978        }
3979
3980        n = utf8_code_length[ch];
3981
3982        if (s + n > e) {
3983            if (consumed)
3984                break;
3985            else {
3986                errmsg = "unexpected end of data";
3987                startinpos = s-starts;
3988                endinpos = startinpos+1;
3989                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3990                    endinpos++;
3991                goto utf8Error;
3992            }
3993        }
3994
3995        switch (n) {
3996
3997        case 0:
3998            errmsg = "invalid start byte";
3999            startinpos = s-starts;
4000            endinpos = startinpos+1;
4001            goto utf8Error;
4002
4003        case 1:
4004            errmsg = "internal error";
4005            startinpos = s-starts;
4006            endinpos = startinpos+1;
4007            goto utf8Error;
4008
4009        case 2:
4010            if ((s[1] & 0xc0) != 0x80) {
4011                errmsg = "invalid continuation byte";
4012                startinpos = s-starts;
4013                endinpos = startinpos + 1;
4014                goto utf8Error;
4015            }
4016            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4017            assert ((ch > 0x007F) && (ch <= 0x07FF));
4018            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4019            break;
4020
4021        case 3:
4022            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4023               will result in surrogates in range d800-dfff. Surrogates are
4024               not valid UTF-8 so they are rejected.
4025               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4026               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4027            if ((s[1] & 0xc0) != 0x80 ||
4028                (s[2] & 0xc0) != 0x80 ||
4029                ((unsigned char)s[0] == 0xE0 &&
4030                 (unsigned char)s[1] < 0xA0) ||
4031                ((unsigned char)s[0] == 0xED &&
4032                 (unsigned char)s[1] > 0x9F)) {
4033                errmsg = "invalid continuation byte";
4034                startinpos = s-starts;
4035                endinpos = startinpos + 1;
4036
4037                /* if s[1] first two bits are 1 and 0, then the invalid
4038                   continuation byte is s[2], so increment endinpos by 1,
4039                   if not, s[1] is invalid and endinpos doesn't need to
4040                   be incremented. */
4041                if ((s[1] & 0xC0) == 0x80)
4042                    endinpos++;
4043                goto utf8Error;
4044            }
4045            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4046            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4047            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4048            break;
4049
4050        case 4:
4051            if ((s[1] & 0xc0) != 0x80 ||
4052                (s[2] & 0xc0) != 0x80 ||
4053                (s[3] & 0xc0) != 0x80 ||
4054                ((unsigned char)s[0] == 0xF0 &&
4055                 (unsigned char)s[1] < 0x90) ||
4056                ((unsigned char)s[0] == 0xF4 &&
4057                 (unsigned char)s[1] > 0x8F)) {
4058                errmsg = "invalid continuation byte";
4059                startinpos = s-starts;
4060                endinpos = startinpos + 1;
4061                if ((s[1] & 0xC0) == 0x80) {
4062                    endinpos++;
4063                    if ((s[2] & 0xC0) == 0x80)
4064                        endinpos++;
4065                }
4066                goto utf8Error;
4067            }
4068            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4069                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4070            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4071
4072            /* If the string is flexible or we have native UCS-4, write
4073               directly.. */
4074            if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4075                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4076
4077            else {
4078                /* compute and append the two surrogates: */
4079
4080                /* translate from 10000..10FFFF to 0..FFFF */
4081                ch -= 0x10000;
4082
4083                /* high surrogate = top 10 bits added to D800 */
4084                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4085                                       (Py_UNICODE)(0xD800 + (ch >> 10)));
4086
4087                /* low surrogate = bottom 10 bits added to DC00 */
4088                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4089                                       (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4090            }
4091#if SIZEOF_WCHAR_T == 2
4092            wchar_offset++;
4093#endif
4094            break;
4095        }
4096        s += n;
4097        continue;
4098
4099      utf8Error:
4100        /* If this is not yet a resizable string, make it one.. */
4101        if (kind != PyUnicode_WCHAR_KIND) {
4102            const Py_UNICODE *u;
4103            PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4104            if (!new_unicode)
4105                goto onError;
4106            u = PyUnicode_AsUnicode((PyObject *)unicode);
4107            if (!u)
4108                goto onError;
4109#if SIZEOF_WCHAR_T == 2
4110            i += wchar_offset;
4111#endif
4112            Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4113            Py_DECREF(unicode);
4114            unicode = new_unicode;
4115            kind = 0;
4116            data = PyUnicode_AS_UNICODE(new_unicode);
4117            assert(data != NULL);
4118        }
4119        error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
4120        if (unicode_decode_call_errorhandler(
4121                errors, &errorHandler,
4122                "utf8", errmsg,
4123                &starts, &e, &startinpos, &endinpos, &exc, &s,
4124                &unicode, &i, &error_outptr))
4125            goto onError;
4126        /* Update data because unicode_decode_call_errorhandler might have
4127           re-created or resized the unicode object. */
4128        data = PyUnicode_AS_UNICODE(unicode);
4129        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4130    }
4131    /* Ensure the unicode_size calculation above was correct: */
4132    assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4133
4134    if (consumed)
4135        *consumed = s-starts;
4136
4137    /* Adjust length and ready string when it contained errors and
4138       is of the old resizable kind. */
4139    if (kind == PyUnicode_WCHAR_KIND) {
4140        if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
4141            PyUnicode_READY(unicode) == -1)
4142            goto onError;
4143    }
4144
4145    Py_XDECREF(errorHandler);
4146    Py_XDECREF(exc);
4147    if (PyUnicode_READY(unicode) == -1) {
4148        Py_DECREF(unicode);
4149        return NULL;
4150    }
4151    return (PyObject *)unicode;
4152
4153  onError:
4154    Py_XDECREF(errorHandler);
4155    Py_XDECREF(exc);
4156    Py_DECREF(unicode);
4157    return NULL;
4158}
4159
4160#undef WRITE_FLEXIBLE_OR_WSTR
4161
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t array.  A byte that is not part of a valid
   sequence is stored as 0xDC00 + byte and decoding resumes with the next
   byte.  Returns NULL if the buffer size would overflow (an exception is
   set in that case) or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int seqlen;

    /* Note: no branch below writes more output units than it consumes
       input bytes, so size + 1 slots (including the terminator) suffice. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    out = buffer;
    end = s + size;
    while (s < end) {
        /* ch holds the lead byte until a sequence has been validated, so
           the escape path below always sees the offending byte */
        Py_UCS4 ch = (unsigned char)*s;

        /* ASCII is copied through unchanged */
        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        /* truncated sequence, or a byte that cannot start one */
        if (s + seqlen > end || seqlen < 2)
            goto escape_byte;

        if (seqlen == 2) {
            if ((s[1] & 0xc0) != 0x80)
                goto escape_byte;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 3) {
            const unsigned char lead = (unsigned char)s[0];
            const unsigned char second = (unsigned char)s[1];

            if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80)
                goto escape_byte;
            /* Sequences \xed\xa0\x80-\xed\xbf\xbf would decode to
               surrogates d800-dfff, which are not valid UTF-8, so they
               are rejected, as are E0 sequences whose second byte is
               below A0.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((lead == 0xE0 && second < 0xA0) ||
                (lead == 0xED && second > 0x9F))
                goto escape_byte;
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 4) {
            const unsigned char lead = (unsigned char)s[0];
            const unsigned char second = (unsigned char)s[1];

            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80)
                goto escape_byte;
            /* second byte must be >= 90 after F0 and <= 8F after F4 */
            if ((lead == 0xF0 && second < 0x90) ||
                (lead == 0xF4 && second > 0x8F))
                goto escape_byte;
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* store as a surrogate pair: shift 10000..10FFFF down to
               0..FFFFF, then top 10 bits go on D800, bottom 10 on DC00 */
            ch -= 0x10000;
            *out++ = (wchar_t)(0xD800 + (ch >> 10));
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
        }
        s += seqlen;
        continue;

      escape_byte:
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
4276
4277/* Primary internal function which creates utf8 encoded bytes objects.
4278
4279   Allocation strategy:  if the string is short, convert into a stack buffer
4280   and allocate exactly as much space needed at the end.  Else allocate the
4281   maximum possible needed (4 result bytes per Unicode character), and return
4282   the excess memory at the end.
4283*/
4284PyObject *
4285_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
4286{
4287#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
4288
4289    Py_ssize_t i;                /* index into s of next input byte */
4290    PyObject *result;            /* result string object */
4291    char *p;                     /* next free byte in output buffer */
4292    Py_ssize_t nallocated;      /* number of result bytes allocated */
4293    Py_ssize_t nneeded;            /* number of result bytes needed */
4294    char stackbuf[MAX_SHORT_UNICHARS * 4];
4295    PyObject *errorHandler = NULL;
4296    PyObject *exc = NULL;
4297    int kind;
4298    void *data;
4299    Py_ssize_t size;
4300    PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4301#if SIZEOF_WCHAR_T == 2
4302    Py_ssize_t wchar_offset = 0;
4303#endif
4304
4305    if (!PyUnicode_Check(unicode)) {
4306        PyErr_BadArgument();
4307        return NULL;
4308    }
4309
4310    if (PyUnicode_READY(unicode) == -1)
4311        return NULL;
4312
4313    if (PyUnicode_UTF8(unicode))
4314        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4315                                         PyUnicode_UTF8_LENGTH(unicode));
4316
4317    kind = PyUnicode_KIND(unicode);
4318    data = PyUnicode_DATA(unicode);
4319    size = PyUnicode_GET_LENGTH(unicode);
4320
4321    assert(size >= 0);
4322
4323    if (size <= MAX_SHORT_UNICHARS) {
4324        /* Write into the stack buffer; nallocated can't overflow.
4325         * At the end, we'll allocate exactly as much heap space as it
4326         * turns out we need.
4327         */
4328        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4329        result = NULL;   /* will allocate after we're done */
4330        p = stackbuf;
4331    }
4332    else {
4333        /* Overallocate on the heap, and give the excess back at the end. */
4334        nallocated = size * 4;
4335        if (nallocated / 4 != size)  /* overflow! */
4336            return PyErr_NoMemory();
4337        result = PyBytes_FromStringAndSize(NULL, nallocated);
4338        if (result == NULL)
4339            return NULL;
4340        p = PyBytes_AS_STRING(result);
4341    }
4342
4343    for (i = 0; i < size;) {
4344        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4345
4346        if (ch < 0x80)
4347            /* Encode ASCII */
4348            *p++ = (char) ch;
4349
4350        else if (ch < 0x0800) {
4351            /* Encode Latin-1 */
4352            *p++ = (char)(0xc0 | (ch >> 6));
4353            *p++ = (char)(0x80 | (ch & 0x3f));
4354        } else if (0xD800 <= ch && ch <= 0xDFFF) {
4355            Py_ssize_t newpos;
4356            PyObject *rep;
4357            Py_ssize_t repsize, k, startpos;
4358            startpos = i-1;
4359#if SIZEOF_WCHAR_T == 2
4360            startpos += wchar_offset;
4361#endif
4362            rep = unicode_encode_call_errorhandler(
4363                  errors, &errorHandler, "utf-8", "surrogates not allowed",
4364                  PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4365                  &exc, startpos, startpos+1, &newpos);
4366            if (!rep)
4367                goto error;
4368
4369            if (PyBytes_Check(rep))
4370                repsize = PyBytes_GET_SIZE(rep);
4371            else
4372                repsize = PyUnicode_GET_SIZE(rep);
4373
4374            if (repsize > 4) {
4375                Py_ssize_t offset;
4376
4377                if (result == NULL)
4378                    offset = p - stackbuf;
4379                else
4380                    offset = p - PyBytes_AS_STRING(result);
4381
4382                if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4383                    /* integer overflow */
4384                    PyErr_NoMemory();
4385                    goto error;
4386                }
4387                nallocated += repsize - 4;
4388                if (result != NULL) {
4389                    if (_PyBytes_Resize(&result, nallocated) < 0)
4390                        goto error;
4391                } else {
4392                    result = PyBytes_FromStringAndSize(NULL, nallocated);
4393                    if (result == NULL)
4394                        goto error;
4395                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4396                }
4397                p = PyBytes_AS_STRING(result) + offset;
4398            }
4399
4400            if (PyBytes_Check(rep)) {
4401                char *prep = PyBytes_AS_STRING(rep);
4402                for(k = repsize; k > 0; k--)
4403                    *p++ = *prep++;
4404            } else /* rep is unicode */ {
4405                const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4406                Py_UNICODE c;
4407
4408                for(k=0; k<repsize; k++) {
4409                    c = prep[k];
4410                    if (0x80 <= c) {
4411                        raise_encode_exception(&exc, "utf-8",
4412                                               PyUnicode_AS_UNICODE(unicode),
4413                                               size, i-1, i,
4414                                               "surrogates not allowed");
4415                        goto error;
4416                    }
4417                    *p++ = (char)prep[k];
4418                }
4419            }
4420            Py_DECREF(rep);
4421        } else if (ch < 0x10000) {
4422            *p++ = (char)(0xe0 | (ch >> 12));
4423            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4424            *p++ = (char)(0x80 | (ch & 0x3f));
4425        } else /* ch >= 0x10000 */ {
4426            /* Encode UCS4 Unicode ordinals */
4427            *p++ = (char)(0xf0 | (ch >> 18));
4428            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4429            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4430            *p++ = (char)(0x80 | (ch & 0x3f));
4431#if SIZEOF_WCHAR_T == 2
4432            wchar_offset++;
4433#endif
4434        }
4435    }
4436
4437    if (result == NULL) {
4438        /* This was stack allocated. */
4439        nneeded = p - stackbuf;
4440        assert(nneeded <= nallocated);
4441        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4442    }
4443    else {
4444        /* Cut back to size actually needed. */
4445        nneeded = p - PyBytes_AS_STRING(result);
4446        assert(nneeded <= nallocated);
4447        _PyBytes_Resize(&result, nneeded);
4448    }
4449
4450    Py_XDECREF(errorHandler);
4451    Py_XDECREF(exc);
4452    return result;
4453 error:
4454    Py_XDECREF(errorHandler);
4455    Py_XDECREF(exc);
4456    Py_XDECREF(result);
4457    return NULL;
4458
4459#undef MAX_SHORT_UNICHARS
4460}
4461
4462PyObject *
4463PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4464                     Py_ssize_t size,
4465                     const char *errors)
4466{
4467    PyObject *v, *unicode;
4468
4469    unicode = PyUnicode_FromUnicode(s, size);
4470    if (unicode == NULL)
4471        return NULL;
4472    v = _PyUnicode_AsUTF8String(unicode, errors);
4473    Py_DECREF(unicode);
4474    return v;
4475}
4476
4477PyObject *
4478PyUnicode_AsUTF8String(PyObject *unicode)
4479{
4480    return _PyUnicode_AsUTF8String(unicode, NULL);
4481}
4482
4483/* --- UTF-32 Codec ------------------------------------------------------- */
4484
4485PyObject *
4486PyUnicode_DecodeUTF32(const char *s,
4487                      Py_ssize_t size,
4488                      const char *errors,
4489                      int *byteorder)
4490{
4491    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4492}
4493
4494PyObject *
4495PyUnicode_DecodeUTF32Stateful(const char *s,
4496                              Py_ssize_t size,
4497                              const char *errors,
4498                              int *byteorder,
4499                              Py_ssize_t *consumed)
4500{
4501    const char *starts = s;
4502    Py_ssize_t startinpos;
4503    Py_ssize_t endinpos;
4504    Py_ssize_t outpos;
4505    PyUnicodeObject *unicode;
4506    Py_UNICODE *p;
4507#ifndef Py_UNICODE_WIDE
4508    int pairs = 0;
4509    const unsigned char *qq;
4510#else
4511    const int pairs = 0;
4512#endif
4513    const unsigned char *q, *e;
4514    int bo = 0;       /* assume native ordering by default */
4515    const char *errmsg = "";
4516    /* Offsets from q for retrieving bytes in the right order. */
4517#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4518    int iorder[] = {0, 1, 2, 3};
4519#else
4520    int iorder[] = {3, 2, 1, 0};
4521#endif
4522    PyObject *errorHandler = NULL;
4523    PyObject *exc = NULL;
4524
4525    q = (unsigned char *)s;
4526    e = q + size;
4527
4528    if (byteorder)
4529        bo = *byteorder;
4530
4531    /* Check for BOM marks (U+FEFF) in the input and adjust current
4532       byte order setting accordingly. In native mode, the leading BOM
4533       mark is skipped, in all other modes, it is copied to the output
4534       stream as-is (giving a ZWNBSP character). */
4535    if (bo == 0) {
4536        if (size >= 4) {
4537            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4538                (q[iorder[1]] << 8) | q[iorder[0]];
4539#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4540            if (bom == 0x0000FEFF) {
4541                q += 4;
4542                bo = -1;
4543            }
4544            else if (bom == 0xFFFE0000) {
4545                q += 4;
4546                bo = 1;
4547            }
4548#else
4549            if (bom == 0x0000FEFF) {
4550                q += 4;
4551                bo = 1;
4552            }
4553            else if (bom == 0xFFFE0000) {
4554                q += 4;
4555                bo = -1;
4556            }
4557#endif
4558        }
4559    }
4560
4561    if (bo == -1) {
4562        /* force LE */
4563        iorder[0] = 0;
4564        iorder[1] = 1;
4565        iorder[2] = 2;
4566        iorder[3] = 3;
4567    }
4568    else if (bo == 1) {
4569        /* force BE */
4570        iorder[0] = 3;
4571        iorder[1] = 2;
4572        iorder[2] = 1;
4573        iorder[3] = 0;
4574    }
4575
4576    /* On narrow builds we split characters outside the BMP into two
4577       codepoints => count how much extra space we need. */
4578#ifndef Py_UNICODE_WIDE
4579    for (qq = q; qq < e; qq += 4)
4580        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4581            pairs++;
4582#endif
4583
4584    /* This might be one to much, because of a BOM */
4585    unicode = _PyUnicode_New((size+3)/4+pairs);
4586    if (!unicode)
4587        return NULL;
4588    if (size == 0)
4589        return (PyObject *)unicode;
4590
4591    /* Unpack UTF-32 encoded data */
4592    p = PyUnicode_AS_UNICODE(unicode);
4593
4594    while (q < e) {
4595        Py_UCS4 ch;
4596        /* remaining bytes at the end? (size should be divisible by 4) */
4597        if (e-q<4) {
4598            if (consumed)
4599                break;
4600            errmsg = "truncated data";
4601            startinpos = ((const char *)q)-starts;
4602            endinpos = ((const char *)e)-starts;
4603            goto utf32Error;
4604            /* The remaining input chars are ignored if the callback
4605               chooses to skip the input */
4606        }
4607        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4608            (q[iorder[1]] << 8) | q[iorder[0]];
4609
4610        if (ch >= 0x110000)
4611        {
4612            errmsg = "codepoint not in range(0x110000)";
4613            startinpos = ((const char *)q)-starts;
4614            endinpos = startinpos+4;
4615            goto utf32Error;
4616        }
4617#ifndef Py_UNICODE_WIDE
4618        if (ch >= 0x10000)
4619        {
4620            *p++ = 0xD800 | ((ch-0x10000) >> 10);
4621            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4622        }
4623        else
4624#endif
4625            *p++ = ch;
4626        q += 4;
4627        continue;
4628      utf32Error:
4629        outpos = p-PyUnicode_AS_UNICODE(unicode);
4630        if (unicode_decode_call_errorhandler(
4631                errors, &errorHandler,
4632                "utf32", errmsg,
4633                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4634                &unicode, &outpos, &p))
4635            goto onError;
4636    }
4637
4638    if (byteorder)
4639        *byteorder = bo;
4640
4641    if (consumed)
4642        *consumed = (const char *)q-starts;
4643
4644    /* Adjust length */
4645    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
4646        goto onError;
4647
4648    Py_XDECREF(errorHandler);
4649    Py_XDECREF(exc);
4650    if (PyUnicode_READY(unicode) == -1) {
4651        Py_DECREF(unicode);
4652        return NULL;
4653    }
4654    return (PyObject *)unicode;
4655
4656  onError:
4657    Py_DECREF(unicode);
4658    Py_XDECREF(errorHandler);
4659    Py_XDECREF(exc);
4660    return NULL;
4661}
4662
4663PyObject *
4664PyUnicode_EncodeUTF32(const Py_UNICODE *s,
4665                      Py_ssize_t size,
4666                      const char *errors,
4667                      int byteorder)
4668{
4669    PyObject *v;
4670    unsigned char *p;
4671    Py_ssize_t nsize, bytesize;
4672#ifndef Py_UNICODE_WIDE
4673    Py_ssize_t i, pairs;
4674#else
4675    const int pairs = 0;
4676#endif
4677    /* Offsets from p for storing byte pairs in the right order. */
4678#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4679    int iorder[] = {0, 1, 2, 3};
4680#else
4681    int iorder[] = {3, 2, 1, 0};
4682#endif
4683
4684#define STORECHAR(CH)                           \
4685    do {                                        \
4686        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
4687        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
4688        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
4689        p[iorder[0]] = (CH) & 0xff;             \
4690        p += 4;                                 \
4691    } while(0)
4692
4693    /* In narrow builds we can output surrogate pairs as one codepoint,
4694       so we need less space. */
4695#ifndef Py_UNICODE_WIDE
4696    for (i = pairs = 0; i < size-1; i++)
4697        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4698            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4699            pairs++;
4700#endif
4701    nsize = (size - pairs + (byteorder == 0));
4702    bytesize = nsize * 4;
4703    if (bytesize / 4 != nsize)
4704        return PyErr_NoMemory();
4705    v = PyBytes_FromStringAndSize(NULL, bytesize);
4706    if (v == NULL)
4707        return NULL;
4708
4709    p = (unsigned char *)PyBytes_AS_STRING(v);
4710    if (byteorder == 0)
4711        STORECHAR(0xFEFF);
4712    if (size == 0)
4713        goto done;
4714
4715    if (byteorder == -1) {
4716        /* force LE */
4717        iorder[0] = 0;
4718        iorder[1] = 1;
4719        iorder[2] = 2;
4720        iorder[3] = 3;
4721    }
4722    else if (byteorder == 1) {
4723        /* force BE */
4724        iorder[0] = 3;
4725        iorder[1] = 2;
4726        iorder[2] = 1;
4727        iorder[3] = 0;
4728    }
4729
4730    while (size-- > 0) {
4731        Py_UCS4 ch = *s++;
4732#ifndef Py_UNICODE_WIDE
4733        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4734            Py_UCS4 ch2 = *s;
4735            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4736                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4737                s++;
4738                size--;
4739            }
4740        }
4741#endif
4742        STORECHAR(ch);
4743    }
4744
4745  done:
4746    return v;
4747#undef STORECHAR
4748}
4749
4750PyObject *
4751PyUnicode_AsUTF32String(PyObject *unicode)
4752{
4753    if (!PyUnicode_Check(unicode)) {
4754        PyErr_BadArgument();
4755        return NULL;
4756    }
4757    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
4758                                 PyUnicode_GET_SIZE(unicode),
4759                                 NULL,
4760                                 0);
4761}
4762
4763/* --- UTF-16 Codec ------------------------------------------------------- */
4764
4765PyObject *
4766PyUnicode_DecodeUTF16(const char *s,
4767                      Py_ssize_t size,
4768                      const char *errors,
4769                      int *byteorder)
4770{
4771    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4772}
4773
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
   Each mask selects the top bit of every 16-bit unit in the 'long' (the
   top bit of the low byte in the swapped case).
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif

/* Decode `size` bytes of UTF-16 data at `s` into a unicode object.

   s         - input bytes.
   size      - number of input bytes.
   errors    - error handling name, forwarded to
               unicode_decode_call_errorhandler().
   byteorder - may be NULL.  On input, *byteorder selects the byte order:
               -1 forces little endian, 1 forces big endian, 0 starts with
               the native order and lets a leading BOM (U+FEFF) pick the
               order; the BOM is then skipped.  On success the byte order
               that was used is written back.
   consumed  - may be NULL.  When non-NULL, a single trailing byte is not
               reported as "truncated data", and *consumed receives the
               number of bytes processed.

   Returns the decoded unicode object, or NULL with an exception set.

   On Py_UNICODE_WIDE builds a valid surrogate pair is combined into one code
   point; otherwise both surrogates are stored as-is. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const unsigned char *q, *e, *aligned_end;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering = 0;
    const char *errmsg = "";
    /* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = 1, ilo = 0;
#else
    int ihi = 0, ilo = 1;
#endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    /* Unpack UTF-16 encoded data */
    p = PyUnicode_AS_UNICODE(unicode);
    q = (unsigned char *)s;
    /* e points at the LAST input byte (not one past it), so `q < e` means
       that at least two bytes are still available at q. */
    e = q + size - 1;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
        if (size >= 2) {
            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
            if (bom == 0xFEFF) {
                q += 2;
                bo = -1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = 1;
            }
#else
            if (bom == 0xFEFF) {
                q += 2;
                bo = 1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = -1;
            }
#endif
        }
    }

    if (bo == -1) {
        /* force LE */
        ihi = 1;
        ilo = 0;
    }
    else if (bo == 1) {
        /* force BE */
        ihi = 0;
        ilo = 1;
    }
    /* The input is in native order when the selected hi/lo offsets match
       the ones this platform starts with. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    native_ordering = ilo < ihi;
#else
    native_ordering = ilo > ihi;
#endif

    /* e with the LONG_PTR_MASK bits cleared; presumably this rounds e down
       to a 'long' boundary (LONG_PTR_MASK is defined outside this block). */
    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
    while (q < e) {
        Py_UNICODE ch;
        /* First check for possible aligned read of a C 'long'. Unaligned
           reads are more expensive, better to defer to another iteration. */
        if (!((size_t) q & LONG_PTR_MASK)) {
            /* Fast path for runs of non-surrogate chars. */
            register const unsigned char *_q = q;
            Py_UNICODE *_p = p;
            if (native_ordering) {
                /* Native ordering is simple: as long as the input cannot
                   possibly contain a surrogate char, do an unrolled copy
                   of several 16-bit code points to the target object.
                   The non-surrogate check is done on several input bytes
                   at a time (as many as a C 'long' can contain). */
                while (_q < aligned_end) {
                    unsigned long data = * (unsigned long *) _q;
                    /* Any unit with its top bit set ends the fast path. */
                    if (data & FAST_CHAR_MASK)
                        break;
                    _p[0] = ((unsigned short *) _q)[0];
                    _p[1] = ((unsigned short *) _q)[1];
#if (SIZEOF_LONG == 8)
                    _p[2] = ((unsigned short *) _q)[2];
                    _p[3] = ((unsigned short *) _q)[3];
#endif
                    _q += SIZEOF_LONG;
                    _p += SIZEOF_LONG / 2;
                }
            }
            else {
                /* Byteswapped ordering is similar, but we must decompose
                   the copy bytewise, and take care of zero'ing out the
                   upper bytes if the target object is in 32-bit units
                   (that is, in UCS-4 builds). */
                while (_q < aligned_end) {
                    unsigned long data = * (unsigned long *) _q;
                    if (data & SWAPPED_FAST_CHAR_MASK)
                        break;
                    /* Zero upper bytes in UCS-4 builds */
#if (Py_UNICODE_SIZE > 2)
                    _p[0] = 0;
                    _p[1] = 0;
#if (SIZEOF_LONG == 8)
                    _p[2] = 0;
                    _p[3] = 0;
#endif
#endif
                    /* Issue #4916; UCS-4 builds on big endian machines must
                       fill the two last bytes of each 4-byte unit. */
#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
# define OFF 2
#else
# define OFF 0
#endif
                    /* Swap the two bytes of every input unit while storing
                       them into the corresponding output unit. */
                    ((unsigned char *) _p)[OFF + 1] = _q[0];
                    ((unsigned char *) _p)[OFF + 0] = _q[1];
                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
#if (SIZEOF_LONG == 8)
                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
#endif
#undef OFF
                    _q += SIZEOF_LONG;
                    _p += SIZEOF_LONG / 2;
                }
            }
            /* Publish the progress made by the fast path. */
            p = _p;
            q = _q;
            if (q >= e)
                break;
        }
        /* Slow path: decode a single 16-bit unit. */
        ch = (q[ihi] << 8) | q[ilo];

        q += 2;

        if (ch < 0xD800 || ch > 0xDFFF) {
            *p++ = ch;
            continue;
        }

        /* UTF-16 code pair: */
        /* NOTE(review): this fires only when no byte is left after the
           surrogate.  When exactly one byte is left (q == e, possible for an
           odd `size`), the ch2 read below accesses q[1], i.e. s[size] --
           confirm that callers guarantee this byte is readable. */
        if (q > e) {
            errmsg = "unexpected end of data";
            startinpos = (((const char *)q) - 2) - starts;
            endinpos = ((const char *)e) + 1 - starts;
            goto utf16Error;
        }
        if (0xD800 <= ch && ch <= 0xDBFF) {
            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
            q += 2;
            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
                *p++ = ch;
                *p++ = ch2;
#else
                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
#endif
                continue;
            }
            else {
                /* High surrogate not followed by a low surrogate: report
                   only the two bytes of the high surrogate. */
                errmsg = "illegal UTF-16 surrogate";
                startinpos = (((const char *)q)-4)-starts;
                endinpos = startinpos+2;
                goto utf16Error;
            }

        }
        /* Reached for a low surrogate (0xDC00-0xDFFF) with no preceding
           high surrogate. */
        errmsg = "illegal encoding";
        startinpos = (((const char *)q)-2)-starts;
        endinpos = startinpos+2;
        /* Fall through to report the error */

      utf16Error:
        /* The handler receives the addresses of the input and output
           cursors (q, e, unicode, outpos, p), so they may have been updated
           when it returns. */
        outpos = p - PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors,
                &errorHandler,
                "utf16", errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &unicode,
                &outpos,
                &p))
            goto onError;
    }
    /* remaining byte at the end? (size should be even) */
    if (e == q) {
        /* In stateful mode (consumed != NULL) the odd byte is simply left
           unconsumed instead of being reported. */
        if (!consumed) {
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) + 1 - starts;
            outpos = p - PyUnicode_AS_UNICODE(unicode);
            if (unicode_decode_call_errorhandler(
                    errors,
                    &errorHandler,
                    "utf16", errmsg,
                    &starts,
                    (const char **)&e,
                    &startinpos,
                    &endinpos,
                    &exc,
                    (const char **)&q,
                    &unicode,
                    &outpos,
                    &p))
                goto onError;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        }
    }

    if (byteorder)
        *byteorder = bo;

    if (consumed)
        *consumed = (const char *)q-starts;

    /* Adjust length */
    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    if (PyUnicode_READY(unicode) == -1) {
        Py_DECREF(unicode);
        return NULL;
    }
    return (PyObject *)unicode;

  onError:
    Py_DECREF(unicode);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}

#undef FAST_CHAR_MASK
#undef SWAPPED_FAST_CHAR_MASK
5063
5064PyObject *
5065PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5066                      Py_ssize_t size,
5067                      const char *errors,
5068                      int byteorder)
5069{
5070    PyObject *v;
5071    unsigned char *p;
5072    Py_ssize_t nsize, bytesize;
5073#ifdef Py_UNICODE_WIDE
5074    Py_ssize_t i, pairs;
5075#else
5076    const int pairs = 0;
5077#endif
5078    /* Offsets from p for storing byte pairs in the right order. */
5079#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5080    int ihi = 1, ilo = 0;
5081#else
5082    int ihi = 0, ilo = 1;
5083#endif
5084
5085#define STORECHAR(CH)                           \
5086    do {                                        \
5087        p[ihi] = ((CH) >> 8) & 0xff;            \
5088        p[ilo] = (CH) & 0xff;                   \
5089        p += 2;                                 \
5090    } while(0)
5091
5092#ifdef Py_UNICODE_WIDE
5093    for (i = pairs = 0; i < size; i++)
5094        if (s[i] >= 0x10000)
5095            pairs++;
5096#endif
5097    /* 2 * (size + pairs + (byteorder == 0)) */
5098    if (size > PY_SSIZE_T_MAX ||
5099        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5100        return PyErr_NoMemory();
5101    nsize = size + pairs + (byteorder == 0);
5102    bytesize = nsize * 2;
5103    if (bytesize / 2 != nsize)
5104        return PyErr_NoMemory();
5105    v = PyBytes_FromStringAndSize(NULL, bytesize);
5106    if (v == NULL)
5107        return NULL;
5108
5109    p = (unsigned char *)PyBytes_AS_STRING(v);
5110    if (byteorder == 0)
5111        STORECHAR(0xFEFF);
5112    if (size == 0)
5113        goto done;
5114
5115    if (byteorder == -1) {
5116        /* force LE */
5117        ihi = 1;
5118        ilo = 0;
5119    }
5120    else if (byteorder == 1) {
5121        /* force BE */
5122        ihi = 0;
5123        ilo = 1;
5124    }
5125
5126    while (size-- > 0) {
5127        Py_UNICODE ch = *s++;
5128        Py_UNICODE ch2 = 0;
5129#ifdef Py_UNICODE_WIDE
5130        if (ch >= 0x10000) {
5131            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5132            ch  = 0xD800 | ((ch-0x10000) >> 10);
5133        }
5134#endif
5135        STORECHAR(ch);
5136        if (ch2)
5137            STORECHAR(ch2);
5138    }
5139
5140  done:
5141    return v;
5142#undef STORECHAR
5143}
5144
5145PyObject *
5146PyUnicode_AsUTF16String(PyObject *unicode)
5147{
5148    if (!PyUnicode_Check(unicode)) {
5149        PyErr_BadArgument();
5150        return NULL;
5151    }
5152    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
5153                                 PyUnicode_GET_SIZE(unicode),
5154                                 NULL,
5155                                 0);
5156}
5157
5158/* --- Unicode Escape Codec ----------------------------------------------- */
5159
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s    - input bytes (may contain backslash escapes).
   size - number of bytes in `s`.

   Returns -1 if the result cannot be guaranteed to be pure ASCII:
     - size is negative,
     - a byte > 127 occurs,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape occurs.
   Otherwise returns the length of the required buffer to hold the
   decoded string:
     - backslash + newline contributes 0 characters,
     - a recognised single-character escape contributes 1,
     - an unrecognised escape contributes 2 (backslash and the character),
     - any other byte contributes 1.
   */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before doing any pointer arithmetic with it. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        if (*p != '\\') {
            /* Normal character */
            ++length;
            continue;
        }

        /* Backslash-escape, check next char */
        ++p;
        /* Escape sequence reaches till end of string or
           non-ASCII follow-up. */
        if (p >= end || *p > 127)
            return -1;

        switch (*p) {
        case '\n':
            /* backslash + \n result in zero characters */
            break;
        case '\\': case '\'': case '\"':
        case 'b': case 'f': case 't':
        case 'n': case 'r': case 'v': case 'a':
            ++length;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* these do not guarantee ASCII characters */
            return -1;
        default:
            /* count the backslash + the other character */
            length += 2;
            break;
        }
    }
    return length;
}
5214
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.
   When `kind` is PyUnicode_WCHAR_KIND, `buf` is indexed as a Py_UNICODE
   array; for every other kind it is indexed as an unsigned char array and
   `value` is truncated to unsigned char.
   `kind` is evaluated once and only one of the two branches evaluates
   `buf`, `index` and `value`, so arguments with side effects (e.g. i++)
   take effect exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE at buf[index].
   The macro does not take `kind` as an argument: it asserts that a variable
   named `kind` in the expanding scope equals PyUnicode_WCHAR_KIND.
   It expands to a comma expression (assert, assignment) rather than a
   do/while statement. */
#define WRITE_WSTR(buf, index, value) \
    assert(kind == PyUnicode_WCHAR_KIND), \
    ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5228
5229
5230static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5231
5232PyObject *
5233PyUnicode_DecodeUnicodeEscape(const char *s,
5234                              Py_ssize_t size,
5235                              const char *errors)
5236{
5237    const char *starts = s;
5238    Py_ssize_t startinpos;
5239    Py_ssize_t endinpos;
5240    int j;
5241    PyUnicodeObject *v;
5242    Py_UNICODE *p;
5243    const char *end;
5244    char* message;
5245    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5246    PyObject *errorHandler = NULL;
5247    PyObject *exc = NULL;
5248    Py_ssize_t ascii_length;
5249    Py_ssize_t i;
5250    int kind;
5251    void *data;
5252
5253    ascii_length = length_of_escaped_ascii_string(s, size);
5254
5255    /* After length_of_escaped_ascii_string() there are two alternatives,
5256       either the string is pure ASCII with named escapes like \n, etc.
5257       and we determined it's exact size (common case)
5258       or it contains \x, \u, ... escape sequences.  then we create a
5259       legacy wchar string and resize it at the end of this function. */
5260    if (ascii_length >= 0) {
5261        v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5262        if (!v)
5263            goto onError;
5264        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5265        kind = PyUnicode_1BYTE_KIND;
5266        data = PyUnicode_DATA(v);
5267    }
5268    else {
5269        /* Escaped strings will always be longer than the resulting
5270           Unicode string, so we start with size here and then reduce the
5271           length after conversion to the true value.
5272           (but if the error callback returns a long replacement string
5273           we'll have to allocate more space) */
5274        v = _PyUnicode_New(size);
5275        if (!v)
5276            goto onError;
5277        kind = PyUnicode_WCHAR_KIND;
5278        data = PyUnicode_AS_UNICODE(v);
5279    }
5280
5281    if (size == 0)
5282        return (PyObject *)v;
5283    i = 0;
5284    end = s + size;
5285
5286    while (s < end) {
5287        unsigned char c;
5288        Py_UNICODE x;
5289        int digits;
5290
5291        if (kind == PyUnicode_WCHAR_KIND) {
5292            assert(i < _PyUnicode_WSTR_LENGTH(v));
5293        }
5294        else {
5295            /* The only case in which i == ascii_length is a backslash
5296               followed by a newline. */
5297            assert(i <= ascii_length);
5298        }
5299
5300        /* Non-escape characters are interpreted as Unicode ordinals */
5301        if (*s != '\\') {
5302            WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
5303            continue;
5304        }
5305
5306        startinpos = s-starts;
5307        /* \ - Escapes */
5308        s++;
5309        c = *s++;
5310        if (s > end)
5311            c = '\0'; /* Invalid after \ */
5312
5313        if (kind == PyUnicode_WCHAR_KIND) {
5314            assert(i < _PyUnicode_WSTR_LENGTH(v));
5315        }
5316        else {
5317            /* The only case in which i == ascii_length is a backslash
5318               followed by a newline. */
5319            assert(i < ascii_length || (i == ascii_length && c == '\n'));
5320        }
5321
5322        switch (c) {
5323
5324            /* \x escapes */
5325        case '\n': break;
5326        case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5327        case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5328        case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5329        case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5330        /* FF */
5331        case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5332        case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5333        case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5334        case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5335        /* VT */
5336        case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5337        /* BEL, not classic C */
5338        case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
5339
5340            /* \OOO (octal) escapes */
5341        case '0': case '1': case '2': case '3':
5342        case '4': case '5': case '6': case '7':
5343            x = s[-1] - '0';
5344            if (s < end && '0' <= *s && *s <= '7') {
5345                x = (x<<3) + *s++ - '0';
5346                if (s < end && '0' <= *s && *s <= '7')
5347                    x = (x<<3) + *s++ - '0';
5348            }
5349            WRITE_WSTR(data, i++, x);
5350            break;
5351
5352            /* hex escapes */
5353            /* \xXX */
5354        case 'x':
5355            digits = 2;
5356            message = "truncated \\xXX escape";
5357            goto hexescape;
5358
5359            /* \uXXXX */
5360        case 'u':
5361            digits = 4;
5362            message = "truncated \\uXXXX escape";
5363            goto hexescape;
5364
5365            /* \UXXXXXXXX */
5366        case 'U':
5367            digits = 8;
5368            message = "truncated \\UXXXXXXXX escape";
5369        hexescape:
5370            chr = 0;
5371            p = PyUnicode_AS_UNICODE(v) + i;
5372            if (s+digits>end) {
5373                endinpos = size;
5374                if (unicode_decode_call_errorhandler(
5375                        errors, &errorHandler,
5376                        "unicodeescape", "end of string in escape sequence",
5377                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5378                        &v, &i, &p))
5379                    goto onError;
5380                data = PyUnicode_AS_UNICODE(v);
5381                goto nextByte;
5382            }
5383            for (j = 0; j < digits; ++j) {
5384                c = (unsigned char) s[j];
5385                if (!Py_ISXDIGIT(c)) {
5386                    endinpos = (s+j+1)-starts;
5387                    p = PyUnicode_AS_UNICODE(v) + i;
5388                    if (unicode_decode_call_errorhandler(
5389                            errors, &errorHandler,
5390                            "unicodeescape", message,
5391                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5392                            &v, &i, &p))
5393                        goto onError;
5394                    data = PyUnicode_AS_UNICODE(v);
5395                    goto nextByte;
5396                }
5397                chr = (chr<<4) & ~0xF;
5398                if (c >= '0' && c <= '9')
5399                    chr += c - '0';
5400                else if (c >= 'a' && c <= 'f')
5401                    chr += 10 + c - 'a';
5402                else
5403                    chr += 10 + c - 'A';
5404            }
5405            s += j;
5406            if (chr == 0xffffffff && PyErr_Occurred())
5407                /* _decoding_error will have already written into the
5408                   target buffer. */
5409                break;
5410        store:
5411            /* when we get here, chr is a 32-bit unicode character */
5412            if (chr <= 0xffff)
5413                /* UCS-2 character */
5414                WRITE_WSTR(data, i++, chr);
5415            else if (chr <= 0x10ffff) {
5416                /* UCS-4 character. Either store directly, or as
5417                   surrogate pair. */
5418#ifdef Py_UNICODE_WIDE
5419                WRITE_WSTR(data, i++, chr);
5420#else
5421                chr -= 0x10000L;
5422                WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5423                WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
5424#endif
5425            } else {
5426                endinpos = s-starts;
5427                p = PyUnicode_AS_UNICODE(v) + i;
5428                if (unicode_decode_call_errorhandler(
5429                        errors, &errorHandler,
5430                        "unicodeescape", "illegal Unicode character",
5431                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5432                        &v, &i, &p))
5433                    goto onError;
5434                data = PyUnicode_AS_UNICODE(v);
5435            }
5436            break;
5437
5438            /* \N{name} */
5439        case 'N':
5440            message = "malformed \\N character escape";
5441            if (ucnhash_CAPI == NULL) {
5442                /* load the unicode data module */
5443                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5444                                                PyUnicodeData_CAPSULE_NAME, 1);
5445                if (ucnhash_CAPI == NULL)
5446                    goto ucnhashError;
5447            }
5448            if (*s == '{') {
5449                const char *start = s+1;
5450                /* look for the closing brace */
5451                while (*s != '}' && s < end)
5452                    s++;
5453                if (s > start && s < end && *s == '}') {
5454                    /* found a name.  look it up in the unicode database */
5455                    message = "unknown Unicode character name";
5456                    s++;
5457                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5458                                              &chr))
5459                        goto store;
5460                }
5461            }
5462            endinpos = s-starts;
5463            p = PyUnicode_AS_UNICODE(v) + i;
5464            if (unicode_decode_call_errorhandler(
5465                    errors, &errorHandler,
5466                    "unicodeescape", message,
5467                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5468                    &v, &i, &p))
5469                goto onError;
5470            data = PyUnicode_AS_UNICODE(v);
5471            break;
5472
5473        default:
5474            if (s > end) {
5475                assert(kind == PyUnicode_WCHAR_KIND);
5476                message = "\\ at end of string";
5477                s--;
5478                endinpos = s-starts;
5479                p = PyUnicode_AS_UNICODE(v) + i;
5480                if (unicode_decode_call_errorhandler(
5481                        errors, &errorHandler,
5482                        "unicodeescape", message,
5483                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5484                        &v, &i, &p))
5485                    goto onError;
5486                data = PyUnicode_AS_UNICODE(v);
5487            }
5488            else {
5489                WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5490                WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
5491            }
5492            break;
5493        }
5494      nextByte:
5495        ;
5496    }
5497    /* Ensure the length prediction worked in case of ASCII strings */
5498    assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5499
5500    if (kind == PyUnicode_WCHAR_KIND)
5501    {
5502        if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5503            goto onError;
5504        if (PyUnicode_READY(v) == -1)
5505            goto onError;
5506    }
5507    Py_XDECREF(errorHandler);
5508    Py_XDECREF(exc);
5509    return (PyObject *)v;
5510
5511  ucnhashError:
5512    PyErr_SetString(
5513        PyExc_UnicodeError,
5514        "\\N escapes not supported (can't load unicodedata module)"
5515        );
5516    Py_XDECREF(v);
5517    Py_XDECREF(errorHandler);
5518    Py_XDECREF(exc);
5519    return NULL;
5520
5521  onError:
5522    Py_XDECREF(v);
5523    Py_XDECREF(errorHandler);
5524    Py_XDECREF(exc);
5525    return NULL;
5526}
5527
5528#undef WRITE_ASCII_OR_WSTR
5529#undef WRITE_WSTR
5530
5531/* Return a Unicode-Escape string version of the Unicode object.
5532
5533   If quotes is true, the string is enclosed in u"" or u'' quotes as
5534   appropriate.
5535
5536*/
5537
5538static const char *hexdigits = "0123456789abcdef";
5539
5540PyObject *
5541PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5542                              Py_ssize_t size)
5543{
5544    PyObject *repr;
5545    char *p;
5546
5547#ifdef Py_UNICODE_WIDE
5548    const Py_ssize_t expandsize = 10;
5549#else
5550    const Py_ssize_t expandsize = 6;
5551#endif
5552
5553    /* XXX(nnorwitz): rather than over-allocating, it would be
5554       better to choose a different scheme.  Perhaps scan the
5555       first N-chars of the string and allocate based on that size.
5556    */
5557    /* Initial allocation is based on the longest-possible unichr
5558       escape.
5559
5560       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5561       unichr, so in this case it's the longest unichr escape. In
5562       narrow (UTF-16) builds this is five chars per source unichr
5563       since there are two unichrs in the surrogate pair, so in narrow
5564       (UTF-16) builds it's not the longest unichr escape.
5565
5566       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5567       so in the narrow (UTF-16) build case it's the longest unichr
5568       escape.
5569    */
5570
5571    if (size == 0)
5572        return PyBytes_FromStringAndSize(NULL, 0);
5573
5574    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5575        return PyErr_NoMemory();
5576
5577    repr = PyBytes_FromStringAndSize(NULL,
5578                                     2
5579                                     + expandsize*size
5580                                     + 1);
5581    if (repr == NULL)
5582        return NULL;
5583
5584    p = PyBytes_AS_STRING(repr);
5585
5586    while (size-- > 0) {
5587        Py_UNICODE ch = *s++;
5588
5589        /* Escape backslashes */
5590        if (ch == '\\') {
5591            *p++ = '\\';
5592            *p++ = (char) ch;
5593            continue;
5594        }
5595
5596#ifdef Py_UNICODE_WIDE
5597        /* Map 21-bit characters to '\U00xxxxxx' */
5598        else if (ch >= 0x10000) {
5599            *p++ = '\\';
5600            *p++ = 'U';
5601            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5602            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5603            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5604            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5605            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5606            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5607            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5608            *p++ = hexdigits[ch & 0x0000000F];
5609            continue;
5610        }
5611#else
5612        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5613        else if (ch >= 0xD800 && ch < 0xDC00) {
5614            Py_UNICODE ch2;
5615            Py_UCS4 ucs;
5616
5617            ch2 = *s++;
5618            size--;
5619            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5620                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5621                *p++ = '\\';
5622                *p++ = 'U';
5623                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5624                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5625                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5626                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5627                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5628                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5629                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5630                *p++ = hexdigits[ucs & 0x0000000F];
5631                continue;
5632            }
5633            /* Fall through: isolated surrogates are copied as-is */
5634            s--;
5635            size++;
5636        }
5637#endif
5638
5639        /* Map 16-bit characters to '\uxxxx' */
5640        if (ch >= 256) {
5641            *p++ = '\\';
5642            *p++ = 'u';
5643            *p++ = hexdigits[(ch >> 12) & 0x000F];
5644            *p++ = hexdigits[(ch >> 8) & 0x000F];
5645            *p++ = hexdigits[(ch >> 4) & 0x000F];
5646            *p++ = hexdigits[ch & 0x000F];
5647        }
5648
5649        /* Map special whitespace to '\t', \n', '\r' */
5650        else if (ch == '\t') {
5651            *p++ = '\\';
5652            *p++ = 't';
5653        }
5654        else if (ch == '\n') {
5655            *p++ = '\\';
5656            *p++ = 'n';
5657        }
5658        else if (ch == '\r') {
5659            *p++ = '\\';
5660            *p++ = 'r';
5661        }
5662
5663        /* Map non-printable US ASCII to '\xhh' */
5664        else if (ch < ' ' || ch >= 0x7F) {
5665            *p++ = '\\';
5666            *p++ = 'x';
5667            *p++ = hexdigits[(ch >> 4) & 0x000F];
5668            *p++ = hexdigits[ch & 0x000F];
5669        }
5670
5671        /* Copy everything else as-is */
5672        else
5673            *p++ = (char) ch;
5674    }
5675
5676    assert(p - PyBytes_AS_STRING(repr) > 0);
5677    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5678        return NULL;
5679    return repr;
5680}
5681
5682PyObject *
5683PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5684{
5685    PyObject *s;
5686    if (!PyUnicode_Check(unicode)) {
5687        PyErr_BadArgument();
5688        return NULL;
5689    }
5690    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5691                                      PyUnicode_GET_SIZE(unicode));
5692    return s;
5693}
5694
5695/* --- Raw Unicode Escape Codec ------------------------------------------- */
5696
5697PyObject *
5698PyUnicode_DecodeRawUnicodeEscape(const char *s,
5699                                 Py_ssize_t size,
5700                                 const char *errors)
5701{
5702    const char *starts = s;
5703    Py_ssize_t startinpos;
5704    Py_ssize_t endinpos;
5705    Py_ssize_t outpos;
5706    PyUnicodeObject *v;
5707    Py_UNICODE *p;
5708    const char *end;
5709    const char *bs;
5710    PyObject *errorHandler = NULL;
5711    PyObject *exc = NULL;
5712
5713    /* Escaped strings will always be longer than the resulting
5714       Unicode string, so we start with size here and then reduce the
5715       length after conversion to the true value. (But decoding error
5716       handler might have to resize the string) */
5717    v = _PyUnicode_New(size);
5718    if (v == NULL)
5719        goto onError;
5720    if (size == 0)
5721        return (PyObject *)v;
5722    p = PyUnicode_AS_UNICODE(v);
5723    end = s + size;
5724    while (s < end) {
5725        unsigned char c;
5726        Py_UCS4 x;
5727        int i;
5728        int count;
5729
5730        /* Non-escape characters are interpreted as Unicode ordinals */
5731        if (*s != '\\') {
5732            *p++ = (unsigned char)*s++;
5733            continue;
5734        }
5735        startinpos = s-starts;
5736
5737        /* \u-escapes are only interpreted iff the number of leading
5738           backslashes if odd */
5739        bs = s;
5740        for (;s < end;) {
5741            if (*s != '\\')
5742                break;
5743            *p++ = (unsigned char)*s++;
5744        }
5745        if (((s - bs) & 1) == 0 ||
5746            s >= end ||
5747            (*s != 'u' && *s != 'U')) {
5748            continue;
5749        }
5750        p--;
5751        count = *s=='u' ? 4 : 8;
5752        s++;
5753
5754        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5755        outpos = p-PyUnicode_AS_UNICODE(v);
5756        for (x = 0, i = 0; i < count; ++i, ++s) {
5757            c = (unsigned char)*s;
5758            if (!Py_ISXDIGIT(c)) {
5759                endinpos = s-starts;
5760                if (unicode_decode_call_errorhandler(
5761                        errors, &errorHandler,
5762                        "rawunicodeescape", "truncated \\uXXXX",
5763                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5764                        &v, &outpos, &p))
5765                    goto onError;
5766                goto nextByte;
5767            }
5768            x = (x<<4) & ~0xF;
5769            if (c >= '0' && c <= '9')
5770                x += c - '0';
5771            else if (c >= 'a' && c <= 'f')
5772                x += 10 + c - 'a';
5773            else
5774                x += 10 + c - 'A';
5775        }
5776        if (x <= 0xffff)
5777            /* UCS-2 character */
5778            *p++ = (Py_UNICODE) x;
5779        else if (x <= 0x10ffff) {
5780            /* UCS-4 character. Either store directly, or as
5781               surrogate pair. */
5782#ifdef Py_UNICODE_WIDE
5783            *p++ = (Py_UNICODE) x;
5784#else
5785            x -= 0x10000L;
5786            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5787            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
5788#endif
5789        } else {
5790            endinpos = s-starts;
5791            outpos = p-PyUnicode_AS_UNICODE(v);
5792            if (unicode_decode_call_errorhandler(
5793                    errors, &errorHandler,
5794                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5795                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5796                    &v, &outpos, &p))
5797                goto onError;
5798        }
5799      nextByte:
5800        ;
5801    }
5802    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5803        goto onError;
5804    Py_XDECREF(errorHandler);
5805    Py_XDECREF(exc);
5806    if (PyUnicode_READY(v) == -1) {
5807        Py_DECREF(v);
5808        return NULL;
5809    }
5810    return (PyObject *)v;
5811
5812  onError:
5813    Py_XDECREF(v);
5814    Py_XDECREF(errorHandler);
5815    Py_XDECREF(exc);
5816    return NULL;
5817}
5818
5819PyObject *
5820PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5821                                 Py_ssize_t size)
5822{
5823    PyObject *repr;
5824    char *p;
5825    char *q;
5826
5827#ifdef Py_UNICODE_WIDE
5828    const Py_ssize_t expandsize = 10;
5829#else
5830    const Py_ssize_t expandsize = 6;
5831#endif
5832
5833    if (size > PY_SSIZE_T_MAX / expandsize)
5834        return PyErr_NoMemory();
5835
5836    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
5837    if (repr == NULL)
5838        return NULL;
5839    if (size == 0)
5840        return repr;
5841
5842    p = q = PyBytes_AS_STRING(repr);
5843    while (size-- > 0) {
5844        Py_UNICODE ch = *s++;
5845#ifdef Py_UNICODE_WIDE
5846        /* Map 32-bit characters to '\Uxxxxxxxx' */
5847        if (ch >= 0x10000) {
5848            *p++ = '\\';
5849            *p++ = 'U';
5850            *p++ = hexdigits[(ch >> 28) & 0xf];
5851            *p++ = hexdigits[(ch >> 24) & 0xf];
5852            *p++ = hexdigits[(ch >> 20) & 0xf];
5853            *p++ = hexdigits[(ch >> 16) & 0xf];
5854            *p++ = hexdigits[(ch >> 12) & 0xf];
5855            *p++ = hexdigits[(ch >> 8) & 0xf];
5856            *p++ = hexdigits[(ch >> 4) & 0xf];
5857            *p++ = hexdigits[ch & 15];
5858        }
5859        else
5860#else
5861            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5862            if (ch >= 0xD800 && ch < 0xDC00) {
5863                Py_UNICODE ch2;
5864                Py_UCS4 ucs;
5865
5866                ch2 = *s++;
5867                size--;
5868                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5869                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5870                    *p++ = '\\';
5871                    *p++ = 'U';
5872                    *p++ = hexdigits[(ucs >> 28) & 0xf];
5873                    *p++ = hexdigits[(ucs >> 24) & 0xf];
5874                    *p++ = hexdigits[(ucs >> 20) & 0xf];
5875                    *p++ = hexdigits[(ucs >> 16) & 0xf];
5876                    *p++ = hexdigits[(ucs >> 12) & 0xf];
5877                    *p++ = hexdigits[(ucs >> 8) & 0xf];
5878                    *p++ = hexdigits[(ucs >> 4) & 0xf];
5879                    *p++ = hexdigits[ucs & 0xf];
5880                    continue;
5881                }
5882                /* Fall through: isolated surrogates are copied as-is */
5883                s--;
5884                size++;
5885            }
5886#endif
5887        /* Map 16-bit characters to '\uxxxx' */
5888        if (ch >= 256) {
5889            *p++ = '\\';
5890            *p++ = 'u';
5891            *p++ = hexdigits[(ch >> 12) & 0xf];
5892            *p++ = hexdigits[(ch >> 8) & 0xf];
5893            *p++ = hexdigits[(ch >> 4) & 0xf];
5894            *p++ = hexdigits[ch & 15];
5895        }
5896        /* Copy everything else as-is */
5897        else
5898            *p++ = (char) ch;
5899    }
5900    size = p - q;
5901
5902    assert(size > 0);
5903    if (_PyBytes_Resize(&repr, size) < 0)
5904        return NULL;
5905    return repr;
5906}
5907
5908PyObject *
5909PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
5910{
5911    PyObject *s;
5912    if (!PyUnicode_Check(unicode)) {
5913        PyErr_BadArgument();
5914        return NULL;
5915    }
5916    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5917                                         PyUnicode_GET_SIZE(unicode));
5918
5919    return s;
5920}
5921
5922/* --- Unicode Internal Codec ------------------------------------------- */
5923
5924PyObject *
5925_PyUnicode_DecodeUnicodeInternal(const char *s,
5926                                 Py_ssize_t size,
5927                                 const char *errors)
5928{
5929    const char *starts = s;
5930    Py_ssize_t startinpos;
5931    Py_ssize_t endinpos;
5932    Py_ssize_t outpos;
5933    PyUnicodeObject *v;
5934    Py_UNICODE *p;
5935    const char *end;
5936    const char *reason;
5937    PyObject *errorHandler = NULL;
5938    PyObject *exc = NULL;
5939
5940#ifdef Py_UNICODE_WIDE
5941    Py_UNICODE unimax = PyUnicode_GetMax();
5942#endif
5943
5944    /* XXX overflow detection missing */
5945    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5946    if (v == NULL)
5947        goto onError;
5948    /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5949       as string was created with the old API. */
5950    if (PyUnicode_GET_SIZE(v) == 0)
5951        return (PyObject *)v;
5952    p = PyUnicode_AS_UNICODE(v);
5953    end = s + size;
5954
5955    while (s < end) {
5956        memcpy(p, s, sizeof(Py_UNICODE));
5957        /* We have to sanity check the raw data, otherwise doom looms for
5958           some malformed UCS-4 data. */
5959        if (
5960#ifdef Py_UNICODE_WIDE
5961            *p > unimax || *p < 0 ||
5962#endif
5963            end-s < Py_UNICODE_SIZE
5964            )
5965        {
5966            startinpos = s - starts;
5967            if (end-s < Py_UNICODE_SIZE) {
5968                endinpos = end-starts;
5969                reason = "truncated input";
5970            }
5971            else {
5972                endinpos = s - starts + Py_UNICODE_SIZE;
5973                reason = "illegal code point (> 0x10FFFF)";
5974            }
5975            outpos = p - PyUnicode_AS_UNICODE(v);
5976            if (unicode_decode_call_errorhandler(
5977                    errors, &errorHandler,
5978                    "unicode_internal", reason,
5979                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5980                    &v, &outpos, &p)) {
5981                goto onError;
5982            }
5983        }
5984        else {
5985            p++;
5986            s += Py_UNICODE_SIZE;
5987        }
5988    }
5989
5990    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5991        goto onError;
5992    Py_XDECREF(errorHandler);
5993    Py_XDECREF(exc);
5994    if (PyUnicode_READY(v) == -1) {
5995        Py_DECREF(v);
5996        return NULL;
5997    }
5998    return (PyObject *)v;
5999
6000  onError:
6001    Py_XDECREF(v);
6002    Py_XDECREF(errorHandler);
6003    Py_XDECREF(exc);
6004    return NULL;
6005}
6006
6007/* --- Latin-1 Codec ------------------------------------------------------ */
6008
6009PyObject *
6010PyUnicode_DecodeLatin1(const char *s,
6011                       Py_ssize_t size,
6012                       const char *errors)
6013{
6014    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6015    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6016}
6017
6018/* create or adjust a UnicodeEncodeError */
6019static void
6020make_encode_exception(PyObject **exceptionObject,
6021                      const char *encoding,
6022                      const Py_UNICODE *unicode, Py_ssize_t size,
6023                      Py_ssize_t startpos, Py_ssize_t endpos,
6024                      const char *reason)
6025{
6026    if (*exceptionObject == NULL) {
6027        *exceptionObject = PyUnicodeEncodeError_Create(
6028            encoding, unicode, size, startpos, endpos, reason);
6029    }
6030    else {
6031        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6032            goto onError;
6033        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6034            goto onError;
6035        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6036            goto onError;
6037        return;
6038      onError:
6039        Py_DECREF(*exceptionObject);
6040        *exceptionObject = NULL;
6041    }
6042}
6043
6044/* raises a UnicodeEncodeError */
6045static void
6046raise_encode_exception(PyObject **exceptionObject,
6047                       const char *encoding,
6048                       const Py_UNICODE *unicode, Py_ssize_t size,
6049                       Py_ssize_t startpos, Py_ssize_t endpos,
6050                       const char *reason)
6051{
6052    make_encode_exception(exceptionObject,
6053                          encoding, unicode, size, startpos, endpos, reason);
6054    if (*exceptionObject != NULL)
6055        PyCodec_StrictErrors(*exceptionObject);
6056}
6057
6058/* error handling callback helper:
6059   build arguments, call the callback and check the arguments,
6060   put the result into newpos and return the replacement string, which
6061   has to be freed by the caller */
6062static PyObject *
6063unicode_encode_call_errorhandler(const char *errors,
6064                                 PyObject **errorHandler,
6065                                 const char *encoding, const char *reason,
6066                                 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6067                                 Py_ssize_t startpos, Py_ssize_t endpos,
6068                                 Py_ssize_t *newpos)
6069{
6070    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6071
6072    PyObject *restuple;
6073    PyObject *resunicode;
6074
6075    if (*errorHandler == NULL) {
6076        *errorHandler = PyCodec_LookupError(errors);
6077        if (*errorHandler == NULL)
6078            return NULL;
6079    }
6080
6081    make_encode_exception(exceptionObject,
6082                          encoding, unicode, size, startpos, endpos, reason);
6083    if (*exceptionObject == NULL)
6084        return NULL;
6085
6086    restuple = PyObject_CallFunctionObjArgs(
6087        *errorHandler, *exceptionObject, NULL);
6088    if (restuple == NULL)
6089        return NULL;
6090    if (!PyTuple_Check(restuple)) {
6091        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6092        Py_DECREF(restuple);
6093        return NULL;
6094    }
6095    if (!PyArg_ParseTuple(restuple, argparse,
6096                          &resunicode, newpos)) {
6097        Py_DECREF(restuple);
6098        return NULL;
6099    }
6100    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6101        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6102        Py_DECREF(restuple);
6103        return NULL;
6104    }
6105    if (*newpos<0)
6106        *newpos = size+*newpos;
6107    if (*newpos<0 || *newpos>size) {
6108        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6109        Py_DECREF(restuple);
6110        return NULL;
6111    }
6112    Py_INCREF(resunicode);
6113    Py_DECREF(restuple);
6114    return resunicode;
6115}
6116
6117static PyObject *
6118unicode_encode_ucs1(const Py_UNICODE *p,
6119                    Py_ssize_t size,
6120                    const char *errors,
6121                    int limit)
6122{
6123    /* output object */
6124    PyObject *res;
6125    /* pointers to the beginning and end+1 of input */
6126    const Py_UNICODE *startp = p;
6127    const Py_UNICODE *endp = p + size;
6128    /* pointer to the beginning of the unencodable characters */
6129    /* const Py_UNICODE *badp = NULL; */
6130    /* pointer into the output */
6131    char *str;
6132    /* current output position */
6133    Py_ssize_t ressize;
6134    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6135    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6136    PyObject *errorHandler = NULL;
6137    PyObject *exc = NULL;
6138    /* the following variable is used for caching string comparisons
6139     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6140    int known_errorHandler = -1;
6141
6142    /* allocate enough for a simple encoding without
6143       replacements, if we need more, we'll resize */
6144    if (size == 0)
6145        return PyBytes_FromStringAndSize(NULL, 0);
6146    res = PyBytes_FromStringAndSize(NULL, size);
6147    if (res == NULL)
6148        return NULL;
6149    str = PyBytes_AS_STRING(res);
6150    ressize = size;
6151
6152    while (p<endp) {
6153        Py_UNICODE c = *p;
6154
6155        /* can we encode this? */
6156        if (c<limit) {
6157            /* no overflow check, because we know that the space is enough */
6158            *str++ = (char)c;
6159            ++p;
6160        }
6161        else {
6162            Py_ssize_t unicodepos = p-startp;
6163            Py_ssize_t requiredsize;
6164            PyObject *repunicode;
6165            Py_ssize_t repsize;
6166            Py_ssize_t newpos;
6167            Py_ssize_t respos;
6168            Py_UNICODE *uni2;
6169            /* startpos for collecting unencodable chars */
6170            const Py_UNICODE *collstart = p;
6171            const Py_UNICODE *collend = p;
6172            /* find all unecodable characters */
6173            while ((collend < endp) && ((*collend)>=limit))
6174                ++collend;
6175            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6176            if (known_errorHandler==-1) {
6177                if ((errors==NULL) || (!strcmp(errors, "strict")))
6178                    known_errorHandler = 1;
6179                else if (!strcmp(errors, "replace"))
6180                    known_errorHandler = 2;
6181                else if (!strcmp(errors, "ignore"))
6182                    known_errorHandler = 3;
6183                else if (!strcmp(errors, "xmlcharrefreplace"))
6184                    known_errorHandler = 4;
6185                else
6186                    known_errorHandler = 0;
6187            }
6188            switch (known_errorHandler) {
6189            case 1: /* strict */
6190                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6191                goto onError;
6192            case 2: /* replace */
6193                while (collstart++<collend)
6194                    *str++ = '?'; /* fall through */
6195            case 3: /* ignore */
6196                p = collend;
6197                break;
6198            case 4: /* xmlcharrefreplace */
6199                respos = str - PyBytes_AS_STRING(res);
6200                /* determine replacement size (temporarily (mis)uses p) */
6201                for (p = collstart, repsize = 0; p < collend; ++p) {
6202                    if (*p<10)
6203                        repsize += 2+1+1;
6204                    else if (*p<100)
6205                        repsize += 2+2+1;
6206                    else if (*p<1000)
6207                        repsize += 2+3+1;
6208                    else if (*p<10000)
6209                        repsize += 2+4+1;
6210#ifndef Py_UNICODE_WIDE
6211                    else
6212                        repsize += 2+5+1;
6213#else
6214                    else if (*p<100000)
6215                        repsize += 2+5+1;
6216                    else if (*p<1000000)
6217                        repsize += 2+6+1;
6218                    else
6219                        repsize += 2+7+1;
6220#endif
6221                }
6222                requiredsize = respos+repsize+(endp-collend);
6223                if (requiredsize > ressize) {
6224                    if (requiredsize<2*ressize)
6225                        requiredsize = 2*ressize;
6226                    if (_PyBytes_Resize(&res, requiredsize))
6227                        goto onError;
6228                    str = PyBytes_AS_STRING(res) + respos;
6229                    ressize = requiredsize;
6230                }
6231                /* generate replacement (temporarily (mis)uses p) */
6232                for (p = collstart; p < collend; ++p) {
6233                    str += sprintf(str, "&#%d;", (int)*p);
6234                }
6235                p = collend;
6236                break;
6237            default:
6238                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6239                                                              encoding, reason, startp, size, &exc,
6240                                                              collstart-startp, collend-startp, &newpos);
6241                if (repunicode == NULL)
6242                    goto onError;
6243                if (PyBytes_Check(repunicode)) {
6244                    /* Directly copy bytes result to output. */
6245                    repsize = PyBytes_Size(repunicode);
6246                    if (repsize > 1) {
6247                        /* Make room for all additional bytes. */
6248                        respos = str - PyBytes_AS_STRING(res);
6249                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6250                            Py_DECREF(repunicode);
6251                            goto onError;
6252                        }
6253                        str = PyBytes_AS_STRING(res) + respos;
6254                        ressize += repsize-1;
6255                    }
6256                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6257                    str += repsize;
6258                    p = startp + newpos;
6259                    Py_DECREF(repunicode);
6260                    break;
6261                }
6262                /* need more space? (at least enough for what we
6263                   have+the replacement+the rest of the string, so
6264                   we won't have to check space for encodable characters) */
6265                respos = str - PyBytes_AS_STRING(res);
6266                repsize = PyUnicode_GET_SIZE(repunicode);
6267                requiredsize = respos+repsize+(endp-collend);
6268                if (requiredsize > ressize) {
6269                    if (requiredsize<2*ressize)
6270                        requiredsize = 2*ressize;
6271                    if (_PyBytes_Resize(&res, requiredsize)) {
6272                        Py_DECREF(repunicode);
6273                        goto onError;
6274                    }
6275                    str = PyBytes_AS_STRING(res) + respos;
6276                    ressize = requiredsize;
6277                }
6278                /* check if there is anything unencodable in the replacement
6279                   and copy it to the output */
6280                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6281                    c = *uni2;
6282                    if (c >= limit) {
6283                        raise_encode_exception(&exc, encoding, startp, size,
6284                                               unicodepos, unicodepos+1, reason);
6285                        Py_DECREF(repunicode);
6286                        goto onError;
6287                    }
6288                    *str = (char)c;
6289                }
6290                p = startp + newpos;
6291                Py_DECREF(repunicode);
6292            }
6293        }
6294    }
6295    /* Resize if we allocated to much */
6296    size = str - PyBytes_AS_STRING(res);
6297    if (size < ressize) { /* If this falls res will be NULL */
6298        assert(size >= 0);
6299        if (_PyBytes_Resize(&res, size) < 0)
6300            goto onError;
6301    }
6302
6303    Py_XDECREF(errorHandler);
6304    Py_XDECREF(exc);
6305    return res;
6306
6307  onError:
6308    Py_XDECREF(res);
6309    Py_XDECREF(errorHandler);
6310    Py_XDECREF(exc);
6311    return NULL;
6312}
6313
6314PyObject *
6315PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6316                       Py_ssize_t size,
6317                       const char *errors)
6318{
6319    return unicode_encode_ucs1(p, size, errors, 256);
6320}
6321
6322PyObject *
6323_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6324{
6325    if (!PyUnicode_Check(unicode)) {
6326        PyErr_BadArgument();
6327        return NULL;
6328    }
6329    if (PyUnicode_READY(unicode) == -1)
6330        return NULL;
6331    /* Fast path: if it is a one-byte string, construct
6332       bytes object directly. */
6333    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6334        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6335                                         PyUnicode_GET_LENGTH(unicode));
6336    /* Non-Latin-1 characters present. Defer to above function to
6337       raise the exception. */
6338    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
6339                                  PyUnicode_GET_SIZE(unicode),
6340                                  errors);
6341}
6342
6343PyObject*
6344PyUnicode_AsLatin1String(PyObject *unicode)
6345{
6346    return _PyUnicode_AsLatin1String(unicode, NULL);
6347}
6348
6349/* --- 7-bit ASCII Codec -------------------------------------------------- */
6350
6351PyObject *
6352PyUnicode_DecodeASCII(const char *s,
6353                      Py_ssize_t size,
6354                      const char *errors)
6355{
6356    const char *starts = s;
6357    PyUnicodeObject *v;
6358    Py_UNICODE *p;
6359    Py_ssize_t startinpos;
6360    Py_ssize_t endinpos;
6361    Py_ssize_t outpos;
6362    const char *e;
6363    unsigned char* d;
6364    PyObject *errorHandler = NULL;
6365    PyObject *exc = NULL;
6366    Py_ssize_t i;
6367
6368    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6369    if (size == 1 && *(unsigned char*)s < 128)
6370        return PyUnicode_FromOrdinal(*(unsigned char*)s);
6371
6372    /* Fast path. Assume the input actually *is* ASCII, and allocate
6373       a single-block Unicode object with that assumption. If there is
6374       an error, drop the object and start over. */
6375    v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6376    if (v == NULL)
6377        goto onError;
6378    d = PyUnicode_1BYTE_DATA(v);
6379    for (i = 0; i < size; i++) {
6380        unsigned char ch = ((unsigned char*)s)[i];
6381        if (ch < 128)
6382            d[i] = ch;
6383        else
6384            break;
6385    }
6386    if (i == size)
6387        return (PyObject*)v;
6388    Py_DECREF(v); /* start over */
6389
6390    v = _PyUnicode_New(size);
6391    if (v == NULL)
6392        goto onError;
6393    if (size == 0)
6394        return (PyObject *)v;
6395    p = PyUnicode_AS_UNICODE(v);
6396    e = s + size;
6397    while (s < e) {
6398        register unsigned char c = (unsigned char)*s;
6399        if (c < 128) {
6400            *p++ = c;
6401            ++s;
6402        }
6403        else {
6404            startinpos = s-starts;
6405            endinpos = startinpos + 1;
6406            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6407            if (unicode_decode_call_errorhandler(
6408                    errors, &errorHandler,
6409                    "ascii", "ordinal not in range(128)",
6410                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6411                    &v, &outpos, &p))
6412                goto onError;
6413        }
6414    }
6415    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6416        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6417            goto onError;
6418    Py_XDECREF(errorHandler);
6419    Py_XDECREF(exc);
6420    if (PyUnicode_READY(v) == -1) {
6421        Py_DECREF(v);
6422        return NULL;
6423    }
6424    return (PyObject *)v;
6425
6426  onError:
6427    Py_XDECREF(v);
6428    Py_XDECREF(errorHandler);
6429    Py_XDECREF(exc);
6430    return NULL;
6431}
6432
6433PyObject *
6434PyUnicode_EncodeASCII(const Py_UNICODE *p,
6435                      Py_ssize_t size,
6436                      const char *errors)
6437{
6438    return unicode_encode_ucs1(p, size, errors, 128);
6439}
6440
6441PyObject *
6442_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6443{
6444    if (!PyUnicode_Check(unicode)) {
6445        PyErr_BadArgument();
6446        return NULL;
6447    }
6448    if (PyUnicode_READY(unicode) == -1)
6449        return NULL;
6450    /* Fast path: if it is an ASCII-only string, construct bytes object
6451       directly. Else defer to above function to raise the exception. */
6452    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6453        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6454                                         PyUnicode_GET_LENGTH(unicode));
6455    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
6456                                 PyUnicode_GET_SIZE(unicode),
6457                                 errors);
6458}
6459
6460PyObject *
6461PyUnicode_AsASCIIString(PyObject *unicode)
6462{
6463    return _PyUnicode_AsASCIIString(unicode, NULL);
6464}
6465
6466#ifdef HAVE_MBCS
6467
6468/* --- MBCS codecs for Windows -------------------------------------------- */
6469
6470#if SIZEOF_INT < SIZEOF_SIZE_T
6471#define NEED_RETRY
6472#endif
6473
6474/* XXX This code is limited to "true" double-byte encodings, as
6475   a) it assumes an incomplete character consists of a single byte, and
6476   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
6477   encodings, see IsDBCSLeadByteEx documentation. */
6478
/* Report whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   character preceding it (as found by CharPrev) is inspected and 1 is
   returned when there is no preceding character (CharPrev did not move),
   when the preceding character does not start with a lead byte, or when
   the preceding character is exactly two bytes long; 0 in the remaining
   case. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
6490
6491/*
6492 * Decode MBCS string into unicode object. If 'final' is set, converts
6493 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6494 */
6495static int
6496decode_mbcs(PyUnicodeObject **v,
6497            const char *s, /* MBCS string */
6498            int size, /* sizeof MBCS string */
6499            int final,
6500            const char *errors)
6501{
6502    Py_UNICODE *p;
6503    Py_ssize_t n;
6504    DWORD usize;
6505    DWORD flags;
6506
6507    assert(size >= 0);
6508
6509    /* check and handle 'errors' arg */
6510    if (errors==NULL || strcmp(errors, "strict")==0)
6511        flags = MB_ERR_INVALID_CHARS;
6512    else if (strcmp(errors, "ignore")==0)
6513        flags = 0;
6514    else {
6515        PyErr_Format(PyExc_ValueError,
6516                     "mbcs encoding does not support errors='%s'",
6517                     errors);
6518        return -1;
6519    }
6520
6521    /* Skip trailing lead-byte unless 'final' is set */
6522    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
6523        --size;
6524
6525    /* First get the size of the result */
6526    if (size > 0) {
6527        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6528        if (usize==0)
6529            goto mbcs_decode_error;
6530    } else
6531        usize = 0;
6532
6533    if (*v == NULL) {
6534        /* Create unicode object */
6535        *v = _PyUnicode_New(usize);
6536        if (*v == NULL)
6537            return -1;
6538        n = 0;
6539    }
6540    else {
6541        /* Extend unicode object */
6542        n = PyUnicode_GET_SIZE(*v);
6543        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
6544            return -1;
6545    }
6546
6547    /* Do the conversion */
6548    if (usize > 0) {
6549        p = PyUnicode_AS_UNICODE(*v) + n;
6550        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6551            goto mbcs_decode_error;
6552        }
6553    }
6554    return size;
6555
6556mbcs_decode_error:
6557    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6558       we raise a UnicodeDecodeError - else it is a 'generic'
6559       windows error
6560     */
6561    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6562        /* Ideally, we should get reason from FormatMessage - this
6563           is the Windows 2000 English version of the message
6564        */
6565        PyObject *exc = NULL;
6566        const char *reason = "No mapping for the Unicode character exists "
6567                             "in the target multi-byte code page.";
6568        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6569        if (exc != NULL) {
6570            PyCodec_StrictErrors(exc);
6571            Py_DECREF(exc);
6572        }
6573    } else {
6574        PyErr_SetFromWindowsErrWithFilename(0, NULL);
6575    }
6576    return -1;
6577}
6578
6579PyObject *
6580PyUnicode_DecodeMBCSStateful(const char *s,
6581                             Py_ssize_t size,
6582                             const char *errors,
6583                             Py_ssize_t *consumed)
6584{
6585    PyUnicodeObject *v = NULL;
6586    int done;
6587
6588    if (consumed)
6589        *consumed = 0;
6590
6591#ifdef NEED_RETRY
6592  retry:
6593    if (size > INT_MAX)
6594        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
6595    else
6596#endif
6597        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
6598
6599    if (done < 0) {
6600        Py_XDECREF(v);
6601        return NULL;
6602    }
6603
6604    if (consumed)
6605        *consumed += done;
6606
6607#ifdef NEED_RETRY
6608    if (size > INT_MAX) {
6609        s += done;
6610        size -= done;
6611        goto retry;
6612    }
6613#endif
6614    if (PyUnicode_READY(v) == -1) {
6615        Py_DECREF(v);
6616        return NULL;
6617    }
6618    return (PyObject *)v;
6619}
6620
6621PyObject *
6622PyUnicode_DecodeMBCS(const char *s,
6623                     Py_ssize_t size,
6624                     const char *errors)
6625{
6626    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6627}
6628
6629/*
6630 * Convert unicode into string object (MBCS).
6631 * Returns 0 if succeed, -1 otherwise.
6632 */
6633static int
6634encode_mbcs(PyObject **repr,
6635            const Py_UNICODE *p, /* unicode */
6636            int size, /* size of unicode */
6637            const char* errors)
6638{
6639    BOOL usedDefaultChar = FALSE;
6640    BOOL *pusedDefaultChar;
6641    int mbcssize;
6642    Py_ssize_t n;
6643    PyObject *exc = NULL;
6644    DWORD flags;
6645
6646    assert(size >= 0);
6647
6648    /* check and handle 'errors' arg */
6649    if (errors==NULL || strcmp(errors, "strict")==0) {
6650        flags = WC_NO_BEST_FIT_CHARS;
6651        pusedDefaultChar = &usedDefaultChar;
6652    } else if (strcmp(errors, "replace")==0) {
6653        flags = 0;
6654        pusedDefaultChar = NULL;
6655    } else {
6656         PyErr_Format(PyExc_ValueError,
6657                      "mbcs encoding does not support errors='%s'",
6658                      errors);
6659         return -1;
6660    }
6661
6662    /* First get the size of the result */
6663    if (size > 0) {
6664        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6665                                       NULL, pusedDefaultChar);
6666        if (mbcssize == 0) {
6667            PyErr_SetFromWindowsErrWithFilename(0, NULL);
6668            return -1;
6669        }
6670        /* If we used a default char, then we failed! */
6671        if (pusedDefaultChar && *pusedDefaultChar)
6672            goto mbcs_encode_error;
6673    } else {
6674        mbcssize = 0;
6675    }
6676
6677    if (*repr == NULL) {
6678        /* Create string object */
6679        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6680        if (*repr == NULL)
6681            return -1;
6682        n = 0;
6683    }
6684    else {
6685        /* Extend string object */
6686        n = PyBytes_Size(*repr);
6687        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6688            return -1;
6689    }
6690
6691    /* Do the conversion */
6692    if (size > 0) {
6693        char *s = PyBytes_AS_STRING(*repr) + n;
6694        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6695                                     NULL, pusedDefaultChar)) {
6696            PyErr_SetFromWindowsErrWithFilename(0, NULL);
6697            return -1;
6698        }
6699        if (pusedDefaultChar && *pusedDefaultChar)
6700            goto mbcs_encode_error;
6701    }
6702    return 0;
6703
6704mbcs_encode_error:
6705    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6706    Py_XDECREF(exc);
6707    return -1;
6708}
6709
6710PyObject *
6711PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6712                     Py_ssize_t size,
6713                     const char *errors)
6714{
6715    PyObject *repr = NULL;
6716    int ret;
6717
6718#ifdef NEED_RETRY
6719  retry:
6720    if (size > INT_MAX)
6721        ret = encode_mbcs(&repr, p, INT_MAX, errors);
6722    else
6723#endif
6724        ret = encode_mbcs(&repr, p, (int)size, errors);
6725
6726    if (ret < 0) {
6727        Py_XDECREF(repr);
6728        return NULL;
6729    }
6730
6731#ifdef NEED_RETRY
6732    if (size > INT_MAX) {
6733        p += INT_MAX;
6734        size -= INT_MAX;
6735        goto retry;
6736    }
6737#endif
6738
6739    return repr;
6740}
6741
6742PyObject *
6743PyUnicode_AsMBCSString(PyObject *unicode)
6744{
6745    if (!PyUnicode_Check(unicode)) {
6746        PyErr_BadArgument();
6747        return NULL;
6748    }
6749    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
6750                                PyUnicode_GET_SIZE(unicode),
6751                                NULL);
6752}
6753
6754#undef NEED_RETRY
6755
6756#endif /* HAVE_MBCS */
6757
6758/* --- Character Mapping Codec -------------------------------------------- */
6759
6760PyObject *
6761PyUnicode_DecodeCharmap(const char *s,
6762                        Py_ssize_t size,
6763                        PyObject *mapping,
6764                        const char *errors)
6765{
6766    const char *starts = s;
6767    Py_ssize_t startinpos;
6768    Py_ssize_t endinpos;
6769    Py_ssize_t outpos;
6770    const char *e;
6771    PyUnicodeObject *v;
6772    Py_UNICODE *p;
6773    Py_ssize_t extrachars = 0;
6774    PyObject *errorHandler = NULL;
6775    PyObject *exc = NULL;
6776    Py_UNICODE *mapstring = NULL;
6777    Py_ssize_t maplen = 0;
6778
6779    /* Default to Latin-1 */
6780    if (mapping == NULL)
6781        return PyUnicode_DecodeLatin1(s, size, errors);
6782
6783    v = _PyUnicode_New(size);
6784    if (v == NULL)
6785        goto onError;
6786    if (size == 0)
6787        return (PyObject *)v;
6788    p = PyUnicode_AS_UNICODE(v);
6789    e = s + size;
6790    if (PyUnicode_CheckExact(mapping)) {
6791        mapstring = PyUnicode_AS_UNICODE(mapping);
6792        maplen = PyUnicode_GET_SIZE(mapping);
6793        while (s < e) {
6794            unsigned char ch = *s;
6795            Py_UNICODE x = 0xfffe; /* illegal value */
6796
6797            if (ch < maplen)
6798                x = mapstring[ch];
6799
6800            if (x == 0xfffe) {
6801                /* undefined mapping */
6802                outpos = p-PyUnicode_AS_UNICODE(v);
6803                startinpos = s-starts;
6804                endinpos = startinpos+1;
6805                if (unicode_decode_call_errorhandler(
6806                        errors, &errorHandler,
6807                        "charmap", "character maps to <undefined>",
6808                        &starts, &e, &startinpos, &endinpos, &exc, &s,
6809                        &v, &outpos, &p)) {
6810                    goto onError;
6811                }
6812                continue;
6813            }
6814            *p++ = x;
6815            ++s;
6816        }
6817    }
6818    else {
6819        while (s < e) {
6820            unsigned char ch = *s;
6821            PyObject *w, *x;
6822
6823            /* Get mapping (char ordinal -> integer, Unicode char or None) */
6824            w = PyLong_FromLong((long)ch);
6825            if (w == NULL)
6826                goto onError;
6827            x = PyObject_GetItem(mapping, w);
6828            Py_DECREF(w);
6829            if (x == NULL) {
6830                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6831                    /* No mapping found means: mapping is undefined. */
6832                    PyErr_Clear();
6833                    x = Py_None;
6834                    Py_INCREF(x);
6835                } else
6836                    goto onError;
6837            }
6838
6839            /* Apply mapping */
6840            if (PyLong_Check(x)) {
6841                long value = PyLong_AS_LONG(x);
6842                if (value < 0 || value > 65535) {
6843                    PyErr_SetString(PyExc_TypeError,
6844                                    "character mapping must be in range(65536)");
6845                    Py_DECREF(x);
6846                    goto onError;
6847                }
6848                *p++ = (Py_UNICODE)value;
6849            }
6850            else if (x == Py_None) {
6851                /* undefined mapping */
6852                outpos = p-PyUnicode_AS_UNICODE(v);
6853                startinpos = s-starts;
6854                endinpos = startinpos+1;
6855                if (unicode_decode_call_errorhandler(
6856                        errors, &errorHandler,
6857                        "charmap", "character maps to <undefined>",
6858                        &starts, &e, &startinpos, &endinpos, &exc, &s,
6859                        &v, &outpos, &p)) {
6860                    Py_DECREF(x);
6861                    goto onError;
6862                }
6863                Py_DECREF(x);
6864                continue;
6865            }
6866            else if (PyUnicode_Check(x)) {
6867                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
6868
6869                if (targetsize == 1)
6870                    /* 1-1 mapping */
6871                    *p++ = *PyUnicode_AS_UNICODE(x);
6872
6873                else if (targetsize > 1) {
6874                    /* 1-n mapping */
6875                    if (targetsize > extrachars) {
6876                        /* resize first */
6877                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6878                        Py_ssize_t needed = (targetsize - extrachars) + \
6879                            (targetsize << 2);
6880                        extrachars += needed;
6881                        /* XXX overflow detection missing */
6882                        if (PyUnicode_Resize((PyObject**)&v,
6883                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
6884                            Py_DECREF(x);
6885                            goto onError;
6886                        }
6887                        p = PyUnicode_AS_UNICODE(v) + oldpos;
6888                    }
6889                    Py_UNICODE_COPY(p,
6890                                    PyUnicode_AS_UNICODE(x),
6891                                    targetsize);
6892                    p += targetsize;
6893                    extrachars -= targetsize;
6894                }
6895                /* 1-0 mapping: skip the character */
6896            }
6897            else {
6898                /* wrong return value */
6899                PyErr_SetString(PyExc_TypeError,
6900                                "character mapping must return integer, None or str");
6901                Py_DECREF(x);
6902                goto onError;
6903            }
6904            Py_DECREF(x);
6905            ++s;
6906        }
6907    }
6908    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6909        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6910            goto onError;
6911    Py_XDECREF(errorHandler);
6912    Py_XDECREF(exc);
6913    if (PyUnicode_READY(v) == -1) {
6914        Py_DECREF(v);
6915        return NULL;
6916    }
6917    return (PyObject *)v;
6918
6919  onError:
6920    Py_XDECREF(errorHandler);
6921    Py_XDECREF(exc);
6922    Py_XDECREF(v);
6923    return NULL;
6924}
6925
6926/* Charmap encoding: the lookup table */
6927
6928struct encoding_map {
6929    PyObject_HEAD
6930    unsigned char level1[32];
6931    int count2, count3;
6932    unsigned char level23[1];
6933};
6934
6935static PyObject*
6936encoding_map_size(PyObject *obj, PyObject* args)
6937{
6938    struct encoding_map *map = (struct encoding_map*)obj;
6939    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
6940                           128*map->count3);
6941}
6942
6943static PyMethodDef encoding_map_methods[] = {
6944    {"size", encoding_map_size, METH_NOARGS,
6945     PyDoc_STR("Return the size (in bytes) of this object") },
6946    { 0 }
6947};
6948
6949static void
6950encoding_map_dealloc(PyObject* o)
6951{
6952    PyObject_FREE(o);
6953}
6954
6955static PyTypeObject EncodingMapType = {
6956    PyVarObject_HEAD_INIT(NULL, 0)
6957    "EncodingMap",          /*tp_name*/
6958    sizeof(struct encoding_map),   /*tp_basicsize*/
6959    0,                      /*tp_itemsize*/
6960    /* methods */
6961    encoding_map_dealloc,   /*tp_dealloc*/
6962    0,                      /*tp_print*/
6963    0,                      /*tp_getattr*/
6964    0,                      /*tp_setattr*/
6965    0,                      /*tp_reserved*/
6966    0,                      /*tp_repr*/
6967    0,                      /*tp_as_number*/
6968    0,                      /*tp_as_sequence*/
6969    0,                      /*tp_as_mapping*/
6970    0,                      /*tp_hash*/
6971    0,                      /*tp_call*/
6972    0,                      /*tp_str*/
6973    0,                      /*tp_getattro*/
6974    0,                      /*tp_setattro*/
6975    0,                      /*tp_as_buffer*/
6976    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
6977    0,                      /*tp_doc*/
6978    0,                      /*tp_traverse*/
6979    0,                      /*tp_clear*/
6980    0,                      /*tp_richcompare*/
6981    0,                      /*tp_weaklistoffset*/
6982    0,                      /*tp_iter*/
6983    0,                      /*tp_iternext*/
6984    encoding_map_methods,   /*tp_methods*/
6985    0,                      /*tp_members*/
6986    0,                      /*tp_getset*/
6987    0,                      /*tp_base*/
6988    0,                      /*tp_dict*/
6989    0,                      /*tp_descr_get*/
6990    0,                      /*tp_descr_set*/
6991    0,                      /*tp_dictoffset*/
6992    0,                      /*tp_init*/
6993    0,                      /*tp_alloc*/
6994    0,                      /*tp_new*/
6995    0,                      /*tp_free*/
6996    0,                      /*tp_is_gc*/
6997};
6998
6999PyObject*
7000PyUnicode_BuildEncodingMap(PyObject* string)
7001{
7002    PyObject *result;
7003    struct encoding_map *mresult;
7004    int i;
7005    int need_dict = 0;
7006    unsigned char level1[32];
7007    unsigned char level2[512];
7008    unsigned char *mlevel1, *mlevel2, *mlevel3;
7009    int count2 = 0, count3 = 0;
7010    int kind;
7011    void *data;
7012    Py_UCS4 ch;
7013
7014    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7015        PyErr_BadArgument();
7016        return NULL;
7017    }
7018    kind = PyUnicode_KIND(string);
7019    data = PyUnicode_DATA(string);
7020    memset(level1, 0xFF, sizeof level1);
7021    memset(level2, 0xFF, sizeof level2);
7022
7023    /* If there isn't a one-to-one mapping of NULL to \0,
7024       or if there are non-BMP characters, we need to use
7025       a mapping dictionary. */
7026    if (PyUnicode_READ(kind, data, 0) != 0)
7027        need_dict = 1;
7028    for (i = 1; i < 256; i++) {
7029        int l1, l2;
7030        ch = PyUnicode_READ(kind, data, i);
7031        if (ch == 0 || ch > 0xFFFF) {
7032            need_dict = 1;
7033            break;
7034        }
7035        if (ch == 0xFFFE)
7036            /* unmapped character */
7037            continue;
7038        l1 = ch >> 11;
7039        l2 = ch >> 7;
7040        if (level1[l1] == 0xFF)
7041            level1[l1] = count2++;
7042        if (level2[l2] == 0xFF)
7043            level2[l2] = count3++;
7044    }
7045
7046    if (count2 >= 0xFF || count3 >= 0xFF)
7047        need_dict = 1;
7048
7049    if (need_dict) {
7050        PyObject *result = PyDict_New();
7051        PyObject *key, *value;
7052        if (!result)
7053            return NULL;
7054        for (i = 0; i < 256; i++) {
7055            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7056            value = PyLong_FromLong(i);
7057            if (!key || !value)
7058                goto failed1;
7059            if (PyDict_SetItem(result, key, value) == -1)
7060                goto failed1;
7061            Py_DECREF(key);
7062            Py_DECREF(value);
7063        }
7064        return result;
7065      failed1:
7066        Py_XDECREF(key);
7067        Py_XDECREF(value);
7068        Py_DECREF(result);
7069        return NULL;
7070    }
7071
7072    /* Create a three-level trie */
7073    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7074                             16*count2 + 128*count3 - 1);
7075    if (!result)
7076        return PyErr_NoMemory();
7077    PyObject_Init(result, &EncodingMapType);
7078    mresult = (struct encoding_map*)result;
7079    mresult->count2 = count2;
7080    mresult->count3 = count3;
7081    mlevel1 = mresult->level1;
7082    mlevel2 = mresult->level23;
7083    mlevel3 = mresult->level23 + 16*count2;
7084    memcpy(mlevel1, level1, 32);
7085    memset(mlevel2, 0xFF, 16*count2);
7086    memset(mlevel3, 0, 128*count3);
7087    count3 = 0;
7088    for (i = 1; i < 256; i++) {
7089        int o1, o2, o3, i2, i3;
7090        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7091            /* unmapped character */
7092            continue;
7093        o1 = PyUnicode_READ(kind, data, i)>>11;
7094        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7095        i2 = 16*mlevel1[o1] + o2;
7096        if (mlevel2[i2] == 0xFF)
7097            mlevel2[i2] = count3++;
7098        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7099        i3 = 128*mlevel2[i2] + o3;
7100        mlevel3[i3] = i;
7101    }
7102    return result;
7103}
7104
7105static int
7106encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7107{
7108    struct encoding_map *map = (struct encoding_map*)mapping;
7109    int l1 = c>>11;
7110    int l2 = (c>>7) & 0xF;
7111    int l3 = c & 0x7F;
7112    int i;
7113
7114#ifdef Py_UNICODE_WIDE
7115    if (c > 0xFFFF) {
7116        return -1;
7117    }
7118#endif
7119    if (c == 0)
7120        return 0;
7121    /* level 1*/
7122    i = map->level1[l1];
7123    if (i == 0xFF) {
7124        return -1;
7125    }
7126    /* level 2*/
7127    i = map->level23[16*i+l2];
7128    if (i == 0xFF) {
7129        return -1;
7130    }
7131    /* level 3 */
7132    i = map->level23[16*map->count2 + 128*i + l3];
7133    if (i == 0) {
7134        return -1;
7135    }
7136    return i;
7137}
7138
7139/* Lookup the character ch in the mapping. If the character
7140   can't be found, Py_None is returned (or NULL, if another
7141   error occurred). */
7142static PyObject *
7143charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
7144{
7145    PyObject *w = PyLong_FromLong((long)c);
7146    PyObject *x;
7147
7148    if (w == NULL)
7149        return NULL;
7150    x = PyObject_GetItem(mapping, w);
7151    Py_DECREF(w);
7152    if (x == NULL) {
7153        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7154            /* No mapping found means: mapping is undefined. */
7155            PyErr_Clear();
7156            x = Py_None;
7157            Py_INCREF(x);
7158            return x;
7159        } else
7160            return NULL;
7161    }
7162    else if (x == Py_None)
7163        return x;
7164    else if (PyLong_Check(x)) {
7165        long value = PyLong_AS_LONG(x);
7166        if (value < 0 || value > 255) {
7167            PyErr_SetString(PyExc_TypeError,
7168                            "character mapping must be in range(256)");
7169            Py_DECREF(x);
7170            return NULL;
7171        }
7172        return x;
7173    }
7174    else if (PyBytes_Check(x))
7175        return x;
7176    else {
7177        /* wrong return value */
7178        PyErr_Format(PyExc_TypeError,
7179                     "character mapping must return integer, bytes or None, not %.400s",
7180                     x->ob_type->tp_name);
7181        Py_DECREF(x);
7182        return NULL;
7183    }
7184}
7185
7186static int
7187charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7188{
7189    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7190    /* exponentially overallocate to minimize reallocations */
7191    if (requiredsize < 2*outsize)
7192        requiredsize = 2*outsize;
7193    if (_PyBytes_Resize(outobj, requiredsize))
7194        return -1;
7195    return 0;
7196}
7197
/* Outcome of trying to emit one character through a charmap
   (see charmapencode_output). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
7201/* lookup the character, put the result in the output string and adjust
7202   various state variables. Resize the output bytes object if not enough
7203   space is available. Return a new reference to the object that
7204   was put in the output buffer, or Py_None, if the mapping was undefined
7205   (in which case no character was written) or NULL, if a
7206   reallocation error occurred. The caller must decref the result */
7207static charmapencode_result
7208charmapencode_output(Py_UNICODE c, PyObject *mapping,
7209                     PyObject **outobj, Py_ssize_t *outpos)
7210{
7211    PyObject *rep;
7212    char *outstart;
7213    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7214
7215    if (Py_TYPE(mapping) == &EncodingMapType) {
7216        int res = encoding_map_lookup(c, mapping);
7217        Py_ssize_t requiredsize = *outpos+1;
7218        if (res == -1)
7219            return enc_FAILED;
7220        if (outsize<requiredsize)
7221            if (charmapencode_resize(outobj, outpos, requiredsize))
7222                return enc_EXCEPTION;
7223        outstart = PyBytes_AS_STRING(*outobj);
7224        outstart[(*outpos)++] = (char)res;
7225        return enc_SUCCESS;
7226    }
7227
7228    rep = charmapencode_lookup(c, mapping);
7229    if (rep==NULL)
7230        return enc_EXCEPTION;
7231    else if (rep==Py_None) {
7232        Py_DECREF(rep);
7233        return enc_FAILED;
7234    } else {
7235        if (PyLong_Check(rep)) {
7236            Py_ssize_t requiredsize = *outpos+1;
7237            if (outsize<requiredsize)
7238                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7239                    Py_DECREF(rep);
7240                    return enc_EXCEPTION;
7241                }
7242            outstart = PyBytes_AS_STRING(*outobj);
7243            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7244        }
7245        else {
7246            const char *repchars = PyBytes_AS_STRING(rep);
7247            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7248            Py_ssize_t requiredsize = *outpos+repsize;
7249            if (outsize<requiredsize)
7250                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7251                    Py_DECREF(rep);
7252                    return enc_EXCEPTION;
7253                }
7254            outstart = PyBytes_AS_STRING(*outobj);
7255            memcpy(outstart + *outpos, repchars, repsize);
7256            *outpos += repsize;
7257        }
7258    }
7259    Py_DECREF(rep);
7260    return enc_SUCCESS;
7261}
7262
7263/* handle an error in PyUnicode_EncodeCharmap
7264   Return 0 on success, -1 on error */
7265static int
7266charmap_encoding_error(
7267    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
7268    PyObject **exceptionObject,
7269    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7270    PyObject **res, Py_ssize_t *respos)
7271{
7272    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7273    Py_ssize_t repsize;
7274    Py_ssize_t newpos;
7275    Py_UNICODE *uni2;
7276    /* startpos for collecting unencodable chars */
7277    Py_ssize_t collstartpos = *inpos;
7278    Py_ssize_t collendpos = *inpos+1;
7279    Py_ssize_t collpos;
7280    char *encoding = "charmap";
7281    char *reason = "character maps to <undefined>";
7282    charmapencode_result x;
7283
7284    /* find all unencodable characters */
7285    while (collendpos < size) {
7286        PyObject *rep;
7287        if (Py_TYPE(mapping) == &EncodingMapType) {
7288            int res = encoding_map_lookup(p[collendpos], mapping);
7289            if (res != -1)
7290                break;
7291            ++collendpos;
7292            continue;
7293        }
7294
7295        rep = charmapencode_lookup(p[collendpos], mapping);
7296        if (rep==NULL)
7297            return -1;
7298        else if (rep!=Py_None) {
7299            Py_DECREF(rep);
7300            break;
7301        }
7302        Py_DECREF(rep);
7303        ++collendpos;
7304    }
7305    /* cache callback name lookup
7306     * (if not done yet, i.e. it's the first error) */
7307    if (*known_errorHandler==-1) {
7308        if ((errors==NULL) || (!strcmp(errors, "strict")))
7309            *known_errorHandler = 1;
7310        else if (!strcmp(errors, "replace"))
7311            *known_errorHandler = 2;
7312        else if (!strcmp(errors, "ignore"))
7313            *known_errorHandler = 3;
7314        else if (!strcmp(errors, "xmlcharrefreplace"))
7315            *known_errorHandler = 4;
7316        else
7317            *known_errorHandler = 0;
7318    }
7319    switch (*known_errorHandler) {
7320    case 1: /* strict */
7321        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7322        return -1;
7323    case 2: /* replace */
7324        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
7325            x = charmapencode_output('?', mapping, res, respos);
7326            if (x==enc_EXCEPTION) {
7327                return -1;
7328            }
7329            else if (x==enc_FAILED) {
7330                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7331                return -1;
7332            }
7333        }
7334        /* fall through */
7335    case 3: /* ignore */
7336        *inpos = collendpos;
7337        break;
7338    case 4: /* xmlcharrefreplace */
7339        /* generate replacement (temporarily (mis)uses p) */
7340        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
7341            char buffer[2+29+1+1];
7342            char *cp;
7343            sprintf(buffer, "&#%d;", (int)p[collpos]);
7344            for (cp = buffer; *cp; ++cp) {
7345                x = charmapencode_output(*cp, mapping, res, respos);
7346                if (x==enc_EXCEPTION)
7347                    return -1;
7348                else if (x==enc_FAILED) {
7349                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7350                    return -1;
7351                }
7352            }
7353        }
7354        *inpos = collendpos;
7355        break;
7356    default:
7357        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
7358                                                      encoding, reason, p, size, exceptionObject,
7359                                                      collstartpos, collendpos, &newpos);
7360        if (repunicode == NULL)
7361            return -1;
7362        if (PyBytes_Check(repunicode)) {
7363            /* Directly copy bytes result to output. */
7364            Py_ssize_t outsize = PyBytes_Size(*res);
7365            Py_ssize_t requiredsize;
7366            repsize = PyBytes_Size(repunicode);
7367            requiredsize = *respos + repsize;
7368            if (requiredsize > outsize)
7369                /* Make room for all additional bytes. */
7370                if (charmapencode_resize(res, respos, requiredsize)) {
7371                    Py_DECREF(repunicode);
7372                    return -1;
7373                }
7374            memcpy(PyBytes_AsString(*res) + *respos,
7375                   PyBytes_AsString(repunicode),  repsize);
7376            *respos += repsize;
7377            *inpos = newpos;
7378            Py_DECREF(repunicode);
7379            break;
7380        }
7381        /* generate replacement  */
7382        repsize = PyUnicode_GET_SIZE(repunicode);
7383        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7384            x = charmapencode_output(*uni2, mapping, res, respos);
7385            if (x==enc_EXCEPTION) {
7386                return -1;
7387            }
7388            else if (x==enc_FAILED) {
7389                Py_DECREF(repunicode);
7390                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7391                return -1;
7392            }
7393        }
7394        *inpos = newpos;
7395        Py_DECREF(repunicode);
7396    }
7397    return 0;
7398}
7399
7400PyObject *
7401PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7402                        Py_ssize_t size,
7403                        PyObject *mapping,
7404                        const char *errors)
7405{
7406    /* output object */
7407    PyObject *res = NULL;
7408    /* current input position */
7409    Py_ssize_t inpos = 0;
7410    /* current output position */
7411    Py_ssize_t respos = 0;
7412    PyObject *errorHandler = NULL;
7413    PyObject *exc = NULL;
7414    /* the following variable is used for caching string comparisons
7415     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7416     * 3=ignore, 4=xmlcharrefreplace */
7417    int known_errorHandler = -1;
7418
7419    /* Default to Latin-1 */
7420    if (mapping == NULL)
7421        return PyUnicode_EncodeLatin1(p, size, errors);
7422
7423    /* allocate enough for a simple encoding without
7424       replacements, if we need more, we'll resize */
7425    res = PyBytes_FromStringAndSize(NULL, size);
7426    if (res == NULL)
7427        goto onError;
7428    if (size == 0)
7429        return res;
7430
7431    while (inpos<size) {
7432        /* try to encode it */
7433        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7434        if (x==enc_EXCEPTION) /* error */
7435            goto onError;
7436        if (x==enc_FAILED) { /* unencodable character */
7437            if (charmap_encoding_error(p, size, &inpos, mapping,
7438                                       &exc,
7439                                       &known_errorHandler, &errorHandler, errors,
7440                                       &res, &respos)) {
7441                goto onError;
7442            }
7443        }
7444        else
7445            /* done with this character => adjust input position */
7446            ++inpos;
7447    }
7448
7449    /* Resize if we allocated to much */
7450    if (respos<PyBytes_GET_SIZE(res))
7451        if (_PyBytes_Resize(&res, respos) < 0)
7452            goto onError;
7453
7454    Py_XDECREF(exc);
7455    Py_XDECREF(errorHandler);
7456    return res;
7457
7458  onError:
7459    Py_XDECREF(res);
7460    Py_XDECREF(exc);
7461    Py_XDECREF(errorHandler);
7462    return NULL;
7463}
7464
7465PyObject *
7466PyUnicode_AsCharmapString(PyObject *unicode,
7467                          PyObject *mapping)
7468{
7469    if (!PyUnicode_Check(unicode) || mapping == NULL) {
7470        PyErr_BadArgument();
7471        return NULL;
7472    }
7473    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
7474                                   PyUnicode_GET_SIZE(unicode),
7475                                   mapping,
7476                                   NULL);
7477}
7478
7479/* create or adjust a UnicodeTranslateError */
7480static void
7481make_translate_exception(PyObject **exceptionObject,
7482                         PyObject *unicode,
7483                         Py_ssize_t startpos, Py_ssize_t endpos,
7484                         const char *reason)
7485{
7486    if (*exceptionObject == NULL) {
7487        *exceptionObject = _PyUnicodeTranslateError_Create(
7488            unicode, startpos, endpos, reason);
7489    }
7490    else {
7491        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7492            goto onError;
7493        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7494            goto onError;
7495        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7496            goto onError;
7497        return;
7498      onError:
7499        Py_DECREF(*exceptionObject);
7500        *exceptionObject = NULL;
7501    }
7502}
7503
7504/* raises a UnicodeTranslateError */
7505static void
7506raise_translate_exception(PyObject **exceptionObject,
7507                          PyObject *unicode,
7508                          Py_ssize_t startpos, Py_ssize_t endpos,
7509                          const char *reason)
7510{
7511    make_translate_exception(exceptionObject,
7512                             unicode, startpos, endpos, reason);
7513    if (*exceptionObject != NULL)
7514        PyCodec_StrictErrors(*exceptionObject);
7515}
7516
7517/* error handling callback helper:
7518   build arguments, call the callback and check the arguments,
7519   put the result into newpos and return the replacement string, which
7520   has to be freed by the caller */
7521static PyObject *
7522unicode_translate_call_errorhandler(const char *errors,
7523                                    PyObject **errorHandler,
7524                                    const char *reason,
7525                                    PyObject *unicode, PyObject **exceptionObject,
7526                                    Py_ssize_t startpos, Py_ssize_t endpos,
7527                                    Py_ssize_t *newpos)
7528{
7529    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
7530
7531    Py_ssize_t i_newpos;
7532    PyObject *restuple;
7533    PyObject *resunicode;
7534
7535    if (*errorHandler == NULL) {
7536        *errorHandler = PyCodec_LookupError(errors);
7537        if (*errorHandler == NULL)
7538            return NULL;
7539    }
7540
7541    make_translate_exception(exceptionObject,
7542                             unicode, startpos, endpos, reason);
7543    if (*exceptionObject == NULL)
7544        return NULL;
7545
7546    restuple = PyObject_CallFunctionObjArgs(
7547        *errorHandler, *exceptionObject, NULL);
7548    if (restuple == NULL)
7549        return NULL;
7550    if (!PyTuple_Check(restuple)) {
7551        PyErr_SetString(PyExc_TypeError, &argparse[4]);
7552        Py_DECREF(restuple);
7553        return NULL;
7554    }
7555    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
7556                          &resunicode, &i_newpos)) {
7557        Py_DECREF(restuple);
7558        return NULL;
7559    }
7560    if (i_newpos<0)
7561        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
7562    else
7563        *newpos = i_newpos;
7564    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
7565        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7566        Py_DECREF(restuple);
7567        return NULL;
7568    }
7569    Py_INCREF(resunicode);
7570    Py_DECREF(restuple);
7571    return resunicode;
7572}
7573
7574/* Lookup the character ch in the mapping and put the result in result,
7575   which must be decrefed by the caller.
7576   Return 0 on success, -1 on error */
7577static int
7578charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
7579{
7580    PyObject *w = PyLong_FromLong((long)c);
7581    PyObject *x;
7582
7583    if (w == NULL)
7584        return -1;
7585    x = PyObject_GetItem(mapping, w);
7586    Py_DECREF(w);
7587    if (x == NULL) {
7588        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7589            /* No mapping found means: use 1:1 mapping. */
7590            PyErr_Clear();
7591            *result = NULL;
7592            return 0;
7593        } else
7594            return -1;
7595    }
7596    else if (x == Py_None) {
7597        *result = x;
7598        return 0;
7599    }
7600    else if (PyLong_Check(x)) {
7601        long value = PyLong_AS_LONG(x);
7602        long max = PyUnicode_GetMax();
7603        if (value < 0 || value > max) {
7604            PyErr_Format(PyExc_TypeError,
7605                         "character mapping must be in range(0x%x)", max+1);
7606            Py_DECREF(x);
7607            return -1;
7608        }
7609        *result = x;
7610        return 0;
7611    }
7612    else if (PyUnicode_Check(x)) {
7613        *result = x;
7614        return 0;
7615    }
7616    else {
7617        /* wrong return value */
7618        PyErr_SetString(PyExc_TypeError,
7619                        "character mapping must return integer, None or str");
7620        Py_DECREF(x);
7621        return -1;
7622    }
7623}
7624/* ensure that *outobj is at least requiredsize characters long,
7625   if not reallocate and adjust various state variables.
7626   Return 0 on success, -1 on error */
7627static int
7628charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
7629                               Py_ssize_t requiredsize)
7630{
7631    Py_ssize_t oldsize = *psize;
7632    if (requiredsize > oldsize) {
7633        /* exponentially overallocate to minimize reallocations */
7634        if (requiredsize < 2 * oldsize)
7635            requiredsize = 2 * oldsize;
7636        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7637        if (*outobj == 0)
7638            return -1;
7639        *psize = requiredsize;
7640    }
7641    return 0;
7642}
7643/* lookup the character, put the result in the output string and adjust
7644   various state variables. Return a new reference to the object that
7645   was put in the output buffer in *result, or Py_None, if the mapping was
7646   undefined (in which case no character was written).
7647   The called must decref result.
7648   Return 0 on success, -1 on error. */
7649static int
7650charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7651                        PyObject *mapping, Py_UCS4 **output,
7652                        Py_ssize_t *osize, Py_ssize_t *opos,
7653                        PyObject **res)
7654{
7655    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7656    if (charmaptranslate_lookup(curinp, mapping, res))
7657        return -1;
7658    if (*res==NULL) {
7659        /* not found => default to 1:1 mapping */
7660        (*output)[(*opos)++] = curinp;
7661    }
7662    else if (*res==Py_None)
7663        ;
7664    else if (PyLong_Check(*res)) {
7665        /* no overflow check, because we know that the space is enough */
7666        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
7667    }
7668    else if (PyUnicode_Check(*res)) {
7669        Py_ssize_t repsize;
7670        if (PyUnicode_READY(*res) == -1)
7671            return -1;
7672        repsize = PyUnicode_GET_LENGTH(*res);
7673        if (repsize==1) {
7674            /* no overflow check, because we know that the space is enough */
7675            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
7676        }
7677        else if (repsize!=0) {
7678            /* more than one character */
7679            Py_ssize_t requiredsize = *opos +
7680                (PyUnicode_GET_LENGTH(input) - ipos) +
7681                repsize - 1;
7682            Py_ssize_t i;
7683            if (charmaptranslate_makespace(output, osize, requiredsize))
7684                return -1;
7685            for(i = 0; i < repsize; i++)
7686                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
7687        }
7688    }
7689    else
7690        return -1;
7691    return 0;
7692}
7693
7694PyObject *
7695_PyUnicode_TranslateCharmap(PyObject *input,
7696                            PyObject *mapping,
7697                            const char *errors)
7698{
7699    /* input object */
7700    char *idata;
7701    Py_ssize_t size, i;
7702    int kind;
7703    /* output buffer */
7704    Py_UCS4 *output = NULL;
7705    Py_ssize_t osize;
7706    PyObject *res;
7707    /* current output position */
7708    Py_ssize_t opos;
7709    char *reason = "character maps to <undefined>";
7710    PyObject *errorHandler = NULL;
7711    PyObject *exc = NULL;
7712    /* the following variable is used for caching string comparisons
7713     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7714     * 3=ignore, 4=xmlcharrefreplace */
7715    int known_errorHandler = -1;
7716
7717    if (mapping == NULL) {
7718        PyErr_BadArgument();
7719        return NULL;
7720    }
7721
7722    if (PyUnicode_READY(input) == -1)
7723        return NULL;
7724    idata = (char*)PyUnicode_DATA(input);
7725    kind = PyUnicode_KIND(input);
7726    size = PyUnicode_GET_LENGTH(input);
7727    i = 0;
7728
7729    if (size == 0) {
7730        Py_INCREF(input);
7731        return input;
7732    }
7733
7734    /* allocate enough for a simple 1:1 translation without
7735       replacements, if we need more, we'll resize */
7736    osize = size;
7737    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7738    opos = 0;
7739    if (output == NULL) {
7740        PyErr_NoMemory();
7741        goto onError;
7742    }
7743
7744    while (i<size) {
7745        /* try to encode it */
7746        PyObject *x = NULL;
7747        if (charmaptranslate_output(input, i, mapping,
7748                                    &output, &osize, &opos, &x)) {
7749            Py_XDECREF(x);
7750            goto onError;
7751        }
7752        Py_XDECREF(x);
7753        if (x!=Py_None) /* it worked => adjust input pointer */
7754            ++i;
7755        else { /* untranslatable character */
7756            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7757            Py_ssize_t repsize;
7758            Py_ssize_t newpos;
7759            Py_ssize_t uni2;
7760            /* startpos for collecting untranslatable chars */
7761            Py_ssize_t collstart = i;
7762            Py_ssize_t collend = i+1;
7763            Py_ssize_t coll;
7764
7765            /* find all untranslatable characters */
7766            while (collend < size) {
7767                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
7768                    goto onError;
7769                Py_XDECREF(x);
7770                if (x!=Py_None)
7771                    break;
7772                ++collend;
7773            }
7774            /* cache callback name lookup
7775             * (if not done yet, i.e. it's the first error) */
7776            if (known_errorHandler==-1) {
7777                if ((errors==NULL) || (!strcmp(errors, "strict")))
7778                    known_errorHandler = 1;
7779                else if (!strcmp(errors, "replace"))
7780                    known_errorHandler = 2;
7781                else if (!strcmp(errors, "ignore"))
7782                    known_errorHandler = 3;
7783                else if (!strcmp(errors, "xmlcharrefreplace"))
7784                    known_errorHandler = 4;
7785                else
7786                    known_errorHandler = 0;
7787            }
7788            switch (known_errorHandler) {
7789            case 1: /* strict */
7790                raise_translate_exception(&exc, input, collstart,
7791                                          collend, reason);
7792                goto onError;
7793            case 2: /* replace */
7794                /* No need to check for space, this is a 1:1 replacement */
7795                for (coll = collstart; coll<collend; coll++)
7796                    output[opos++] = '?';
7797                /* fall through */
7798            case 3: /* ignore */
7799                i = collend;
7800                break;
7801            case 4: /* xmlcharrefreplace */
7802                /* generate replacement (temporarily (mis)uses i) */
7803                for (i = collstart; i < collend; ++i) {
7804                    char buffer[2+29+1+1];
7805                    char *cp;
7806                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7807                    if (charmaptranslate_makespace(&output, &osize,
7808                                                   opos+strlen(buffer)+(size-collend)))
7809                        goto onError;
7810                    for (cp = buffer; *cp; ++cp)
7811                        output[opos++] = *cp;
7812                }
7813                i = collend;
7814                break;
7815            default:
7816                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
7817                                                                 reason, input, &exc,
7818                                                                 collstart, collend, &newpos);
7819                if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
7820                    goto onError;
7821                /* generate replacement  */
7822                repsize = PyUnicode_GET_LENGTH(repunicode);
7823                if (charmaptranslate_makespace(&output, &osize,
7824                                               opos+repsize+(size-collend))) {
7825                    Py_DECREF(repunicode);
7826                    goto onError;
7827                }
7828                for (uni2 = 0; repsize-->0; ++uni2)
7829                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7830                i = newpos;
7831                Py_DECREF(repunicode);
7832            }
7833        }
7834    }
7835    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7836    if (!res)
7837        goto onError;
7838    PyMem_Free(output);
7839    Py_XDECREF(exc);
7840    Py_XDECREF(errorHandler);
7841    return res;
7842
7843  onError:
7844    PyMem_Free(output);
7845    Py_XDECREF(exc);
7846    Py_XDECREF(errorHandler);
7847    return NULL;
7848}
7849
7850/* Deprecated. Use PyUnicode_Translate instead. */
7851PyObject *
7852PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7853                           Py_ssize_t size,
7854                           PyObject *mapping,
7855                           const char *errors)
7856{
7857    PyObject *unicode = PyUnicode_FromUnicode(p, size);
7858    if (!unicode)
7859        return NULL;
7860    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7861}
7862
7863PyObject *
7864PyUnicode_Translate(PyObject *str,
7865                    PyObject *mapping,
7866                    const char *errors)
7867{
7868    PyObject *result;
7869
7870    str = PyUnicode_FromObject(str);
7871    if (str == NULL)
7872        goto onError;
7873    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
7874    Py_DECREF(str);
7875    return result;
7876
7877  onError:
7878    Py_XDECREF(str);
7879    return NULL;
7880}
7881
7882static Py_UCS4
7883fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7884{
7885    /* No need to call PyUnicode_READY(self) because this function is only
7886       called as a callback from fixup() which does it already. */
7887    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7888    const int kind = PyUnicode_KIND(self);
7889    void *data = PyUnicode_DATA(self);
7890    Py_UCS4 maxchar = 0, ch, fixed;
7891    Py_ssize_t i;
7892
7893    for (i = 0; i < len; ++i) {
7894        ch = PyUnicode_READ(kind, data, i);
7895        fixed = 0;
7896        if (ch > 127) {
7897            if (Py_UNICODE_ISSPACE(ch))
7898                fixed = ' ';
7899            else {
7900                const int decimal = Py_UNICODE_TODECIMAL(ch);
7901                if (decimal >= 0)
7902                    fixed = '0' + decimal;
7903            }
7904            if (fixed != 0) {
7905                if (fixed > maxchar)
7906                    maxchar = fixed;
7907                PyUnicode_WRITE(kind, data, i, fixed);
7908            }
7909            else if (ch > maxchar)
7910                maxchar = ch;
7911        }
7912        else if (ch > maxchar)
7913            maxchar = ch;
7914    }
7915
7916    return maxchar;
7917}
7918
7919PyObject *
7920_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7921{
7922    if (!PyUnicode_Check(unicode)) {
7923        PyErr_BadInternalCall();
7924        return NULL;
7925    }
7926    if (PyUnicode_READY(unicode) == -1)
7927        return NULL;
7928    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7929        /* If the string is already ASCII, just return the same string */
7930        Py_INCREF(unicode);
7931        return unicode;
7932    }
7933    return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7934}
7935
7936PyObject *
7937PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7938                                  Py_ssize_t length)
7939{
7940    PyObject *result;
7941    Py_UNICODE *p; /* write pointer into result */
7942    Py_ssize_t i;
7943    /* Copy to a new string */
7944    result = (PyObject *)_PyUnicode_New(length);
7945    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7946    if (result == NULL)
7947        return result;
7948    p = PyUnicode_AS_UNICODE(result);
7949    /* Iterate over code points */
7950    for (i = 0; i < length; i++) {
7951        Py_UNICODE ch =s[i];
7952        if (ch > 127) {
7953            int decimal = Py_UNICODE_TODECIMAL(ch);
7954            if (decimal >= 0)
7955                p[i] = '0' + decimal;
7956        }
7957    }
7958    if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7959        Py_DECREF(result);
7960        return NULL;
7961    }
7962    return result;
7963}
7964/* --- Decimal Encoder ---------------------------------------------------- */
7965
7966int
7967PyUnicode_EncodeDecimal(Py_UNICODE *s,
7968                        Py_ssize_t length,
7969                        char *output,
7970                        const char *errors)
7971{
7972    Py_UNICODE *p, *end;
7973    PyObject *errorHandler = NULL;
7974    PyObject *exc = NULL;
7975    const char *encoding = "decimal";
7976    const char *reason = "invalid decimal Unicode string";
7977    /* the following variable is used for caching string comparisons
7978     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7979    int known_errorHandler = -1;
7980
7981    if (output == NULL) {
7982        PyErr_BadArgument();
7983        return -1;
7984    }
7985
7986    p = s;
7987    end = s + length;
7988    while (p < end) {
7989        register Py_UNICODE ch = *p;
7990        int decimal;
7991        PyObject *repunicode;
7992        Py_ssize_t repsize;
7993        Py_ssize_t newpos;
7994        Py_UNICODE *uni2;
7995        Py_UNICODE *collstart;
7996        Py_UNICODE *collend;
7997
7998        if (Py_UNICODE_ISSPACE(ch)) {
7999            *output++ = ' ';
8000            ++p;
8001            continue;
8002        }
8003        decimal = Py_UNICODE_TODECIMAL(ch);
8004        if (decimal >= 0) {
8005            *output++ = '0' + decimal;
8006            ++p;
8007            continue;
8008        }
8009        if (0 < ch && ch < 256) {
8010            *output++ = (char)ch;
8011            ++p;
8012            continue;
8013        }
8014        /* All other characters are considered unencodable */
8015        collstart = p;
8016        collend = p+1;
8017        while (collend < end) {
8018            if ((0 < *collend && *collend < 256) ||
8019                !Py_UNICODE_ISSPACE(*collend) ||
8020                Py_UNICODE_TODECIMAL(*collend))
8021                break;
8022        }
8023        /* cache callback name lookup
8024         * (if not done yet, i.e. it's the first error) */
8025        if (known_errorHandler==-1) {
8026            if ((errors==NULL) || (!strcmp(errors, "strict")))
8027                known_errorHandler = 1;
8028            else if (!strcmp(errors, "replace"))
8029                known_errorHandler = 2;
8030            else if (!strcmp(errors, "ignore"))
8031                known_errorHandler = 3;
8032            else if (!strcmp(errors, "xmlcharrefreplace"))
8033                known_errorHandler = 4;
8034            else
8035                known_errorHandler = 0;
8036        }
8037        switch (known_errorHandler) {
8038        case 1: /* strict */
8039            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8040            goto onError;
8041        case 2: /* replace */
8042            for (p = collstart; p < collend; ++p)
8043                *output++ = '?';
8044            /* fall through */
8045        case 3: /* ignore */
8046            p = collend;
8047            break;
8048        case 4: /* xmlcharrefreplace */
8049            /* generate replacement (temporarily (mis)uses p) */
8050            for (p = collstart; p < collend; ++p)
8051                output += sprintf(output, "&#%d;", (int)*p);
8052            p = collend;
8053            break;
8054        default:
8055            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8056                                                          encoding, reason, s, length, &exc,
8057                                                          collstart-s, collend-s, &newpos);
8058            if (repunicode == NULL)
8059                goto onError;
8060            if (!PyUnicode_Check(repunicode)) {
8061                /* Byte results not supported, since they have no decimal property. */
8062                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8063                Py_DECREF(repunicode);
8064                goto onError;
8065            }
8066            /* generate replacement  */
8067            repsize = PyUnicode_GET_SIZE(repunicode);
8068            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8069                Py_UNICODE ch = *uni2;
8070                if (Py_UNICODE_ISSPACE(ch))
8071                    *output++ = ' ';
8072                else {
8073                    decimal = Py_UNICODE_TODECIMAL(ch);
8074                    if (decimal >= 0)
8075                        *output++ = '0' + decimal;
8076                    else if (0 < ch && ch < 256)
8077                        *output++ = (char)ch;
8078                    else {
8079                        Py_DECREF(repunicode);
8080                        raise_encode_exception(&exc, encoding,
8081                                               s, length, collstart-s, collend-s, reason);
8082                        goto onError;
8083                    }
8084                }
8085            }
8086            p = s + newpos;
8087            Py_DECREF(repunicode);
8088        }
8089    }
8090    /* 0-terminate the output string */
8091    *output++ = '\0';
8092    Py_XDECREF(exc);
8093    Py_XDECREF(errorHandler);
8094    return 0;
8095
8096  onError:
8097    Py_XDECREF(exc);
8098    Py_XDECREF(errorHandler);
8099    return -1;
8100}
8101
8102/* --- Helpers ------------------------------------------------------------ */
8103
8104#include "stringlib/ucs1lib.h"
8105#include "stringlib/fastsearch.h"
8106#include "stringlib/partition.h"
8107#include "stringlib/split.h"
8108#include "stringlib/count.h"
8109#include "stringlib/find.h"
8110#include "stringlib/localeutil.h"
8111#include "stringlib/undef.h"
8112
8113#include "stringlib/ucs2lib.h"
8114#include "stringlib/fastsearch.h"
8115#include "stringlib/partition.h"
8116#include "stringlib/split.h"
8117#include "stringlib/count.h"
8118#include "stringlib/find.h"
8119#include "stringlib/localeutil.h"
8120#include "stringlib/undef.h"
8121
8122#include "stringlib/ucs4lib.h"
8123#include "stringlib/fastsearch.h"
8124#include "stringlib/partition.h"
8125#include "stringlib/split.h"
8126#include "stringlib/count.h"
8127#include "stringlib/find.h"
8128#include "stringlib/localeutil.h"
8129#include "stringlib/undef.h"
8130
8131static Py_ssize_t
8132any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8133                                  const Py_UCS1*, Py_ssize_t,
8134                                  Py_ssize_t, Py_ssize_t),
8135               Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8136                                  const Py_UCS2*, Py_ssize_t,
8137                                  Py_ssize_t, Py_ssize_t),
8138               Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8139                                  const Py_UCS4*, Py_ssize_t,
8140                                  Py_ssize_t, Py_ssize_t),
8141               PyObject* s1, PyObject* s2,
8142               Py_ssize_t start,
8143               Py_ssize_t end)
8144{
8145    int kind1, kind2, kind;
8146    void *buf1, *buf2;
8147    Py_ssize_t len1, len2, result;
8148
8149    kind1 = PyUnicode_KIND(s1);
8150    kind2 = PyUnicode_KIND(s2);
8151    kind = kind1 > kind2 ? kind1 : kind2;
8152    buf1 = PyUnicode_DATA(s1);
8153    buf2 = PyUnicode_DATA(s2);
8154    if (kind1 != kind)
8155        buf1 = _PyUnicode_AsKind(s1, kind);
8156    if (!buf1)
8157        return -2;
8158    if (kind2 != kind)
8159        buf2 = _PyUnicode_AsKind(s2, kind);
8160    if (!buf2) {
8161        if (kind1 != kind) PyMem_Free(buf1);
8162        return -2;
8163    }
8164    len1 = PyUnicode_GET_LENGTH(s1);
8165    len2 = PyUnicode_GET_LENGTH(s2);
8166
8167    switch(kind) {
8168    case PyUnicode_1BYTE_KIND:
8169        result = ucs1(buf1, len1, buf2, len2, start, end);
8170        break;
8171    case PyUnicode_2BYTE_KIND:
8172        result = ucs2(buf1, len1, buf2, len2, start, end);
8173        break;
8174    case PyUnicode_4BYTE_KIND:
8175        result = ucs4(buf1, len1, buf2, len2, start, end);
8176        break;
8177    default:
8178        assert(0); result = -2;
8179    }
8180
8181    if (kind1 != kind)
8182        PyMem_Free(buf1);
8183    if (kind2 != kind)
8184        PyMem_Free(buf2);
8185
8186    return result;
8187}
8188
8189Py_ssize_t
8190_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8191                                   Py_ssize_t n_buffer,
8192                                   void *digits, Py_ssize_t n_digits,
8193                                   Py_ssize_t min_width,
8194                                   const char *grouping,
8195                                   const char *thousands_sep)
8196{
8197    switch(kind) {
8198    case PyUnicode_1BYTE_KIND:
8199        return _PyUnicode_ucs1_InsertThousandsGrouping(
8200            (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8201            min_width, grouping, thousands_sep);
8202    case PyUnicode_2BYTE_KIND:
8203        return _PyUnicode_ucs2_InsertThousandsGrouping(
8204            (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8205            min_width, grouping, thousands_sep);
8206    case PyUnicode_4BYTE_KIND:
8207        return _PyUnicode_ucs4_InsertThousandsGrouping(
8208            (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8209            min_width, grouping, thousands_sep);
8210    }
8211    assert(0);
8212    return -1;
8213}
8214
8215
8216#include "stringlib/unicodedefs.h"
8217#include "stringlib/fastsearch.h"
8218
8219#include "stringlib/count.h"
8220#include "stringlib/find.h"
8221
/* Helper macro to fix up start/end slice values in place.

   - end greater than len is clamped to len; a negative end has len added
     and is then clamped to 0 if still negative.
   - a negative start has len added and is then clamped to 0 if still
     negative.  start is NOT clamped to len, so start may end up greater
     than end; callers must cope with that.

   start and end must be modifiable lvalues.  Every argument is evaluated
   more than once, so none of them may have side effects.

   The body is wrapped in do { ... } while (0) so the expansion is a single
   statement: it must be followed by a semicolon and is safe inside an
   unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8236
8237Py_ssize_t
8238PyUnicode_Count(PyObject *str,
8239                PyObject *substr,
8240                Py_ssize_t start,
8241                Py_ssize_t end)
8242{
8243    Py_ssize_t result;
8244    PyUnicodeObject* str_obj;
8245    PyUnicodeObject* sub_obj;
8246    int kind1, kind2, kind;
8247    void *buf1 = NULL, *buf2 = NULL;
8248    Py_ssize_t len1, len2;
8249
8250    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
8251    if (!str_obj || PyUnicode_READY(str_obj) == -1)
8252        return -1;
8253    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
8254    if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
8255        Py_DECREF(str_obj);
8256        return -1;
8257    }
8258
8259    kind1 = PyUnicode_KIND(str_obj);
8260    kind2 = PyUnicode_KIND(sub_obj);
8261    kind = kind1 > kind2 ? kind1 : kind2;
8262    buf1 = PyUnicode_DATA(str_obj);
8263    if (kind1 != kind)
8264        buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8265    if (!buf1)
8266        goto onError;
8267    buf2 = PyUnicode_DATA(sub_obj);
8268    if (kind2 != kind)
8269        buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8270    if (!buf2)
8271        goto onError;
8272    len1 = PyUnicode_GET_LENGTH(str_obj);
8273    len2 = PyUnicode_GET_LENGTH(sub_obj);
8274
8275    ADJUST_INDICES(start, end, len1);
8276    switch(kind) {
8277    case PyUnicode_1BYTE_KIND:
8278        result = ucs1lib_count(
8279            ((Py_UCS1*)buf1) + start, end - start,
8280            buf2, len2, PY_SSIZE_T_MAX
8281            );
8282        break;
8283    case PyUnicode_2BYTE_KIND:
8284        result = ucs2lib_count(
8285            ((Py_UCS2*)buf1) + start, end - start,
8286            buf2, len2, PY_SSIZE_T_MAX
8287            );
8288        break;
8289    case PyUnicode_4BYTE_KIND:
8290        result = ucs4lib_count(
8291            ((Py_UCS4*)buf1) + start, end - start,
8292            buf2, len2, PY_SSIZE_T_MAX
8293            );
8294        break;
8295    default:
8296        assert(0); result = 0;
8297    }
8298
8299    Py_DECREF(sub_obj);
8300    Py_DECREF(str_obj);
8301
8302    if (kind1 != kind)
8303        PyMem_Free(buf1);
8304    if (kind2 != kind)
8305        PyMem_Free(buf2);
8306
8307    return result;
8308  onError:
8309    Py_DECREF(sub_obj);
8310    Py_DECREF(str_obj);
8311    if (kind1 != kind && buf1)
8312        PyMem_Free(buf1);
8313    if (kind2 != kind && buf2)
8314        PyMem_Free(buf2);
8315    return -1;
8316}
8317
8318Py_ssize_t
8319PyUnicode_Find(PyObject *str,
8320               PyObject *sub,
8321               Py_ssize_t start,
8322               Py_ssize_t end,
8323               int direction)
8324{
8325    Py_ssize_t result;
8326
8327    str = PyUnicode_FromObject(str);
8328    if (!str || PyUnicode_READY(str) == -1)
8329        return -2;
8330    sub = PyUnicode_FromObject(sub);
8331    if (!sub || PyUnicode_READY(sub) == -1) {
8332        Py_DECREF(str);
8333        return -2;
8334    }
8335
8336    if (direction > 0)
8337        result = any_find_slice(
8338            ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8339            str, sub, start, end
8340            );
8341    else
8342        result = any_find_slice(
8343            ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8344            str, sub, start, end
8345            );
8346
8347    Py_DECREF(str);
8348    Py_DECREF(sub);
8349
8350    return result;
8351}
8352
8353Py_ssize_t
8354PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8355                   Py_ssize_t start, Py_ssize_t end,
8356                   int direction)
8357{
8358    char *result;
8359    int kind;
8360    if (PyUnicode_READY(str) == -1)
8361        return -2;
8362    if (start < 0 || end < 0) {
8363        PyErr_SetString(PyExc_IndexError, "string index out of range");
8364        return -2;
8365    }
8366    if (end > PyUnicode_GET_LENGTH(str))
8367        end = PyUnicode_GET_LENGTH(str);
8368    kind = PyUnicode_KIND(str);
8369    result = findchar(PyUnicode_1BYTE_DATA(str)
8370                      + PyUnicode_KIND_SIZE(kind, start),
8371                      kind,
8372                      end-start, ch, direction);
8373    if (!result)
8374        return -1;
8375    return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8376}
8377
8378static int
8379tailmatch(PyUnicodeObject *self,
8380          PyUnicodeObject *substring,
8381          Py_ssize_t start,
8382          Py_ssize_t end,
8383          int direction)
8384{
8385    int kind_self;
8386    int kind_sub;
8387    void *data_self;
8388    void *data_sub;
8389    Py_ssize_t offset;
8390    Py_ssize_t i;
8391    Py_ssize_t end_sub;
8392
8393    if (PyUnicode_READY(self) == -1 ||
8394        PyUnicode_READY(substring) == -1)
8395        return 0;
8396
8397    if (PyUnicode_GET_LENGTH(substring) == 0)
8398        return 1;
8399
8400    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8401    end -= PyUnicode_GET_LENGTH(substring);
8402    if (end < start)
8403        return 0;
8404
8405    kind_self = PyUnicode_KIND(self);
8406    data_self = PyUnicode_DATA(self);
8407    kind_sub = PyUnicode_KIND(substring);
8408    data_sub = PyUnicode_DATA(substring);
8409    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8410
8411    if (direction > 0)
8412        offset = end;
8413    else
8414        offset = start;
8415
8416    if (PyUnicode_READ(kind_self, data_self, offset) ==
8417        PyUnicode_READ(kind_sub, data_sub, 0) &&
8418        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8419        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8420        /* If both are of the same kind, memcmp is sufficient */
8421        if (kind_self == kind_sub) {
8422            return ! memcmp((char *)data_self +
8423                                (offset * PyUnicode_CHARACTER_SIZE(substring)),
8424                            data_sub,
8425                            PyUnicode_GET_LENGTH(substring) *
8426                                PyUnicode_CHARACTER_SIZE(substring));
8427        }
8428        /* otherwise we have to compare each character by first accesing it */
8429        else {
8430            /* We do not need to compare 0 and len(substring)-1 because
8431               the if statement above ensured already that they are equal
8432               when we end up here. */
8433            // TODO: honor direction and do a forward or backwards search
8434            for (i = 1; i < end_sub; ++i) {
8435                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8436                    PyUnicode_READ(kind_sub, data_sub, i))
8437                    return 0;
8438            }
8439            return 1;
8440        }
8441    }
8442
8443    return 0;
8444}
8445
8446Py_ssize_t
8447PyUnicode_Tailmatch(PyObject *str,
8448                    PyObject *substr,
8449                    Py_ssize_t start,
8450                    Py_ssize_t end,
8451                    int direction)
8452{
8453    Py_ssize_t result;
8454
8455    str = PyUnicode_FromObject(str);
8456    if (str == NULL)
8457        return -1;
8458    substr = PyUnicode_FromObject(substr);
8459    if (substr == NULL) {
8460        Py_DECREF(str);
8461        return -1;
8462    }
8463
8464    result = tailmatch((PyUnicodeObject *)str,
8465                       (PyUnicodeObject *)substr,
8466                       start, end, direction);
8467    Py_DECREF(str);
8468    Py_DECREF(substr);
8469    return result;
8470}
8471
8472/* Apply fixfct filter to the Unicode object self and return a
8473   reference to the modified object */
8474
8475static PyObject *
8476fixup(PyUnicodeObject *self,
8477      Py_UCS4 (*fixfct)(PyUnicodeObject *s))
8478{
8479    PyObject *u;
8480    Py_UCS4 maxchar_old, maxchar_new = 0;
8481
8482    if (PyUnicode_READY(self) == -1)
8483        return NULL;
8484    maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8485    u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8486                      maxchar_old);
8487    if (u == NULL)
8488        return NULL;
8489
8490    Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8491              PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
8492
8493    /* fix functions return the new maximum character in a string,
8494       if the kind of the resulting unicode object does not change,
8495       everything is fine.  Otherwise we need to change the string kind
8496       and re-run the fix function. */
8497    maxchar_new = fixfct((PyUnicodeObject*)u);
8498    if (maxchar_new == 0)
8499        /* do nothing, keep maxchar_new at 0 which means no changes. */;
8500    else if (maxchar_new <= 127)
8501        maxchar_new = 127;
8502    else if (maxchar_new <= 255)
8503        maxchar_new = 255;
8504    else if (maxchar_new <= 65535)
8505        maxchar_new = 65535;
8506    else
8507        maxchar_new = 1114111; /* 0x10ffff */
8508
8509    if (!maxchar_new && PyUnicode_CheckExact(self)) {
8510        /* fixfct should return TRUE if it modified the buffer. If
8511           FALSE, return a reference to the original buffer instead
8512           (to save space, not time) */
8513        Py_INCREF(self);
8514        Py_DECREF(u);
8515        return (PyObject*) self;
8516    }
8517    else if (maxchar_new == maxchar_old) {
8518        return u;
8519    }
8520    else {
8521        /* In case the maximum character changed, we need to
8522           convert the string to the new category. */
8523        PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
8524        if (v == NULL) {
8525            Py_DECREF(u);
8526            return NULL;
8527        }
8528        if (maxchar_new > maxchar_old) {
8529            /* If the maxchar increased so that the kind changed, not all
8530               characters are representable anymore and we need to fix the
8531               string again. This only happens in very few cases. */
8532            if (PyUnicode_CopyCharacters(v, 0,
8533                                         (PyObject*)self, 0,
8534                                         PyUnicode_GET_LENGTH(self)) < 0)
8535            {
8536                Py_DECREF(u);
8537                return NULL;
8538            }
8539            maxchar_old = fixfct((PyUnicodeObject*)v);
8540            assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8541        }
8542        else {
8543            if (PyUnicode_CopyCharacters(v, 0,
8544                                         u, 0,
8545                                         PyUnicode_GET_LENGTH(self)) < 0)
8546            {
8547                Py_DECREF(u);
8548                return NULL;
8549            }
8550        }
8551
8552        Py_DECREF(u);
8553        return v;
8554    }
8555}
8556
8557static Py_UCS4
8558fixupper(PyUnicodeObject *self)
8559{
8560    /* No need to call PyUnicode_READY(self) because this function is only
8561       called as a callback from fixup() which does it already. */
8562    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8563    const int kind = PyUnicode_KIND(self);
8564    void *data = PyUnicode_DATA(self);
8565    int touched = 0;
8566    Py_UCS4 maxchar = 0;
8567    Py_ssize_t i;
8568
8569    for (i = 0; i < len; ++i) {
8570        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8571        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8572        if (up != ch) {
8573            if (up > maxchar)
8574                maxchar = up;
8575            PyUnicode_WRITE(kind, data, i, up);
8576            touched = 1;
8577        }
8578        else if (ch > maxchar)
8579            maxchar = ch;
8580    }
8581
8582    if (touched)
8583        return maxchar;
8584    else
8585        return 0;
8586}
8587
8588static Py_UCS4
8589fixlower(PyUnicodeObject *self)
8590{
8591    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8592    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8593    const int kind = PyUnicode_KIND(self);
8594    void *data = PyUnicode_DATA(self);
8595    int touched = 0;
8596    Py_UCS4 maxchar = 0;
8597    Py_ssize_t i;
8598
8599    for(i = 0; i < len; ++i) {
8600        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8601        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8602        if (lo != ch) {
8603            if (lo > maxchar)
8604                maxchar = lo;
8605            PyUnicode_WRITE(kind, data, i, lo);
8606            touched = 1;
8607        }
8608        else if (ch > maxchar)
8609            maxchar = ch;
8610    }
8611
8612    if (touched)
8613        return maxchar;
8614    else
8615        return 0;
8616}
8617
8618static Py_UCS4
8619fixswapcase(PyUnicodeObject *self)
8620{
8621    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8622    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8623    const int kind = PyUnicode_KIND(self);
8624    void *data = PyUnicode_DATA(self);
8625    int touched = 0;
8626    Py_UCS4 maxchar = 0;
8627    Py_ssize_t i;
8628
8629    for(i = 0; i < len; ++i) {
8630        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8631        Py_UCS4 nu = 0;
8632
8633        if (Py_UNICODE_ISUPPER(ch))
8634            nu = Py_UNICODE_TOLOWER(ch);
8635        else if (Py_UNICODE_ISLOWER(ch))
8636            nu = Py_UNICODE_TOUPPER(ch);
8637
8638        if (nu != 0) {
8639            if (nu > maxchar)
8640                maxchar = nu;
8641            PyUnicode_WRITE(kind, data, i, nu);
8642            touched = 1;
8643        }
8644        else if (ch > maxchar)
8645            maxchar = ch;
8646    }
8647
8648    if (touched)
8649        return maxchar;
8650    else
8651        return 0;
8652}
8653
8654static Py_UCS4
8655fixcapitalize(PyUnicodeObject *self)
8656{
8657    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8658    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8659    const int kind = PyUnicode_KIND(self);
8660    void *data = PyUnicode_DATA(self);
8661    int touched = 0;
8662    Py_UCS4 maxchar = 0;
8663    Py_ssize_t i = 0;
8664    Py_UCS4 ch;
8665
8666    if (len == 0)
8667        return 0;
8668
8669    ch = PyUnicode_READ(kind, data, i);
8670    if (!Py_UNICODE_ISUPPER(ch)) {
8671        maxchar = Py_UNICODE_TOUPPER(ch);
8672        PyUnicode_WRITE(kind, data, i, maxchar);
8673        touched = 1;
8674    }
8675    ++i;
8676    for(; i < len; ++i) {
8677        ch = PyUnicode_READ(kind, data, i);
8678        if (!Py_UNICODE_ISLOWER(ch)) {
8679            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8680            if (lo > maxchar)
8681                maxchar = lo;
8682            PyUnicode_WRITE(kind, data, i, lo);
8683            touched = 1;
8684        }
8685        else if (ch > maxchar)
8686            maxchar = ch;
8687    }
8688
8689    if (touched)
8690        return maxchar;
8691    else
8692        return 0;
8693}
8694
8695static Py_UCS4
8696fixtitle(PyUnicodeObject *self)
8697{
8698    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8699    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8700    const int kind = PyUnicode_KIND(self);
8701    void *data = PyUnicode_DATA(self);
8702    Py_UCS4 maxchar = 0;
8703    Py_ssize_t i = 0;
8704    int previous_is_cased;
8705
8706    /* Shortcut for single character strings */
8707    if (len == 1) {
8708        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8709        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8710        if (ti != ch) {
8711            PyUnicode_WRITE(kind, data, i, ti);
8712            return ti;
8713        }
8714        else
8715            return 0;
8716    }
8717    previous_is_cased = 0;
8718    for(; i < len; ++i) {
8719        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8720        Py_UCS4 nu;
8721
8722        if (previous_is_cased)
8723            nu = Py_UNICODE_TOLOWER(ch);
8724        else
8725            nu = Py_UNICODE_TOTITLE(ch);
8726
8727        if (nu > maxchar)
8728            maxchar = nu;
8729        PyUnicode_WRITE(kind, data, i, nu);
8730
8731        if (Py_UNICODE_ISLOWER(ch) ||
8732            Py_UNICODE_ISUPPER(ch) ||
8733            Py_UNICODE_ISTITLE(ch))
8734            previous_is_cased = 1;
8735        else
8736            previous_is_cased = 0;
8737    }
8738    return maxchar;
8739}
8740
8741PyObject *
8742PyUnicode_Join(PyObject *separator, PyObject *seq)
8743{
8744    PyObject *sep = NULL;
8745    Py_ssize_t seplen = 1;
8746    PyObject *res = NULL; /* the result */
8747    PyObject *fseq;          /* PySequence_Fast(seq) */
8748    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
8749    PyObject **items;
8750    PyObject *item;
8751    Py_ssize_t sz, i, res_offset;
8752    Py_UCS4 maxchar = 0;
8753    Py_UCS4 item_maxchar;
8754
8755    fseq = PySequence_Fast(seq, "");
8756    if (fseq == NULL) {
8757        return NULL;
8758    }
8759
8760    /* NOTE: the following code can't call back into Python code,
8761     * so we are sure that fseq won't be mutated.
8762     */
8763
8764    seqlen = PySequence_Fast_GET_SIZE(fseq);
8765    /* If empty sequence, return u"". */
8766    if (seqlen == 0) {
8767        res = PyUnicode_New(0, 0);
8768        goto Done;
8769    }
8770    items = PySequence_Fast_ITEMS(fseq);
8771    /* If singleton sequence with an exact Unicode, return that. */
8772    if (seqlen == 1) {
8773        item = items[0];
8774        if (PyUnicode_CheckExact(item)) {
8775            Py_INCREF(item);
8776            res = item;
8777            goto Done;
8778        }
8779    }
8780    else {
8781        /* Set up sep and seplen */
8782        if (separator == NULL) {
8783            /* fall back to a blank space separator */
8784            sep = PyUnicode_FromOrdinal(' ');
8785            if (!sep)
8786                goto onError;
8787        }
8788        else {
8789            if (!PyUnicode_Check(separator)) {
8790                PyErr_Format(PyExc_TypeError,
8791                             "separator: expected str instance,"
8792                             " %.80s found",
8793                             Py_TYPE(separator)->tp_name);
8794                goto onError;
8795            }
8796            if (PyUnicode_READY(separator) == -1)
8797                goto onError;
8798            sep = separator;
8799            seplen = PyUnicode_GET_LENGTH(separator);
8800            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8801            /* inc refcount to keep this code path symetric with the
8802               above case of a blank separator */
8803            Py_INCREF(sep);
8804        }
8805    }
8806
8807    /* There are at least two things to join, or else we have a subclass
8808     * of str in the sequence.
8809     * Do a pre-pass to figure out the total amount of space we'll
8810     * need (sz), and see whether all argument are strings.
8811     */
8812    sz = 0;
8813    for (i = 0; i < seqlen; i++) {
8814        const Py_ssize_t old_sz = sz;
8815        item = items[i];
8816        if (!PyUnicode_Check(item)) {
8817            PyErr_Format(PyExc_TypeError,
8818                         "sequence item %zd: expected str instance,"
8819                         " %.80s found",
8820                         i, Py_TYPE(item)->tp_name);
8821            goto onError;
8822        }
8823        if (PyUnicode_READY(item) == -1)
8824            goto onError;
8825        sz += PyUnicode_GET_LENGTH(item);
8826        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8827        if (item_maxchar > maxchar)
8828            maxchar = item_maxchar;
8829        if (i != 0)
8830            sz += seplen;
8831        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8832            PyErr_SetString(PyExc_OverflowError,
8833                            "join() result is too long for a Python string");
8834            goto onError;
8835        }
8836    }
8837
8838    res = PyUnicode_New(sz, maxchar);
8839    if (res == NULL)
8840        goto onError;
8841
8842    /* Catenate everything. */
8843    for (i = 0, res_offset = 0; i < seqlen; ++i) {
8844        Py_ssize_t itemlen;
8845        item = items[i];
8846        itemlen = PyUnicode_GET_LENGTH(item);
8847        /* Copy item, and maybe the separator. */
8848        if (i) {
8849            if (PyUnicode_CopyCharacters(res, res_offset,
8850                                         sep, 0, seplen) < 0)
8851                goto onError;
8852            res_offset += seplen;
8853        }
8854        if (PyUnicode_CopyCharacters(res, res_offset,
8855                                     item, 0, itemlen) < 0)
8856            goto onError;
8857        res_offset += itemlen;
8858    }
8859    assert(res_offset == PyUnicode_GET_LENGTH(res));
8860
8861  Done:
8862    Py_DECREF(fseq);
8863    Py_XDECREF(sep);
8864    return res;
8865
8866  onError:
8867    Py_DECREF(fseq);
8868    Py_XDECREF(sep);
8869    Py_XDECREF(res);
8870    return NULL;
8871}
8872
8873#define FILL(kind, data, value, start, length) \
8874    do { \
8875        Py_ssize_t i_ = 0; \
8876        assert(kind != PyUnicode_WCHAR_KIND); \
8877        switch ((kind)) { \
8878        case PyUnicode_1BYTE_KIND: { \
8879            unsigned char * to_ = (unsigned char *)((data)) + (start); \
8880            memset(to_, (unsigned char)value, length); \
8881            break; \
8882        } \
8883        case PyUnicode_2BYTE_KIND: { \
8884            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8885            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8886            break; \
8887        } \
8888        default: { \
8889            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8890            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8891            break; \
8892        } \
8893        } \
8894    } while (0)
8895
8896static PyUnicodeObject *
8897pad(PyUnicodeObject *self,
8898    Py_ssize_t left,
8899    Py_ssize_t right,
8900    Py_UCS4 fill)
8901{
8902    PyObject *u;
8903    Py_UCS4 maxchar;
8904    int kind;
8905    void *data;
8906
8907    if (left < 0)
8908        left = 0;
8909    if (right < 0)
8910        right = 0;
8911
8912    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
8913        Py_INCREF(self);
8914        return self;
8915    }
8916
8917    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8918        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
8919        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8920        return NULL;
8921    }
8922    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8923    if (fill > maxchar)
8924        maxchar = fill;
8925    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
8926    if (!u)
8927        return NULL;
8928
8929    kind = PyUnicode_KIND(u);
8930    data = PyUnicode_DATA(u);
8931    if (left)
8932        FILL(kind, data, fill, 0, left);
8933    if (right)
8934        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
8935    if (PyUnicode_CopyCharacters(u, left,
8936                                 (PyObject*)self, 0,
8937                                 _PyUnicode_LENGTH(self)) < 0)
8938    {
8939        Py_DECREF(u);
8940        return NULL;
8941    }
8942
8943    return (PyUnicodeObject*)u;
8944}
8945#undef FILL
8946
8947PyObject *
8948PyUnicode_Splitlines(PyObject *string, int keepends)
8949{
8950    PyObject *list;
8951
8952    string = PyUnicode_FromObject(string);
8953    if (string == NULL || PyUnicode_READY(string) == -1)
8954        return NULL;
8955
8956    switch(PyUnicode_KIND(string)) {
8957    case PyUnicode_1BYTE_KIND:
8958        list = ucs1lib_splitlines(
8959            (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8960            PyUnicode_GET_LENGTH(string), keepends);
8961        break;
8962    case PyUnicode_2BYTE_KIND:
8963        list = ucs2lib_splitlines(
8964            (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8965            PyUnicode_GET_LENGTH(string), keepends);
8966        break;
8967    case PyUnicode_4BYTE_KIND:
8968        list = ucs4lib_splitlines(
8969            (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8970            PyUnicode_GET_LENGTH(string), keepends);
8971        break;
8972    default:
8973        assert(0);
8974        list = 0;
8975    }
8976    Py_DECREF(string);
8977    return list;
8978}
8979
8980static PyObject *
8981split(PyUnicodeObject *self,
8982      PyUnicodeObject *substring,
8983      Py_ssize_t maxcount)
8984{
8985    int kind1, kind2, kind;
8986    void *buf1, *buf2;
8987    Py_ssize_t len1, len2;
8988    PyObject* out;
8989
8990    if (maxcount < 0)
8991        maxcount = PY_SSIZE_T_MAX;
8992
8993    if (PyUnicode_READY(self) == -1)
8994        return NULL;
8995
8996    if (substring == NULL)
8997        switch(PyUnicode_KIND(self)) {
8998        case PyUnicode_1BYTE_KIND:
8999            return ucs1lib_split_whitespace(
9000                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9001                PyUnicode_GET_LENGTH(self), maxcount
9002                );
9003        case PyUnicode_2BYTE_KIND:
9004            return ucs2lib_split_whitespace(
9005                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9006                PyUnicode_GET_LENGTH(self), maxcount
9007                );
9008        case PyUnicode_4BYTE_KIND:
9009            return ucs4lib_split_whitespace(
9010                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9011                PyUnicode_GET_LENGTH(self), maxcount
9012                );
9013        default:
9014            assert(0);
9015            return NULL;
9016        }
9017
9018    if (PyUnicode_READY(substring) == -1)
9019        return NULL;
9020
9021    kind1 = PyUnicode_KIND(self);
9022    kind2 = PyUnicode_KIND(substring);
9023    kind = kind1 > kind2 ? kind1 : kind2;
9024    buf1 = PyUnicode_DATA(self);
9025    buf2 = PyUnicode_DATA(substring);
9026    if (kind1 != kind)
9027        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9028    if (!buf1)
9029        return NULL;
9030    if (kind2 != kind)
9031        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9032    if (!buf2) {
9033        if (kind1 != kind) PyMem_Free(buf1);
9034        return NULL;
9035    }
9036    len1 = PyUnicode_GET_LENGTH(self);
9037    len2 = PyUnicode_GET_LENGTH(substring);
9038
9039    switch(kind) {
9040    case PyUnicode_1BYTE_KIND:
9041        out = ucs1lib_split(
9042            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9043        break;
9044    case PyUnicode_2BYTE_KIND:
9045        out = ucs2lib_split(
9046            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9047        break;
9048    case PyUnicode_4BYTE_KIND:
9049        out = ucs4lib_split(
9050            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9051        break;
9052    default:
9053        out = NULL;
9054    }
9055    if (kind1 != kind)
9056        PyMem_Free(buf1);
9057    if (kind2 != kind)
9058        PyMem_Free(buf2);
9059    return out;
9060}
9061
9062static PyObject *
9063rsplit(PyUnicodeObject *self,
9064       PyUnicodeObject *substring,
9065       Py_ssize_t maxcount)
9066{
9067    int kind1, kind2, kind;
9068    void *buf1, *buf2;
9069    Py_ssize_t len1, len2;
9070    PyObject* out;
9071
9072    if (maxcount < 0)
9073        maxcount = PY_SSIZE_T_MAX;
9074
9075    if (PyUnicode_READY(self) == -1)
9076        return NULL;
9077
9078    if (substring == NULL)
9079        switch(PyUnicode_KIND(self)) {
9080        case PyUnicode_1BYTE_KIND:
9081            return ucs1lib_rsplit_whitespace(
9082                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9083                PyUnicode_GET_LENGTH(self), maxcount
9084                );
9085        case PyUnicode_2BYTE_KIND:
9086            return ucs2lib_rsplit_whitespace(
9087                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9088                PyUnicode_GET_LENGTH(self), maxcount
9089                );
9090        case PyUnicode_4BYTE_KIND:
9091            return ucs4lib_rsplit_whitespace(
9092                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9093                PyUnicode_GET_LENGTH(self), maxcount
9094                );
9095        default:
9096            assert(0);
9097            return NULL;
9098        }
9099
9100    if (PyUnicode_READY(substring) == -1)
9101        return NULL;
9102
9103    kind1 = PyUnicode_KIND(self);
9104    kind2 = PyUnicode_KIND(substring);
9105    kind = kind1 > kind2 ? kind1 : kind2;
9106    buf1 = PyUnicode_DATA(self);
9107    buf2 = PyUnicode_DATA(substring);
9108    if (kind1 != kind)
9109        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9110    if (!buf1)
9111        return NULL;
9112    if (kind2 != kind)
9113        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9114    if (!buf2) {
9115        if (kind1 != kind) PyMem_Free(buf1);
9116        return NULL;
9117    }
9118    len1 = PyUnicode_GET_LENGTH(self);
9119    len2 = PyUnicode_GET_LENGTH(substring);
9120
9121    switch(kind) {
9122    case PyUnicode_1BYTE_KIND:
9123        out = ucs1lib_rsplit(
9124            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9125        break;
9126    case PyUnicode_2BYTE_KIND:
9127        out = ucs2lib_rsplit(
9128            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9129        break;
9130    case PyUnicode_4BYTE_KIND:
9131        out = ucs4lib_rsplit(
9132            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9133        break;
9134    default:
9135        out = NULL;
9136    }
9137    if (kind1 != kind)
9138        PyMem_Free(buf1);
9139    if (kind2 != kind)
9140        PyMem_Free(buf2);
9141    return out;
9142}
9143
9144static Py_ssize_t
9145anylib_find(int kind, void *buf1, Py_ssize_t len1,
9146            void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9147{
9148    switch(kind) {
9149    case PyUnicode_1BYTE_KIND:
9150        return ucs1lib_find(buf1, len1, buf2, len2, offset);
9151    case PyUnicode_2BYTE_KIND:
9152        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9153    case PyUnicode_4BYTE_KIND:
9154        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9155    }
9156    assert(0);
9157    return -1;
9158}
9159
9160static Py_ssize_t
9161anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9162             void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9163{
9164        switch(kind) {
9165        case PyUnicode_1BYTE_KIND:
9166            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9167        case PyUnicode_2BYTE_KIND:
9168            return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9169        case PyUnicode_4BYTE_KIND:
9170            return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9171        }
9172        assert(0);
9173        return 0;
9174}
9175
9176static PyObject *
9177replace(PyObject *self, PyObject *str1,
9178        PyObject *str2, Py_ssize_t maxcount)
9179{
9180    PyObject *u;
9181    char *sbuf = PyUnicode_DATA(self);
9182    char *buf1 = PyUnicode_DATA(str1);
9183    char *buf2 = PyUnicode_DATA(str2);
9184    int srelease = 0, release1 = 0, release2 = 0;
9185    int skind = PyUnicode_KIND(self);
9186    int kind1 = PyUnicode_KIND(str1);
9187    int kind2 = PyUnicode_KIND(str2);
9188    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9189    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9190    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9191
9192    if (maxcount < 0)
9193        maxcount = PY_SSIZE_T_MAX;
9194    else if (maxcount == 0 || slen == 0)
9195        goto nothing;
9196
9197    if (skind < kind1)
9198        /* substring too wide to be present */
9199        goto nothing;
9200
9201    if (len1 == len2) {
9202        Py_ssize_t i;
9203        /* same length */
9204        if (len1 == 0)
9205            goto nothing;
9206        if (len1 == 1) {
9207            /* replace characters */
9208            Py_UCS4 u1, u2, maxchar;
9209            int mayshrink, rkind;
9210            u1 = PyUnicode_READ_CHAR(str1, 0);
9211            if (!findchar(sbuf, PyUnicode_KIND(self),
9212                          slen, u1, 1))
9213                goto nothing;
9214            u2 = PyUnicode_READ_CHAR(str2, 0);
9215            maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9216            /* Replacing u1 with u2 may cause a maxchar reduction in the
9217               result string. */
9218            mayshrink = maxchar > 127;
9219            if (u2 > maxchar) {
9220                maxchar = u2;
9221                mayshrink = 0;
9222            }
9223            u = PyUnicode_New(slen, maxchar);
9224            if (!u)
9225                goto error;
9226            if (PyUnicode_CopyCharacters(u, 0,
9227                                         (PyObject*)self, 0, slen) < 0)
9228            {
9229                Py_DECREF(u);
9230                return NULL;
9231            }
9232            rkind = PyUnicode_KIND(u);
9233            for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9234                if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
9235                    if (--maxcount < 0)
9236                        break;
9237                    PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
9238                }
9239            if (mayshrink) {
9240                PyObject *tmp = u;
9241                u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9242                                              PyUnicode_GET_LENGTH(tmp));
9243                Py_DECREF(tmp);
9244            }
9245        } else {
9246            int rkind = skind;
9247            char *res;
9248            if (kind1 < rkind) {
9249                /* widen substring */
9250                buf1 = _PyUnicode_AsKind(str1, rkind);
9251                if (!buf1) goto error;
9252                release1 = 1;
9253            }
9254            i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
9255            if (i < 0)
9256                goto nothing;
9257            if (rkind > kind2) {
9258                /* widen replacement */
9259                buf2 = _PyUnicode_AsKind(str2, rkind);
9260                if (!buf2) goto error;
9261                release2 = 1;
9262            }
9263            else if (rkind < kind2) {
9264                /* widen self and buf1 */
9265                rkind = kind2;
9266                if (release1) PyMem_Free(buf1);
9267                sbuf = _PyUnicode_AsKind(self, rkind);
9268                if (!sbuf) goto error;
9269                srelease = 1;
9270                buf1 = _PyUnicode_AsKind(str1, rkind);
9271                if (!buf1) goto error;
9272                release1 = 1;
9273            }
9274            res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9275            if (!res) {
9276                PyErr_NoMemory();
9277                goto error;
9278            }
9279            memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
9280            /* change everything in-place, starting with this one */
9281            memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9282                   buf2,
9283                   PyUnicode_KIND_SIZE(rkind, len2));
9284            i += len1;
9285
9286            while ( --maxcount > 0) {
9287                i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9288                                slen-i,
9289                                buf1, len1, i);
9290                if (i == -1)
9291                    break;
9292                memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9293                       buf2,
9294                       PyUnicode_KIND_SIZE(rkind, len2));
9295                i += len1;
9296            }
9297
9298            u = PyUnicode_FromKindAndData(rkind, res, slen);
9299            PyMem_Free(res);
9300            if (!u) goto error;
9301        }
9302    } else {
9303
9304        Py_ssize_t n, i, j, ires;
9305        Py_ssize_t product, new_size;
9306        int rkind = skind;
9307        char *res;
9308
9309        if (kind1 < rkind) {
9310            buf1 = _PyUnicode_AsKind(str1, rkind);
9311            if (!buf1) goto error;
9312            release1 = 1;
9313        }
9314        n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
9315        if (n == 0)
9316            goto nothing;
9317        if (kind2 < rkind) {
9318            buf2 = _PyUnicode_AsKind(str2, rkind);
9319            if (!buf2) goto error;
9320            release2 = 1;
9321        }
9322        else if (kind2 > rkind) {
9323            rkind = kind2;
9324            sbuf = _PyUnicode_AsKind(self, rkind);
9325            if (!sbuf) goto error;
9326            srelease = 1;
9327            if (release1) PyMem_Free(buf1);
9328            buf1 = _PyUnicode_AsKind(str1, rkind);
9329            if (!buf1) goto error;
9330            release1 = 1;
9331        }
9332        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9333           PyUnicode_GET_LENGTH(str1))); */
9334        product = n * (len2-len1);
9335        if ((product / (len2-len1)) != n) {
9336                PyErr_SetString(PyExc_OverflowError,
9337                                "replace string is too long");
9338                goto error;
9339        }
9340        new_size = slen + product;
9341        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9342            PyErr_SetString(PyExc_OverflowError,
9343                            "replace string is too long");
9344            goto error;
9345        }
9346        res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9347        if (!res)
9348            goto error;
9349        ires = i = 0;
9350        if (len1 > 0) {
9351            while (n-- > 0) {
9352                /* look for next match */
9353                j = anylib_find(rkind,
9354                                sbuf + PyUnicode_KIND_SIZE(rkind, i),
9355                                slen-i, buf1, len1, i);
9356                if (j == -1)
9357                    break;
9358                else if (j > i) {
9359                    /* copy unchanged part [i:j] */
9360                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9361                           sbuf + PyUnicode_KIND_SIZE(rkind, i),
9362                           PyUnicode_KIND_SIZE(rkind, j-i));
9363                    ires += j - i;
9364                }
9365                /* copy substitution string */
9366                if (len2 > 0) {
9367                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9368                           buf2,
9369                           PyUnicode_KIND_SIZE(rkind, len2));
9370                    ires += len2;
9371                }
9372                i = j + len1;
9373            }
9374            if (i < slen)
9375                /* copy tail [i:] */
9376                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9377                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9378                       PyUnicode_KIND_SIZE(rkind, slen-i));
9379        } else {
9380            /* interleave */
9381            while (n > 0) {
9382                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9383                       buf2,
9384                       PyUnicode_KIND_SIZE(rkind, len2));
9385                ires += len2;
9386                if (--n <= 0)
9387                    break;
9388                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9389                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9390                       PyUnicode_KIND_SIZE(rkind, 1));
9391                ires++;
9392                i++;
9393            }
9394            memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9395                   sbuf + PyUnicode_KIND_SIZE(rkind, i),
9396                   PyUnicode_KIND_SIZE(rkind, slen-i));
9397        }
9398        u = PyUnicode_FromKindAndData(rkind, res, new_size);
9399        PyMem_Free(res);
9400    }
9401    if (srelease)
9402        PyMem_FREE(sbuf);
9403    if (release1)
9404        PyMem_FREE(buf1);
9405    if (release2)
9406        PyMem_FREE(buf2);
9407    return u;
9408
9409  nothing:
9410    /* nothing to replace; return original string (when possible) */
9411    if (srelease)
9412        PyMem_FREE(sbuf);
9413    if (release1)
9414        PyMem_FREE(buf1);
9415    if (release2)
9416        PyMem_FREE(buf2);
9417    if (PyUnicode_CheckExact(self)) {
9418        Py_INCREF(self);
9419        return (PyObject *) self;
9420    }
9421    return PyUnicode_Copy(self);
9422  error:
9423    if (srelease && sbuf)
9424        PyMem_FREE(sbuf);
9425    if (release1 && buf1)
9426        PyMem_FREE(buf1);
9427    if (release2 && buf2)
9428        PyMem_FREE(buf2);
9429    return NULL;
9430}
9431
9432/* --- Unicode Object Methods --------------------------------------------- */
9433
9434PyDoc_STRVAR(title__doc__,
9435             "S.title() -> str\n\
9436\n\
9437Return a titlecased version of S, i.e. words start with title case\n\
9438characters, all remaining cased characters have lower case.");
9439
9440static PyObject*
9441unicode_title(PyUnicodeObject *self)
9442{
9443    return fixup(self, fixtitle);
9444}
9445
9446PyDoc_STRVAR(capitalize__doc__,
9447             "S.capitalize() -> str\n\
9448\n\
9449Return a capitalized version of S, i.e. make the first character\n\
9450have upper case and the rest lower case.");
9451
9452static PyObject*
9453unicode_capitalize(PyUnicodeObject *self)
9454{
9455    return fixup(self, fixcapitalize);
9456}
9457
#if 0
/* NOTE: everything up to the matching #endif is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` into words, capitalize each word via
   fixup(..., fixcapitalize) and join the words again.
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words (a NULL separator selects whitespace splitting) */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* Drop the old word before storing its replacement in the slot. */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* Reached on success as well; item is NULL when fixup() failed. */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
9495
9496/* Argument converter.  Coerces to a single unicode character */
9497
9498static int
9499convert_uc(PyObject *obj, void *addr)
9500{
9501    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
9502    PyObject *uniobj;
9503
9504    uniobj = PyUnicode_FromObject(obj);
9505    if (uniobj == NULL) {
9506        PyErr_SetString(PyExc_TypeError,
9507                        "The fill character cannot be converted to Unicode");
9508        return 0;
9509    }
9510    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
9511        PyErr_SetString(PyExc_TypeError,
9512                        "The fill character must be exactly one character long");
9513        Py_DECREF(uniobj);
9514        return 0;
9515    }
9516    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
9517    Py_DECREF(uniobj);
9518    return 1;
9519}
9520
9521PyDoc_STRVAR(center__doc__,
9522             "S.center(width[, fillchar]) -> str\n\
9523\n\
9524Return S centered in a string of length width. Padding is\n\
9525done using the specified fill character (default is a space)");
9526
9527static PyObject *
9528unicode_center(PyUnicodeObject *self, PyObject *args)
9529{
9530    Py_ssize_t marg, left;
9531    Py_ssize_t width;
9532    Py_UCS4 fillchar = ' ';
9533
9534    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
9535        return NULL;
9536
9537    if (PyUnicode_READY(self) == -1)
9538        return NULL;
9539
9540    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
9541        Py_INCREF(self);
9542        return (PyObject*) self;
9543    }
9544
9545    marg = width - _PyUnicode_LENGTH(self);
9546    left = marg / 2 + (marg & width & 1);
9547
9548    return (PyObject*) pad(self, left, marg - left, fillchar);
9549}
9550
#if 0

/* NOTE: this branch is compiled out; the unicode_compare() in the #else
   branch below is the one that is built. */

/* This code should go into some future Unicode collation support
   module. The basic comparison should compare ordinals on a naive
   basis (this is what Java does and thus Jython too). */

/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* Adjustment added to a code unit, indexed by its top five bits
   (unit >> 11): entry 27 (units 0xD800-0xDFFF) is shifted up by 0x2000,
   entries 28-31 (units 0xE000-0xFFFF) are shifted down by 0x800. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare two strings unit by unit after applying utf16Fixup.
   Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->_base._base.length;
    len2 = str2->_base._base.length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 is 0xD000; only units above it can need fixing */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* One string is a prefix of the other: the shorter one sorts first. */
    return (len1 < len2) ? -1 : (len1 != len2);
}
9600
9601#else
9602
9603/* This function assumes that str1 and str2 are readied by the caller. */
9604
9605static int
9606unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9607{
9608    int kind1, kind2;
9609    void *data1, *data2;
9610    Py_ssize_t len1, len2, i;
9611
9612    kind1 = PyUnicode_KIND(str1);
9613    kind2 = PyUnicode_KIND(str2);
9614    data1 = PyUnicode_DATA(str1);
9615    data2 = PyUnicode_DATA(str2);
9616    len1 = PyUnicode_GET_LENGTH(str1);
9617    len2 = PyUnicode_GET_LENGTH(str2);
9618
9619    for (i = 0; i < len1 && i < len2; ++i) {
9620        Py_UCS4 c1, c2;
9621        c1 = PyUnicode_READ(kind1, data1, i);
9622        c2 = PyUnicode_READ(kind2, data2, i);
9623
9624        if (c1 != c2)
9625            return (c1 < c2) ? -1 : 1;
9626    }
9627
9628    return (len1 < len2) ? -1 : (len1 != len2);
9629}
9630
9631#endif
9632
9633int
9634PyUnicode_Compare(PyObject *left, PyObject *right)
9635{
9636    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9637        if (PyUnicode_READY(left) == -1 ||
9638            PyUnicode_READY(right) == -1)
9639            return -1;
9640        return unicode_compare((PyUnicodeObject *)left,
9641                               (PyUnicodeObject *)right);
9642    }
9643    PyErr_Format(PyExc_TypeError,
9644                 "Can't compare %.100s and %.100s",
9645                 left->ob_type->tp_name,
9646                 right->ob_type->tp_name);
9647    return -1;
9648}
9649
9650int
9651PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9652{
9653    Py_ssize_t i;
9654    int kind;
9655    void *data;
9656    Py_UCS4 chr;
9657
9658    assert(_PyUnicode_CHECK(uni));
9659    if (PyUnicode_READY(uni) == -1)
9660        return -1;
9661    kind = PyUnicode_KIND(uni);
9662    data = PyUnicode_DATA(uni);
9663    /* Compare Unicode string and source character set string */
9664    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9665        if (chr != str[i])
9666            return (chr < (unsigned char)(str[i])) ? -1 : 1;
9667    /* This check keeps Python strings that end in '\0' from comparing equal
9668     to C strings identical up to that point. */
9669    if (PyUnicode_GET_LENGTH(uni) != i || chr)
9670        return 1; /* uni is longer */
9671    if (str[i])
9672        return -1; /* str is longer */
9673    return 0;
9674}
9675
9676
9677#define TEST_COND(cond)                         \
9678    ((cond) ? Py_True : Py_False)
9679
9680PyObject *
9681PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
9682{
9683    int result;
9684
9685    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9686        PyObject *v;
9687        if (PyUnicode_READY(left) == -1 ||
9688            PyUnicode_READY(right) == -1)
9689            return NULL;
9690        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9691            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
9692            if (op == Py_EQ) {
9693                Py_INCREF(Py_False);
9694                return Py_False;
9695            }
9696            if (op == Py_NE) {
9697                Py_INCREF(Py_True);
9698                return Py_True;
9699            }
9700        }
9701        if (left == right)
9702            result = 0;
9703        else
9704            result = unicode_compare((PyUnicodeObject *)left,
9705                                     (PyUnicodeObject *)right);
9706
9707        /* Convert the return value to a Boolean */
9708        switch (op) {
9709        case Py_EQ:
9710            v = TEST_COND(result == 0);
9711            break;
9712        case Py_NE:
9713            v = TEST_COND(result != 0);
9714            break;
9715        case Py_LE:
9716            v = TEST_COND(result <= 0);
9717            break;
9718        case Py_GE:
9719            v = TEST_COND(result >= 0);
9720            break;
9721        case Py_LT:
9722            v = TEST_COND(result == -1);
9723            break;
9724        case Py_GT:
9725            v = TEST_COND(result == 1);
9726            break;
9727        default:
9728            PyErr_BadArgument();
9729            return NULL;
9730        }
9731        Py_INCREF(v);
9732        return v;
9733    }
9734
9735    Py_RETURN_NOTIMPLEMENTED;
9736}
9737
9738int
9739PyUnicode_Contains(PyObject *container, PyObject *element)
9740{
9741    PyObject *str, *sub;
9742    int kind1, kind2, kind;
9743    void *buf1, *buf2;
9744    Py_ssize_t len1, len2;
9745    int result;
9746
9747    /* Coerce the two arguments */
9748    sub = PyUnicode_FromObject(element);
9749    if (!sub) {
9750        PyErr_Format(PyExc_TypeError,
9751                     "'in <string>' requires string as left operand, not %s",
9752                     element->ob_type->tp_name);
9753        return -1;
9754    }
9755    if (PyUnicode_READY(sub) == -1)
9756        return -1;
9757
9758    str = PyUnicode_FromObject(container);
9759    if (!str || PyUnicode_READY(str) == -1) {
9760        Py_DECREF(sub);
9761        return -1;
9762    }
9763
9764    kind1 = PyUnicode_KIND(str);
9765    kind2 = PyUnicode_KIND(sub);
9766    kind = kind1 > kind2 ? kind1 : kind2;
9767    buf1 = PyUnicode_DATA(str);
9768    buf2 = PyUnicode_DATA(sub);
9769    if (kind1 != kind)
9770        buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9771    if (!buf1) {
9772        Py_DECREF(sub);
9773        return -1;
9774    }
9775    if (kind2 != kind)
9776        buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9777    if (!buf2) {
9778        Py_DECREF(sub);
9779        if (kind1 != kind) PyMem_Free(buf1);
9780        return -1;
9781    }
9782    len1 = PyUnicode_GET_LENGTH(str);
9783    len2 = PyUnicode_GET_LENGTH(sub);
9784
9785    switch(kind) {
9786    case PyUnicode_1BYTE_KIND:
9787        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9788        break;
9789    case PyUnicode_2BYTE_KIND:
9790        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9791        break;
9792    case PyUnicode_4BYTE_KIND:
9793        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9794        break;
9795    default:
9796        result = -1;
9797        assert(0);
9798    }
9799
9800    Py_DECREF(str);
9801    Py_DECREF(sub);
9802
9803    if (kind1 != kind)
9804        PyMem_Free(buf1);
9805    if (kind2 != kind)
9806        PyMem_Free(buf2);
9807
9808    return result;
9809}
9810
9811/* Concat to string or Unicode object giving a new Unicode object. */
9812
9813PyObject *
9814PyUnicode_Concat(PyObject *left, PyObject *right)
9815{
9816    PyObject *u = NULL, *v = NULL, *w;
9817    Py_UCS4 maxchar;
9818
9819    /* Coerce the two arguments */
9820    u = PyUnicode_FromObject(left);
9821    if (u == NULL)
9822        goto onError;
9823    v = PyUnicode_FromObject(right);
9824    if (v == NULL)
9825        goto onError;
9826
9827    /* Shortcuts */
9828    if (v == unicode_empty) {
9829        Py_DECREF(v);
9830        return u;
9831    }
9832    if (u == unicode_empty) {
9833        Py_DECREF(u);
9834        return v;
9835    }
9836
9837    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
9838    maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
9839
9840    /* Concat the two Unicode strings */
9841    w = PyUnicode_New(
9842        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9843        maxchar);
9844    if (w == NULL)
9845        goto onError;
9846    if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9847        goto onError;
9848    if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
9849                                 v, 0,
9850                                 PyUnicode_GET_LENGTH(v)) < 0)
9851        goto onError;
9852    Py_DECREF(u);
9853    Py_DECREF(v);
9854    return w;
9855
9856  onError:
9857    Py_XDECREF(u);
9858    Py_XDECREF(v);
9859    return NULL;
9860}
9861
9862void
9863PyUnicode_Append(PyObject **p_left, PyObject *right)
9864{
9865    PyObject *left, *res;
9866
9867    if (p_left == NULL) {
9868        if (!PyErr_Occurred())
9869            PyErr_BadInternalCall();
9870        return;
9871    }
9872    left = *p_left;
9873    if (right == NULL || !PyUnicode_Check(left)) {
9874        if (!PyErr_Occurred())
9875            PyErr_BadInternalCall();
9876        goto error;
9877    }
9878
9879    if (PyUnicode_CheckExact(left) && left != unicode_empty
9880        && PyUnicode_CheckExact(right) && right != unicode_empty
9881        && unicode_resizable(left)
9882        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9883            || _PyUnicode_WSTR(left) != NULL))
9884    {
9885        Py_ssize_t u_len, v_len, new_len, copied;
9886
9887        /* FIXME: don't make wstr string ready */
9888        if (PyUnicode_READY(left))
9889            goto error;
9890        if (PyUnicode_READY(right))
9891            goto error;
9892
9893        /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9894        if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9895        {
9896            u_len = PyUnicode_GET_LENGTH(left);
9897            v_len = PyUnicode_GET_LENGTH(right);
9898            if (u_len > PY_SSIZE_T_MAX - v_len) {
9899                PyErr_SetString(PyExc_OverflowError,
9900                                "strings are too large to concat");
9901                goto error;
9902            }
9903            new_len = u_len + v_len;
9904
9905            /* Now we own the last reference to 'left', so we can resize it
9906             * in-place.
9907             */
9908            if (unicode_resize(&left, new_len) != 0) {
9909                /* XXX if _PyUnicode_Resize() fails, 'left' has been
9910                 * deallocated so it cannot be put back into
9911                 * 'variable'.  The MemoryError is raised when there
9912                 * is no value in 'variable', which might (very
9913                 * remotely) be a cause of incompatibilities.
9914                 */
9915                goto error;
9916            }
9917            /* copy 'right' into the newly allocated area of 'left' */
9918            copied = PyUnicode_CopyCharacters(left, u_len,
9919                                              right, 0,
9920                                              v_len);
9921            assert(0 <= copied);
9922            *p_left = left;
9923            return;
9924        }
9925    }
9926
9927    res = PyUnicode_Concat(left, right);
9928    if (res == NULL)
9929        goto error;
9930    Py_DECREF(left);
9931    *p_left = res;
9932    return;
9933
9934error:
9935    Py_DECREF(*p_left);
9936    *p_left = NULL;
9937}
9938
9939void
9940PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9941{
9942    PyUnicode_Append(pleft, right);
9943    Py_XDECREF(right);
9944}
9945
9946PyDoc_STRVAR(count__doc__,
9947             "S.count(sub[, start[, end]]) -> int\n\
9948\n\
9949Return the number of non-overlapping occurrences of substring sub in\n\
9950string S[start:end].  Optional arguments start and end are\n\
9951interpreted as in slice notation.");
9952
9953static PyObject *
9954unicode_count(PyUnicodeObject *self, PyObject *args)
9955{
9956    PyUnicodeObject *substring;
9957    Py_ssize_t start = 0;
9958    Py_ssize_t end = PY_SSIZE_T_MAX;
9959    PyObject *result;
9960    int kind1, kind2, kind;
9961    void *buf1, *buf2;
9962    Py_ssize_t len1, len2, iresult;
9963
9964    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9965                                            &start, &end))
9966        return NULL;
9967
9968    kind1 = PyUnicode_KIND(self);
9969    kind2 = PyUnicode_KIND(substring);
9970    kind = kind1 > kind2 ? kind1 : kind2;
9971    buf1 = PyUnicode_DATA(self);
9972    buf2 = PyUnicode_DATA(substring);
9973    if (kind1 != kind)
9974        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9975    if (!buf1) {
9976        Py_DECREF(substring);
9977        return NULL;
9978    }
9979    if (kind2 != kind)
9980        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9981    if (!buf2) {
9982        Py_DECREF(substring);
9983        if (kind1 != kind) PyMem_Free(buf1);
9984        return NULL;
9985    }
9986    len1 = PyUnicode_GET_LENGTH(self);
9987    len2 = PyUnicode_GET_LENGTH(substring);
9988
9989    ADJUST_INDICES(start, end, len1);
9990    switch(kind) {
9991    case PyUnicode_1BYTE_KIND:
9992        iresult = ucs1lib_count(
9993            ((Py_UCS1*)buf1) + start, end - start,
9994            buf2, len2, PY_SSIZE_T_MAX
9995            );
9996        break;
9997    case PyUnicode_2BYTE_KIND:
9998        iresult = ucs2lib_count(
9999            ((Py_UCS2*)buf1) + start, end - start,
10000            buf2, len2, PY_SSIZE_T_MAX
10001            );
10002        break;
10003    case PyUnicode_4BYTE_KIND:
10004        iresult = ucs4lib_count(
10005            ((Py_UCS4*)buf1) + start, end - start,
10006            buf2, len2, PY_SSIZE_T_MAX
10007            );
10008        break;
10009    default:
10010        assert(0); iresult = 0;
10011    }
10012
10013    result = PyLong_FromSsize_t(iresult);
10014
10015    if (kind1 != kind)
10016        PyMem_Free(buf1);
10017    if (kind2 != kind)
10018        PyMem_Free(buf2);
10019
10020    Py_DECREF(substring);
10021
10022    return result;
10023}
10024
10025PyDoc_STRVAR(encode__doc__,
10026             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10027\n\
10028Encode S using the codec registered for encoding. Default encoding\n\
10029is 'utf-8'. errors may be given to set a different error\n\
10030handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10031a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10032'xmlcharrefreplace' as well as any other name registered with\n\
10033codecs.register_error that can handle UnicodeEncodeErrors.");
10034
10035static PyObject *
10036unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
10037{
10038    static char *kwlist[] = {"encoding", "errors", 0};
10039    char *encoding = NULL;
10040    char *errors = NULL;
10041
10042    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10043                                     kwlist, &encoding, &errors))
10044        return NULL;
10045    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
10046}
10047
10048PyDoc_STRVAR(expandtabs__doc__,
10049             "S.expandtabs([tabsize]) -> str\n\
10050\n\
10051Return a copy of S where all tab characters are expanded using spaces.\n\
10052If tabsize is not given, a tab size of 8 characters is assumed.");
10053
10054static PyObject*
10055unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10056{
10057    Py_UNICODE *e;
10058    Py_UNICODE *p;
10059    Py_UNICODE *q;
10060    Py_UNICODE *qe;
10061    Py_ssize_t i, j, incr, wstr_length;
10062    PyUnicodeObject *u;
10063    int tabsize = 8;
10064
10065    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10066        return NULL;
10067
10068    if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10069        return NULL;
10070
10071    /* First pass: determine size of output string */
10072    i = 0; /* chars up to and including most recent \n or \r */
10073    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
10074    e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10075    for (p = _PyUnicode_WSTR(self); p < e; p++)
10076        if (*p == '\t') {
10077            if (tabsize > 0) {
10078                incr = tabsize - (j % tabsize); /* cannot overflow */
10079                if (j > PY_SSIZE_T_MAX - incr)
10080                    goto overflow1;
10081                j += incr;
10082            }
10083        }
10084        else {
10085            if (j > PY_SSIZE_T_MAX - 1)
10086                goto overflow1;
10087            j++;
10088            if (*p == '\n' || *p == '\r') {
10089                if (i > PY_SSIZE_T_MAX - j)
10090                    goto overflow1;
10091                i += j;
10092                j = 0;
10093            }
10094        }
10095
10096    if (i > PY_SSIZE_T_MAX - j)
10097        goto overflow1;
10098
10099    /* Second pass: create output string and fill it */
10100    u = _PyUnicode_New(i + j);
10101    if (!u)
10102        return NULL;
10103
10104    j = 0; /* same as in first pass */
10105    q = _PyUnicode_WSTR(u); /* next output char */
10106    qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
10107
10108    for (p = _PyUnicode_WSTR(self); p < e; p++)
10109        if (*p == '\t') {
10110            if (tabsize > 0) {
10111                i = tabsize - (j % tabsize);
10112                j += i;
10113                while (i--) {
10114                    if (q >= qe)
10115                        goto overflow2;
10116                    *q++ = ' ';
10117                }
10118            }
10119        }
10120        else {
10121            if (q >= qe)
10122                goto overflow2;
10123            *q++ = *p;
10124            j++;
10125            if (*p == '\n' || *p == '\r')
10126                j = 0;
10127        }
10128
10129    if (PyUnicode_READY(u) == -1) {
10130        Py_DECREF(u);
10131        return NULL;
10132    }
10133    return (PyObject*) u;
10134
10135  overflow2:
10136    Py_DECREF(u);
10137  overflow1:
10138    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10139    return NULL;
10140}
10141
10142PyDoc_STRVAR(find__doc__,
10143             "S.find(sub[, start[, end]]) -> int\n\
10144\n\
10145Return the lowest index in S where substring sub is found,\n\
10146such that sub is contained within S[start:end].  Optional\n\
10147arguments start and end are interpreted as in slice notation.\n\
10148\n\
10149Return -1 on failure.");
10150
10151static PyObject *
10152unicode_find(PyObject *self, PyObject *args)
10153{
10154    PyUnicodeObject *substring;
10155    Py_ssize_t start;
10156    Py_ssize_t end;
10157    Py_ssize_t result;
10158
10159    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10160                                            &start, &end))
10161        return NULL;
10162
10163    if (PyUnicode_READY(self) == -1)
10164        return NULL;
10165    if (PyUnicode_READY(substring) == -1)
10166        return NULL;
10167
10168    result = any_find_slice(
10169        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10170        self, (PyObject*)substring, start, end
10171        );
10172
10173    Py_DECREF(substring);
10174
10175    if (result == -2)
10176        return NULL;
10177
10178    return PyLong_FromSsize_t(result);
10179}
10180
10181static PyObject *
10182unicode_getitem(PyObject *self, Py_ssize_t index)
10183{
10184    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10185    if (ch == (Py_UCS4)-1)
10186        return NULL;
10187    return PyUnicode_FromOrdinal(ch);
10188}
10189
10190/* Believe it or not, this produces the same value for ASCII strings
10191   as bytes_hash(). */
10192static Py_hash_t
10193unicode_hash(PyUnicodeObject *self)
10194{
10195    Py_ssize_t len;
10196    Py_uhash_t x;
10197
10198    if (_PyUnicode_HASH(self) != -1)
10199        return _PyUnicode_HASH(self);
10200    if (PyUnicode_READY(self) == -1)
10201        return -1;
10202    len = PyUnicode_GET_LENGTH(self);
10203
10204    /* The hash function as a macro, gets expanded three times below. */
10205#define HASH(P) \
10206    x = (Py_uhash_t)*P << 7; \
10207    while (--len >= 0) \
10208        x = (1000003*x) ^ (Py_uhash_t)*P++;
10209
10210    switch (PyUnicode_KIND(self)) {
10211    case PyUnicode_1BYTE_KIND: {
10212        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10213        HASH(c);
10214        break;
10215    }
10216    case PyUnicode_2BYTE_KIND: {
10217        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10218        HASH(s);
10219        break;
10220    }
10221    default: {
10222        Py_UCS4 *l;
10223        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10224               "Impossible switch case in unicode_hash");
10225        l = PyUnicode_4BYTE_DATA(self);
10226        HASH(l);
10227        break;
10228    }
10229    }
10230    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10231
10232    if (x == -1)
10233        x = -2;
10234    _PyUnicode_HASH(self) = x;
10235    return x;
10236}
10237#undef HASH
10238
10239PyDoc_STRVAR(index__doc__,
10240             "S.index(sub[, start[, end]]) -> int\n\
10241\n\
10242Like S.find() but raise ValueError when the substring is not found.");
10243
10244static PyObject *
10245unicode_index(PyObject *self, PyObject *args)
10246{
10247    Py_ssize_t result;
10248    PyUnicodeObject *substring;
10249    Py_ssize_t start;
10250    Py_ssize_t end;
10251
10252    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10253                                            &start, &end))
10254        return NULL;
10255
10256    if (PyUnicode_READY(self) == -1)
10257        return NULL;
10258    if (PyUnicode_READY(substring) == -1)
10259        return NULL;
10260
10261    result = any_find_slice(
10262        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10263        self, (PyObject*)substring, start, end
10264        );
10265
10266    Py_DECREF(substring);
10267
10268    if (result == -2)
10269        return NULL;
10270
10271    if (result < 0) {
10272        PyErr_SetString(PyExc_ValueError, "substring not found");
10273        return NULL;
10274    }
10275
10276    return PyLong_FromSsize_t(result);
10277}
10278
10279PyDoc_STRVAR(islower__doc__,
10280             "S.islower() -> bool\n\
10281\n\
10282Return True if all cased characters in S are lowercase and there is\n\
10283at least one cased character in S, False otherwise.");
10284
10285static PyObject*
10286unicode_islower(PyUnicodeObject *self)
10287{
10288    Py_ssize_t i, length;
10289    int kind;
10290    void *data;
10291    int cased;
10292
10293    if (PyUnicode_READY(self) == -1)
10294        return NULL;
10295    length = PyUnicode_GET_LENGTH(self);
10296    kind = PyUnicode_KIND(self);
10297    data = PyUnicode_DATA(self);
10298
10299    /* Shortcut for single character strings */
10300    if (length == 1)
10301        return PyBool_FromLong(
10302            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
10303
10304    /* Special case for empty strings */
10305    if (length == 0)
10306        return PyBool_FromLong(0);
10307
10308    cased = 0;
10309    for (i = 0; i < length; i++) {
10310        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10311
10312        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10313            return PyBool_FromLong(0);
10314        else if (!cased && Py_UNICODE_ISLOWER(ch))
10315            cased = 1;
10316    }
10317    return PyBool_FromLong(cased);
10318}
10319
10320PyDoc_STRVAR(isupper__doc__,
10321             "S.isupper() -> bool\n\
10322\n\
10323Return True if all cased characters in S are uppercase and there is\n\
10324at least one cased character in S, False otherwise.");
10325
10326static PyObject*
10327unicode_isupper(PyUnicodeObject *self)
10328{
10329    Py_ssize_t i, length;
10330    int kind;
10331    void *data;
10332    int cased;
10333
10334    if (PyUnicode_READY(self) == -1)
10335        return NULL;
10336    length = PyUnicode_GET_LENGTH(self);
10337    kind = PyUnicode_KIND(self);
10338    data = PyUnicode_DATA(self);
10339
10340    /* Shortcut for single character strings */
10341    if (length == 1)
10342        return PyBool_FromLong(
10343            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
10344
10345    /* Special case for empty strings */
10346    if (length == 0)
10347        return PyBool_FromLong(0);
10348
10349    cased = 0;
10350    for (i = 0; i < length; i++) {
10351        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10352
10353        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10354            return PyBool_FromLong(0);
10355        else if (!cased && Py_UNICODE_ISUPPER(ch))
10356            cased = 1;
10357    }
10358    return PyBool_FromLong(cased);
10359}
10360
10361PyDoc_STRVAR(istitle__doc__,
10362             "S.istitle() -> bool\n\
10363\n\
10364Return True if S is a titlecased string and there is at least one\n\
10365character in S, i.e. upper- and titlecase characters may only\n\
10366follow uncased characters and lowercase characters only cased ones.\n\
10367Return False otherwise.");
10368
10369static PyObject*
10370unicode_istitle(PyUnicodeObject *self)
10371{
10372    Py_ssize_t i, length;
10373    int kind;
10374    void *data;
10375    int cased, previous_is_cased;
10376
10377    if (PyUnicode_READY(self) == -1)
10378        return NULL;
10379    length = PyUnicode_GET_LENGTH(self);
10380    kind = PyUnicode_KIND(self);
10381    data = PyUnicode_DATA(self);
10382
10383    /* Shortcut for single character strings */
10384    if (length == 1) {
10385        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10386        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10387                               (Py_UNICODE_ISUPPER(ch) != 0));
10388    }
10389
10390    /* Special case for empty strings */
10391    if (length == 0)
10392        return PyBool_FromLong(0);
10393
10394    cased = 0;
10395    previous_is_cased = 0;
10396    for (i = 0; i < length; i++) {
10397        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10398
10399        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10400            if (previous_is_cased)
10401                return PyBool_FromLong(0);
10402            previous_is_cased = 1;
10403            cased = 1;
10404        }
10405        else if (Py_UNICODE_ISLOWER(ch)) {
10406            if (!previous_is_cased)
10407                return PyBool_FromLong(0);
10408            previous_is_cased = 1;
10409            cased = 1;
10410        }
10411        else
10412            previous_is_cased = 0;
10413    }
10414    return PyBool_FromLong(cased);
10415}
10416
10417PyDoc_STRVAR(isspace__doc__,
10418             "S.isspace() -> bool\n\
10419\n\
10420Return True if all characters in S are whitespace\n\
10421and there is at least one character in S, False otherwise.");
10422
10423static PyObject*
10424unicode_isspace(PyUnicodeObject *self)
10425{
10426    Py_ssize_t i, length;
10427    int kind;
10428    void *data;
10429
10430    if (PyUnicode_READY(self) == -1)
10431        return NULL;
10432    length = PyUnicode_GET_LENGTH(self);
10433    kind = PyUnicode_KIND(self);
10434    data = PyUnicode_DATA(self);
10435
10436    /* Shortcut for single character strings */
10437    if (length == 1)
10438        return PyBool_FromLong(
10439            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
10440
10441    /* Special case for empty strings */
10442    if (length == 0)
10443        return PyBool_FromLong(0);
10444
10445    for (i = 0; i < length; i++) {
10446        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10447        if (!Py_UNICODE_ISSPACE(ch))
10448            return PyBool_FromLong(0);
10449    }
10450    return PyBool_FromLong(1);
10451}
10452
10453PyDoc_STRVAR(isalpha__doc__,
10454             "S.isalpha() -> bool\n\
10455\n\
10456Return True if all characters in S are alphabetic\n\
10457and there is at least one character in S, False otherwise.");
10458
10459static PyObject*
10460unicode_isalpha(PyUnicodeObject *self)
10461{
10462    Py_ssize_t i, length;
10463    int kind;
10464    void *data;
10465
10466    if (PyUnicode_READY(self) == -1)
10467        return NULL;
10468    length = PyUnicode_GET_LENGTH(self);
10469    kind = PyUnicode_KIND(self);
10470    data = PyUnicode_DATA(self);
10471
10472    /* Shortcut for single character strings */
10473    if (length == 1)
10474        return PyBool_FromLong(
10475            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
10476
10477    /* Special case for empty strings */
10478    if (length == 0)
10479        return PyBool_FromLong(0);
10480
10481    for (i = 0; i < length; i++) {
10482        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
10483            return PyBool_FromLong(0);
10484    }
10485    return PyBool_FromLong(1);
10486}
10487
10488PyDoc_STRVAR(isalnum__doc__,
10489             "S.isalnum() -> bool\n\
10490\n\
10491Return True if all characters in S are alphanumeric\n\
10492and there is at least one character in S, False otherwise.");
10493
10494static PyObject*
10495unicode_isalnum(PyUnicodeObject *self)
10496{
10497    int kind;
10498    void *data;
10499    Py_ssize_t len, i;
10500
10501    if (PyUnicode_READY(self) == -1)
10502        return NULL;
10503
10504    kind = PyUnicode_KIND(self);
10505    data = PyUnicode_DATA(self);
10506    len = PyUnicode_GET_LENGTH(self);
10507
10508    /* Shortcut for single character strings */
10509    if (len == 1) {
10510        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10511        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10512    }
10513
10514    /* Special case for empty strings */
10515    if (len == 0)
10516        return PyBool_FromLong(0);
10517
10518    for (i = 0; i < len; i++) {
10519        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10520        if (!Py_UNICODE_ISALNUM(ch))
10521            return PyBool_FromLong(0);
10522    }
10523    return PyBool_FromLong(1);
10524}
10525
10526PyDoc_STRVAR(isdecimal__doc__,
10527             "S.isdecimal() -> bool\n\
10528\n\
10529Return True if there are only decimal characters in S,\n\
10530False otherwise.");
10531
10532static PyObject*
10533unicode_isdecimal(PyUnicodeObject *self)
10534{
10535    Py_ssize_t i, length;
10536    int kind;
10537    void *data;
10538
10539    if (PyUnicode_READY(self) == -1)
10540        return NULL;
10541    length = PyUnicode_GET_LENGTH(self);
10542    kind = PyUnicode_KIND(self);
10543    data = PyUnicode_DATA(self);
10544
10545    /* Shortcut for single character strings */
10546    if (length == 1)
10547        return PyBool_FromLong(
10548            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
10549
10550    /* Special case for empty strings */
10551    if (length == 0)
10552        return PyBool_FromLong(0);
10553
10554    for (i = 0; i < length; i++) {
10555        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
10556            return PyBool_FromLong(0);
10557    }
10558    return PyBool_FromLong(1);
10559}
10560
10561PyDoc_STRVAR(isdigit__doc__,
10562             "S.isdigit() -> bool\n\
10563\n\
10564Return True if all characters in S are digits\n\
10565and there is at least one character in S, False otherwise.");
10566
10567static PyObject*
10568unicode_isdigit(PyUnicodeObject *self)
10569{
10570    Py_ssize_t i, length;
10571    int kind;
10572    void *data;
10573
10574    if (PyUnicode_READY(self) == -1)
10575        return NULL;
10576    length = PyUnicode_GET_LENGTH(self);
10577    kind = PyUnicode_KIND(self);
10578    data = PyUnicode_DATA(self);
10579
10580    /* Shortcut for single character strings */
10581    if (length == 1) {
10582        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10583        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10584    }
10585
10586    /* Special case for empty strings */
10587    if (length == 0)
10588        return PyBool_FromLong(0);
10589
10590    for (i = 0; i < length; i++) {
10591        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
10592            return PyBool_FromLong(0);
10593    }
10594    return PyBool_FromLong(1);
10595}
10596
10597PyDoc_STRVAR(isnumeric__doc__,
10598             "S.isnumeric() -> bool\n\
10599\n\
10600Return True if there are only numeric characters in S,\n\
10601False otherwise.");
10602
10603static PyObject*
10604unicode_isnumeric(PyUnicodeObject *self)
10605{
10606    Py_ssize_t i, length;
10607    int kind;
10608    void *data;
10609
10610    if (PyUnicode_READY(self) == -1)
10611        return NULL;
10612    length = PyUnicode_GET_LENGTH(self);
10613    kind = PyUnicode_KIND(self);
10614    data = PyUnicode_DATA(self);
10615
10616    /* Shortcut for single character strings */
10617    if (length == 1)
10618        return PyBool_FromLong(
10619            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
10620
10621    /* Special case for empty strings */
10622    if (length == 0)
10623        return PyBool_FromLong(0);
10624
10625    for (i = 0; i < length; i++) {
10626        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
10627            return PyBool_FromLong(0);
10628    }
10629    return PyBool_FromLong(1);
10630}
10631
10632int
10633PyUnicode_IsIdentifier(PyObject *self)
10634{
10635    int kind;
10636    void *data;
10637    Py_ssize_t i;
10638    Py_UCS4 first;
10639
10640    if (PyUnicode_READY(self) == -1) {
10641        Py_FatalError("identifier not ready");
10642        return 0;
10643    }
10644
10645    /* Special case for empty strings */
10646    if (PyUnicode_GET_LENGTH(self) == 0)
10647        return 0;
10648    kind = PyUnicode_KIND(self);
10649    data = PyUnicode_DATA(self);
10650
10651    /* PEP 3131 says that the first character must be in
10652       XID_Start and subsequent characters in XID_Continue,
10653       and for the ASCII range, the 2.x rules apply (i.e
10654       start with letters and underscore, continue with
10655       letters, digits, underscore). However, given the current
10656       definition of XID_Start and XID_Continue, it is sufficient
10657       to check just for these, except that _ must be allowed
10658       as starting an identifier.  */
10659    first = PyUnicode_READ(kind, data, 0);
10660    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
10661        return 0;
10662
10663    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
10664        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
10665            return 0;
10666    return 1;
10667}
10668
10669PyDoc_STRVAR(isidentifier__doc__,
10670             "S.isidentifier() -> bool\n\
10671\n\
10672Return True if S is a valid identifier according\n\
10673to the language definition.");
10674
10675static PyObject*
10676unicode_isidentifier(PyObject *self)
10677{
10678    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10679}
10680
10681PyDoc_STRVAR(isprintable__doc__,
10682             "S.isprintable() -> bool\n\
10683\n\
10684Return True if all characters in S are considered\n\
10685printable in repr() or S is empty, False otherwise.");
10686
10687static PyObject*
10688unicode_isprintable(PyObject *self)
10689{
10690    Py_ssize_t i, length;
10691    int kind;
10692    void *data;
10693
10694    if (PyUnicode_READY(self) == -1)
10695        return NULL;
10696    length = PyUnicode_GET_LENGTH(self);
10697    kind = PyUnicode_KIND(self);
10698    data = PyUnicode_DATA(self);
10699
10700    /* Shortcut for single character strings */
10701    if (length == 1)
10702        return PyBool_FromLong(
10703            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
10704
10705    for (i = 0; i < length; i++) {
10706        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
10707            Py_RETURN_FALSE;
10708        }
10709    }
10710    Py_RETURN_TRUE;
10711}
10712
10713PyDoc_STRVAR(join__doc__,
10714             "S.join(iterable) -> str\n\
10715\n\
10716Return a string which is the concatenation of the strings in the\n\
10717iterable.  The separator between elements is S.");
10718
10719static PyObject*
10720unicode_join(PyObject *self, PyObject *data)
10721{
10722    return PyUnicode_Join(self, data);
10723}
10724
10725static Py_ssize_t
10726unicode_length(PyUnicodeObject *self)
10727{
10728    if (PyUnicode_READY(self) == -1)
10729        return -1;
10730    return PyUnicode_GET_LENGTH(self);
10731}
10732
10733PyDoc_STRVAR(ljust__doc__,
10734             "S.ljust(width[, fillchar]) -> str\n\
10735\n\
10736Return S left-justified in a Unicode string of length width. Padding is\n\
10737done using the specified fill character (default is a space).");
10738
10739static PyObject *
10740unicode_ljust(PyUnicodeObject *self, PyObject *args)
10741{
10742    Py_ssize_t width;
10743    Py_UCS4 fillchar = ' ';
10744
10745    if (PyUnicode_READY(self) == -1)
10746        return NULL;
10747
10748    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
10749        return NULL;
10750
10751    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
10752        Py_INCREF(self);
10753        return (PyObject*) self;
10754    }
10755
10756    return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
10757}
10758
10759PyDoc_STRVAR(lower__doc__,
10760             "S.lower() -> str\n\
10761\n\
10762Return a copy of the string S converted to lowercase.");
10763
10764static PyObject*
10765unicode_lower(PyUnicodeObject *self)
10766{
10767    return fixup(self, fixlower);
10768}
10769
/* Strip modes; they double as indexes into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10778
10779/* externally visible for str.strip(unicode) */
10780PyObject *
10781_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10782{
10783    void *data;
10784    int kind;
10785    Py_ssize_t i, j, len;
10786    BLOOM_MASK sepmask;
10787
10788    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10789        return NULL;
10790
10791    kind = PyUnicode_KIND(self);
10792    data = PyUnicode_DATA(self);
10793    len = PyUnicode_GET_LENGTH(self);
10794    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10795                              PyUnicode_DATA(sepobj),
10796                              PyUnicode_GET_LENGTH(sepobj));
10797
10798    i = 0;
10799    if (striptype != RIGHTSTRIP) {
10800        while (i < len &&
10801               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
10802            i++;
10803        }
10804    }
10805
10806    j = len;
10807    if (striptype != LEFTSTRIP) {
10808        do {
10809            j--;
10810        } while (j >= i &&
10811                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
10812        j++;
10813    }
10814
10815    return PyUnicode_Substring((PyObject*)self, i, j);
10816}
10817
10818PyObject*
10819PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10820{
10821    unsigned char *data;
10822    int kind;
10823    Py_ssize_t length;
10824
10825    if (PyUnicode_READY(self) == -1)
10826        return NULL;
10827
10828    end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10829
10830    if (start == 0 && end == PyUnicode_GET_LENGTH(self))
10831    {
10832        if (PyUnicode_CheckExact(self)) {
10833            Py_INCREF(self);
10834            return self;
10835        }
10836        else
10837            return PyUnicode_Copy(self);
10838    }
10839
10840    length = end - start;
10841    if (length == 1)
10842        return unicode_getitem(self, start);
10843
10844    if (start < 0 || end < 0) {
10845        PyErr_SetString(PyExc_IndexError, "string index out of range");
10846        return NULL;
10847    }
10848
10849    kind = PyUnicode_KIND(self);
10850    data = PyUnicode_1BYTE_DATA(self);
10851    return PyUnicode_FromKindAndData(kind,
10852                                     data + PyUnicode_KIND_SIZE(kind, start),
10853                                     length);
10854}
10855
10856static PyObject *
10857do_strip(PyUnicodeObject *self, int striptype)
10858{
10859    int kind;
10860    void *data;
10861    Py_ssize_t len, i, j;
10862
10863    if (PyUnicode_READY(self) == -1)
10864        return NULL;
10865
10866    kind = PyUnicode_KIND(self);
10867    data = PyUnicode_DATA(self);
10868    len = PyUnicode_GET_LENGTH(self);
10869
10870    i = 0;
10871    if (striptype != RIGHTSTRIP) {
10872        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
10873            i++;
10874        }
10875    }
10876
10877    j = len;
10878    if (striptype != LEFTSTRIP) {
10879        do {
10880            j--;
10881        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
10882        j++;
10883    }
10884
10885    return PyUnicode_Substring((PyObject*)self, i, j);
10886}
10887
10888
10889static PyObject *
10890do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10891{
10892    PyObject *sep = NULL;
10893
10894    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10895        return NULL;
10896
10897    if (sep != NULL && sep != Py_None) {
10898        if (PyUnicode_Check(sep))
10899            return _PyUnicode_XStrip(self, striptype, sep);
10900        else {
10901            PyErr_Format(PyExc_TypeError,
10902                         "%s arg must be None or str",
10903                         STRIPNAME(striptype));
10904            return NULL;
10905        }
10906    }
10907
10908    return do_strip(self, striptype);
10909}
10910
10911
10912PyDoc_STRVAR(strip__doc__,
10913             "S.strip([chars]) -> str\n\
10914\n\
10915Return a copy of the string S with leading and trailing\n\
10916whitespace removed.\n\
10917If chars is given and not None, remove characters in chars instead.");
10918
10919static PyObject *
10920unicode_strip(PyUnicodeObject *self, PyObject *args)
10921{
10922    if (PyTuple_GET_SIZE(args) == 0)
10923        return do_strip(self, BOTHSTRIP); /* Common case */
10924    else
10925        return do_argstrip(self, BOTHSTRIP, args);
10926}
10927
10928
10929PyDoc_STRVAR(lstrip__doc__,
10930             "S.lstrip([chars]) -> str\n\
10931\n\
10932Return a copy of the string S with leading whitespace removed.\n\
10933If chars is given and not None, remove characters in chars instead.");
10934
10935static PyObject *
10936unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10937{
10938    if (PyTuple_GET_SIZE(args) == 0)
10939        return do_strip(self, LEFTSTRIP); /* Common case */
10940    else
10941        return do_argstrip(self, LEFTSTRIP, args);
10942}
10943
10944
10945PyDoc_STRVAR(rstrip__doc__,
10946             "S.rstrip([chars]) -> str\n\
10947\n\
10948Return a copy of the string S with trailing whitespace removed.\n\
10949If chars is given and not None, remove characters in chars instead.");
10950
10951static PyObject *
10952unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10953{
10954    if (PyTuple_GET_SIZE(args) == 0)
10955        return do_strip(self, RIGHTSTRIP); /* Common case */
10956    else
10957        return do_argstrip(self, RIGHTSTRIP, args);
10958}
10959
10960
10961static PyObject*
10962unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
10963{
10964    PyUnicodeObject *u;
10965    Py_ssize_t nchars, n;
10966
10967    if (len < 1) {
10968        Py_INCREF(unicode_empty);
10969        return unicode_empty;
10970    }
10971
10972    if (len == 1 && PyUnicode_CheckExact(str)) {
10973        /* no repeat, return original string */
10974        Py_INCREF(str);
10975        return (PyObject*) str;
10976    }
10977
10978    if (PyUnicode_READY(str) == -1)
10979        return NULL;
10980
10981    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
10982        PyErr_SetString(PyExc_OverflowError,
10983                        "repeated string is too long");
10984        return NULL;
10985    }
10986    nchars = len * PyUnicode_GET_LENGTH(str);
10987
10988    u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
10989    if (!u)
10990        return NULL;
10991    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
10992
10993    if (PyUnicode_GET_LENGTH(str) == 1) {
10994        const int kind = PyUnicode_KIND(str);
10995        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10996        void *to = PyUnicode_DATA(u);
10997        if (kind == PyUnicode_1BYTE_KIND)
10998            memset(to, (unsigned char)fill_char, len);
10999        else {
11000            for (n = 0; n < len; ++n)
11001                PyUnicode_WRITE(kind, to, n, fill_char);
11002        }
11003    }
11004    else {
11005        /* number of characters copied this far */
11006        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11007        const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11008        char *to = (char *) PyUnicode_DATA(u);
11009        Py_MEMCPY(to, PyUnicode_DATA(str),
11010                  PyUnicode_GET_LENGTH(str) * char_size);
11011        while (done < nchars) {
11012            n = (done <= nchars-done) ? done : nchars-done;
11013            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11014            done += n;
11015        }
11016    }
11017
11018    return (PyObject*) u;
11019}
11020
11021PyObject *
11022PyUnicode_Replace(PyObject *obj,
11023                  PyObject *subobj,
11024                  PyObject *replobj,
11025                  Py_ssize_t maxcount)
11026{
11027    PyObject *self;
11028    PyObject *str1;
11029    PyObject *str2;
11030    PyObject *result;
11031
11032    self = PyUnicode_FromObject(obj);
11033    if (self == NULL || PyUnicode_READY(self) == -1)
11034        return NULL;
11035    str1 = PyUnicode_FromObject(subobj);
11036    if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11037        Py_DECREF(self);
11038        return NULL;
11039    }
11040    str2 = PyUnicode_FromObject(replobj);
11041    if (str2 == NULL || PyUnicode_READY(str2)) {
11042        Py_DECREF(self);
11043        Py_DECREF(str1);
11044        return NULL;
11045    }
11046    result = replace(self, str1, str2, maxcount);
11047    Py_DECREF(self);
11048    Py_DECREF(str1);
11049    Py_DECREF(str2);
11050    return result;
11051}
11052
11053PyDoc_STRVAR(replace__doc__,
11054             "S.replace(old, new[, count]) -> str\n\
11055\n\
11056Return a copy of S with all occurrences of substring\n\
11057old replaced by new.  If the optional argument count is\n\
11058given, only the first count occurrences are replaced.");
11059
11060static PyObject*
11061unicode_replace(PyObject *self, PyObject *args)
11062{
11063    PyObject *str1;
11064    PyObject *str2;
11065    Py_ssize_t maxcount = -1;
11066    PyObject *result;
11067
11068    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11069        return NULL;
11070    if (!PyUnicode_READY(self) == -1)
11071        return NULL;
11072    str1 = PyUnicode_FromObject(str1);
11073    if (str1 == NULL || PyUnicode_READY(str1) == -1)
11074        return NULL;
11075    str2 = PyUnicode_FromObject(str2);
11076    if (str2 == NULL || PyUnicode_READY(str2) == -1) {
11077        Py_DECREF(str1);
11078        return NULL;
11079    }
11080
11081    result = replace(self, str1, str2, maxcount);
11082
11083    Py_DECREF(str1);
11084    Py_DECREF(str2);
11085    return result;
11086}
11087
11088static PyObject *
11089unicode_repr(PyObject *unicode)
11090{
11091    PyObject *repr;
11092    Py_ssize_t isize;
11093    Py_ssize_t osize, squote, dquote, i, o;
11094    Py_UCS4 max, quote;
11095    int ikind, okind;
11096    void *idata, *odata;
11097
11098    if (PyUnicode_READY(unicode) == -1)
11099        return NULL;
11100
11101    isize = PyUnicode_GET_LENGTH(unicode);
11102    idata = PyUnicode_DATA(unicode);
11103
11104    /* Compute length of output, quote characters, and
11105       maximum character */
11106    osize = 2; /* quotes */
11107    max = 127;
11108    squote = dquote = 0;
11109    ikind = PyUnicode_KIND(unicode);
11110    for (i = 0; i < isize; i++) {
11111        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11112        switch (ch) {
11113        case '\'': squote++; osize++; break;
11114        case '"':  dquote++; osize++; break;
11115        case '\\': case '\t': case '\r': case '\n':
11116            osize += 2; break;
11117        default:
11118            /* Fast-path ASCII */
11119            if (ch < ' ' || ch == 0x7f)
11120                osize += 4; /* \xHH */
11121            else if (ch < 0x7f)
11122                osize++;
11123            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11124                osize++;
11125                max = ch > max ? ch : max;
11126            }
11127            else if (ch < 0x100)
11128                osize += 4; /* \xHH */
11129            else if (ch < 0x10000)
11130                osize += 6; /* \uHHHH */
11131            else
11132                osize += 10; /* \uHHHHHHHH */
11133        }
11134    }
11135
11136    quote = '\'';
11137    if (squote) {
11138        if (dquote)
11139            /* Both squote and dquote present. Use squote,
11140               and escape them */
11141            osize += squote;
11142        else
11143            quote = '"';
11144    }
11145
11146    repr = PyUnicode_New(osize, max);
11147    if (repr == NULL)
11148        return NULL;
11149    okind = PyUnicode_KIND(repr);
11150    odata = PyUnicode_DATA(repr);
11151
11152    PyUnicode_WRITE(okind, odata, 0, quote);
11153    PyUnicode_WRITE(okind, odata, osize-1, quote);
11154
11155    for (i = 0, o = 1; i < isize; i++) {
11156        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11157
11158        /* Escape quotes and backslashes */
11159        if ((ch == quote) || (ch == '\\')) {
11160            PyUnicode_WRITE(okind, odata, o++, '\\');
11161            PyUnicode_WRITE(okind, odata, o++, ch);
11162            continue;
11163        }
11164
11165        /* Map special whitespace to '\t', \n', '\r' */
11166        if (ch == '\t') {
11167            PyUnicode_WRITE(okind, odata, o++, '\\');
11168            PyUnicode_WRITE(okind, odata, o++, 't');
11169        }
11170        else if (ch == '\n') {
11171            PyUnicode_WRITE(okind, odata, o++, '\\');
11172            PyUnicode_WRITE(okind, odata, o++, 'n');
11173        }
11174        else if (ch == '\r') {
11175            PyUnicode_WRITE(okind, odata, o++, '\\');
11176            PyUnicode_WRITE(okind, odata, o++, 'r');
11177        }
11178
11179        /* Map non-printable US ASCII to '\xhh' */
11180        else if (ch < ' ' || ch == 0x7F) {
11181            PyUnicode_WRITE(okind, odata, o++, '\\');
11182            PyUnicode_WRITE(okind, odata, o++, 'x');
11183            PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11184            PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
11185        }
11186
11187        /* Copy ASCII characters as-is */
11188        else if (ch < 0x7F) {
11189            PyUnicode_WRITE(okind, odata, o++, ch);
11190        }
11191
11192        /* Non-ASCII characters */
11193        else {
11194            /* Map Unicode whitespace and control characters
11195               (categories Z* and C* except ASCII space)
11196            */
11197            if (!Py_UNICODE_ISPRINTABLE(ch)) {
11198                /* Map 8-bit characters to '\xhh' */
11199                if (ch <= 0xff) {
11200                    PyUnicode_WRITE(okind, odata, o++, '\\');
11201                    PyUnicode_WRITE(okind, odata, o++, 'x');
11202                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11203                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
11204                }
11205                /* Map 21-bit characters to '\U00xxxxxx' */
11206                else if (ch >= 0x10000) {
11207                    PyUnicode_WRITE(okind, odata, o++, '\\');
11208                    PyUnicode_WRITE(okind, odata, o++, 'U');
11209                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11210                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11211                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11212                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11213                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11214                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11215                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11216                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
11217                }
11218                /* Map 16-bit characters to '\uxxxx' */
11219                else {
11220                    PyUnicode_WRITE(okind, odata, o++, '\\');
11221                    PyUnicode_WRITE(okind, odata, o++, 'u');
11222                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11223                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11224                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11225                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
11226                }
11227            }
11228            /* Copy characters as-is */
11229            else {
11230                PyUnicode_WRITE(okind, odata, o++, ch);
11231            }
11232        }
11233    }
11234    /* Closing quote already added at the beginning */
11235    return repr;
11236}
11237
11238PyDoc_STRVAR(rfind__doc__,
11239             "S.rfind(sub[, start[, end]]) -> int\n\
11240\n\
11241Return the highest index in S where substring sub is found,\n\
11242such that sub is contained within S[start:end].  Optional\n\
11243arguments start and end are interpreted as in slice notation.\n\
11244\n\
11245Return -1 on failure.");
11246
11247static PyObject *
11248unicode_rfind(PyObject *self, PyObject *args)
11249{
11250    PyUnicodeObject *substring;
11251    Py_ssize_t start;
11252    Py_ssize_t end;
11253    Py_ssize_t result;
11254
11255    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11256                                            &start, &end))
11257        return NULL;
11258
11259    if (PyUnicode_READY(self) == -1)
11260        return NULL;
11261    if (PyUnicode_READY(substring) == -1)
11262        return NULL;
11263
11264    result = any_find_slice(
11265        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11266        self, (PyObject*)substring, start, end
11267        );
11268
11269    Py_DECREF(substring);
11270
11271    if (result == -2)
11272        return NULL;
11273
11274    return PyLong_FromSsize_t(result);
11275}
11276
11277PyDoc_STRVAR(rindex__doc__,
11278             "S.rindex(sub[, start[, end]]) -> int\n\
11279\n\
11280Like S.rfind() but raise ValueError when the substring is not found.");
11281
11282static PyObject *
11283unicode_rindex(PyObject *self, PyObject *args)
11284{
11285    PyUnicodeObject *substring;
11286    Py_ssize_t start;
11287    Py_ssize_t end;
11288    Py_ssize_t result;
11289
11290    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11291                                            &start, &end))
11292        return NULL;
11293
11294    if (PyUnicode_READY(self) == -1)
11295        return NULL;
11296    if (PyUnicode_READY(substring) == -1)
11297        return NULL;
11298
11299    result = any_find_slice(
11300        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11301        self, (PyObject*)substring, start, end
11302        );
11303
11304    Py_DECREF(substring);
11305
11306    if (result == -2)
11307        return NULL;
11308
11309    if (result < 0) {
11310        PyErr_SetString(PyExc_ValueError, "substring not found");
11311        return NULL;
11312    }
11313
11314    return PyLong_FromSsize_t(result);
11315}
11316
11317PyDoc_STRVAR(rjust__doc__,
11318             "S.rjust(width[, fillchar]) -> str\n\
11319\n\
11320Return S right-justified in a string of length width. Padding is\n\
11321done using the specified fill character (default is a space).");
11322
11323static PyObject *
11324unicode_rjust(PyUnicodeObject *self, PyObject *args)
11325{
11326    Py_ssize_t width;
11327    Py_UCS4 fillchar = ' ';
11328
11329    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
11330        return NULL;
11331
11332    if (PyUnicode_READY(self) == -1)
11333        return NULL;
11334
11335    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11336        Py_INCREF(self);
11337        return (PyObject*) self;
11338    }
11339
11340    return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
11341}
11342
11343PyObject *
11344PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11345{
11346    PyObject *result;
11347
11348    s = PyUnicode_FromObject(s);
11349    if (s == NULL)
11350        return NULL;
11351    if (sep != NULL) {
11352        sep = PyUnicode_FromObject(sep);
11353        if (sep == NULL) {
11354            Py_DECREF(s);
11355            return NULL;
11356        }
11357    }
11358
11359    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11360
11361    Py_DECREF(s);
11362    Py_XDECREF(sep);
11363    return result;
11364}
11365
11366PyDoc_STRVAR(split__doc__,
11367             "S.split([sep[, maxsplit]]) -> list of strings\n\
11368\n\
11369Return a list of the words in S, using sep as the\n\
11370delimiter string.  If maxsplit is given, at most maxsplit\n\
11371splits are done. If sep is not specified or is None, any\n\
11372whitespace string is a separator and empty strings are\n\
11373removed from the result.");
11374
11375static PyObject*
11376unicode_split(PyUnicodeObject *self, PyObject *args)
11377{
11378    PyObject *substring = Py_None;
11379    Py_ssize_t maxcount = -1;
11380
11381    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
11382        return NULL;
11383
11384    if (substring == Py_None)
11385        return split(self, NULL, maxcount);
11386    else if (PyUnicode_Check(substring))
11387        return split(self, (PyUnicodeObject *)substring, maxcount);
11388    else
11389        return PyUnicode_Split((PyObject *)self, substring, maxcount);
11390}
11391
11392PyObject *
11393PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11394{
11395    PyObject* str_obj;
11396    PyObject* sep_obj;
11397    PyObject* out;
11398    int kind1, kind2, kind;
11399    void *buf1 = NULL, *buf2 = NULL;
11400    Py_ssize_t len1, len2;
11401
11402    str_obj = PyUnicode_FromObject(str_in);
11403    if (!str_obj || PyUnicode_READY(str_obj) == -1)
11404        return NULL;
11405    sep_obj = PyUnicode_FromObject(sep_in);
11406    if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
11407        Py_DECREF(str_obj);
11408        return NULL;
11409    }
11410
11411    kind1 = PyUnicode_KIND(str_in);
11412    kind2 = PyUnicode_KIND(sep_obj);
11413    kind = kind1 > kind2 ? kind1 : kind2;
11414    buf1 = PyUnicode_DATA(str_in);
11415    if (kind1 != kind)
11416        buf1 = _PyUnicode_AsKind(str_in, kind);
11417    if (!buf1)
11418        goto onError;
11419    buf2 = PyUnicode_DATA(sep_obj);
11420    if (kind2 != kind)
11421        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11422    if (!buf2)
11423        goto onError;
11424    len1 = PyUnicode_GET_LENGTH(str_obj);
11425    len2 = PyUnicode_GET_LENGTH(sep_obj);
11426
11427    switch(PyUnicode_KIND(str_in)) {
11428    case PyUnicode_1BYTE_KIND:
11429        out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11430        break;
11431    case PyUnicode_2BYTE_KIND:
11432        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11433        break;
11434    case PyUnicode_4BYTE_KIND:
11435        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11436        break;
11437    default:
11438        assert(0);
11439        out = 0;
11440    }
11441
11442    Py_DECREF(sep_obj);
11443    Py_DECREF(str_obj);
11444    if (kind1 != kind)
11445        PyMem_Free(buf1);
11446    if (kind2 != kind)
11447        PyMem_Free(buf2);
11448
11449    return out;
11450  onError:
11451    Py_DECREF(sep_obj);
11452    Py_DECREF(str_obj);
11453    if (kind1 != kind && buf1)
11454        PyMem_Free(buf1);
11455    if (kind2 != kind && buf2)
11456        PyMem_Free(buf2);
11457    return NULL;
11458}
11459
11460
11461PyObject *
11462PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11463{
11464    PyObject* str_obj;
11465    PyObject* sep_obj;
11466    PyObject* out;
11467    int kind1, kind2, kind;
11468    void *buf1 = NULL, *buf2 = NULL;
11469    Py_ssize_t len1, len2;
11470
11471    str_obj = PyUnicode_FromObject(str_in);
11472    if (!str_obj)
11473        return NULL;
11474    sep_obj = PyUnicode_FromObject(sep_in);
11475    if (!sep_obj) {
11476        Py_DECREF(str_obj);
11477        return NULL;
11478    }
11479
11480    kind1 = PyUnicode_KIND(str_in);
11481    kind2 = PyUnicode_KIND(sep_obj);
11482    kind = Py_MAX(kind1, kind2);
11483    buf1 = PyUnicode_DATA(str_in);
11484    if (kind1 != kind)
11485        buf1 = _PyUnicode_AsKind(str_in, kind);
11486    if (!buf1)
11487        goto onError;
11488    buf2 = PyUnicode_DATA(sep_obj);
11489    if (kind2 != kind)
11490        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11491    if (!buf2)
11492        goto onError;
11493    len1 = PyUnicode_GET_LENGTH(str_obj);
11494    len2 = PyUnicode_GET_LENGTH(sep_obj);
11495
11496    switch(PyUnicode_KIND(str_in)) {
11497    case PyUnicode_1BYTE_KIND:
11498        out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11499        break;
11500    case PyUnicode_2BYTE_KIND:
11501        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11502        break;
11503    case PyUnicode_4BYTE_KIND:
11504        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11505        break;
11506    default:
11507        assert(0);
11508        out = 0;
11509    }
11510
11511    Py_DECREF(sep_obj);
11512    Py_DECREF(str_obj);
11513    if (kind1 != kind)
11514        PyMem_Free(buf1);
11515    if (kind2 != kind)
11516        PyMem_Free(buf2);
11517
11518    return out;
11519  onError:
11520    Py_DECREF(sep_obj);
11521    Py_DECREF(str_obj);
11522    if (kind1 != kind && buf1)
11523        PyMem_Free(buf1);
11524    if (kind2 != kind && buf2)
11525        PyMem_Free(buf2);
11526    return NULL;
11527}
11528
11529PyDoc_STRVAR(partition__doc__,
11530             "S.partition(sep) -> (head, sep, tail)\n\
11531\n\
11532Search for the separator sep in S, and return the part before it,\n\
11533the separator itself, and the part after it.  If the separator is not\n\
11534found, return S and two empty strings.");
11535
11536static PyObject*
11537unicode_partition(PyUnicodeObject *self, PyObject *separator)
11538{
11539    return PyUnicode_Partition((PyObject *)self, separator);
11540}
11541
11542PyDoc_STRVAR(rpartition__doc__,
11543             "S.rpartition(sep) -> (head, sep, tail)\n\
11544\n\
11545Search for the separator sep in S, starting at the end of S, and return\n\
11546the part before it, the separator itself, and the part after it.  If the\n\
11547separator is not found, return two empty strings and S.");
11548
11549static PyObject*
11550unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11551{
11552    return PyUnicode_RPartition((PyObject *)self, separator);
11553}
11554
11555PyObject *
11556PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11557{
11558    PyObject *result;
11559
11560    s = PyUnicode_FromObject(s);
11561    if (s == NULL)
11562        return NULL;
11563    if (sep != NULL) {
11564        sep = PyUnicode_FromObject(sep);
11565        if (sep == NULL) {
11566            Py_DECREF(s);
11567            return NULL;
11568        }
11569    }
11570
11571    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11572
11573    Py_DECREF(s);
11574    Py_XDECREF(sep);
11575    return result;
11576}
11577
11578PyDoc_STRVAR(rsplit__doc__,
11579             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
11580\n\
11581Return a list of the words in S, using sep as the\n\
11582delimiter string, starting at the end of the string and\n\
11583working to the front.  If maxsplit is given, at most maxsplit\n\
11584splits are done. If sep is not specified, any whitespace string\n\
11585is a separator.");
11586
11587static PyObject*
11588unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11589{
11590    PyObject *substring = Py_None;
11591    Py_ssize_t maxcount = -1;
11592
11593    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
11594        return NULL;
11595
11596    if (substring == Py_None)
11597        return rsplit(self, NULL, maxcount);
11598    else if (PyUnicode_Check(substring))
11599        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
11600    else
11601        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
11602}
11603
11604PyDoc_STRVAR(splitlines__doc__,
11605             "S.splitlines([keepends]) -> list of strings\n\
11606\n\
11607Return a list of the lines in S, breaking at line boundaries.\n\
11608Line breaks are not included in the resulting list unless keepends\n\
11609is given and true.");
11610
11611static PyObject*
11612unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
11613{
11614    static char *kwlist[] = {"keepends", 0};
11615    int keepends = 0;
11616
11617    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11618                                     kwlist, &keepends))
11619        return NULL;
11620
11621    return PyUnicode_Splitlines((PyObject *)self, keepends);
11622}
11623
11624static
11625PyObject *unicode_str(PyObject *self)
11626{
11627    if (PyUnicode_CheckExact(self)) {
11628        Py_INCREF(self);
11629        return self;
11630    } else
11631        /* Subtype -- return genuine unicode string with the same value. */
11632        return PyUnicode_Copy(self);
11633}
11634
11635PyDoc_STRVAR(swapcase__doc__,
11636             "S.swapcase() -> str\n\
11637\n\
11638Return a copy of S with uppercase characters converted to lowercase\n\
11639and vice versa.");
11640
11641static PyObject*
11642unicode_swapcase(PyUnicodeObject *self)
11643{
11644    return fixup(self, fixswapcase);
11645}
11646
11647PyDoc_STRVAR(maketrans__doc__,
11648             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
11649\n\
11650Return a translation table usable for str.translate().\n\
11651If there is only one argument, it must be a dictionary mapping Unicode\n\
11652ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
11653Character keys will be then converted to ordinals.\n\
11654If there are two arguments, they must be strings of equal length, and\n\
11655in the resulting dictionary, each character in x will be mapped to the\n\
11656character at the same position in y. If there is a third argument, it\n\
11657must be a string, whose characters will be mapped to None in the result.");
11658
11659static PyObject*
11660unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11661{
11662    PyObject *x, *y = NULL, *z = NULL;
11663    PyObject *new = NULL, *key, *value;
11664    Py_ssize_t i = 0;
11665    int res;
11666
11667    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11668        return NULL;
11669    new = PyDict_New();
11670    if (!new)
11671        return NULL;
11672    if (y != NULL) {
11673        int x_kind, y_kind, z_kind;
11674        void *x_data, *y_data, *z_data;
11675
11676        /* x must be a string too, of equal length */
11677        if (!PyUnicode_Check(x)) {
11678            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11679                            "be a string if there is a second argument");
11680            goto err;
11681        }
11682        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
11683            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11684                            "arguments must have equal length");
11685            goto err;
11686        }
11687        /* create entries for translating chars in x to those in y */
11688        x_kind = PyUnicode_KIND(x);
11689        y_kind = PyUnicode_KIND(y);
11690        x_data = PyUnicode_DATA(x);
11691        y_data = PyUnicode_DATA(y);
11692        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11693            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11694            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
11695            if (!key || !value)
11696                goto err;
11697            res = PyDict_SetItem(new, key, value);
11698            Py_DECREF(key);
11699            Py_DECREF(value);
11700            if (res < 0)
11701                goto err;
11702        }
11703        /* create entries for deleting chars in z */
11704        if (z != NULL) {
11705            z_kind = PyUnicode_KIND(z);
11706            z_data = PyUnicode_DATA(z);
11707            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
11708                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
11709                if (!key)
11710                    goto err;
11711                res = PyDict_SetItem(new, key, Py_None);
11712                Py_DECREF(key);
11713                if (res < 0)
11714                    goto err;
11715            }
11716        }
11717    } else {
11718        int kind;
11719        void *data;
11720
11721        /* x must be a dict */
11722        if (!PyDict_CheckExact(x)) {
11723            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11724                            "to maketrans it must be a dict");
11725            goto err;
11726        }
11727        /* copy entries into the new dict, converting string keys to int keys */
11728        while (PyDict_Next(x, &i, &key, &value)) {
11729            if (PyUnicode_Check(key)) {
11730                /* convert string keys to integer keys */
11731                PyObject *newkey;
11732                if (PyUnicode_GET_SIZE(key) != 1) {
11733                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
11734                                    "table must be of length 1");
11735                    goto err;
11736                }
11737                kind = PyUnicode_KIND(key);
11738                data = PyUnicode_DATA(key);
11739                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
11740                if (!newkey)
11741                    goto err;
11742                res = PyDict_SetItem(new, newkey, value);
11743                Py_DECREF(newkey);
11744                if (res < 0)
11745                    goto err;
11746            } else if (PyLong_Check(key)) {
11747                /* just keep integer keys */
11748                if (PyDict_SetItem(new, key, value) < 0)
11749                    goto err;
11750            } else {
11751                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11752                                "be strings or integers");
11753                goto err;
11754            }
11755        }
11756    }
11757    return new;
11758  err:
11759    Py_DECREF(new);
11760    return NULL;
11761}
11762
11763PyDoc_STRVAR(translate__doc__,
11764             "S.translate(table) -> str\n\
11765\n\
11766Return a copy of the string S, where all characters have been mapped\n\
11767through the given translation table, which must be a mapping of\n\
11768Unicode ordinals to Unicode ordinals, strings, or None.\n\
11769Unmapped characters are left untouched. Characters mapped to None\n\
11770are deleted.");
11771
11772static PyObject*
11773unicode_translate(PyObject *self, PyObject *table)
11774{
11775    return _PyUnicode_TranslateCharmap(self, table, "ignore");
11776}
11777
11778PyDoc_STRVAR(upper__doc__,
11779             "S.upper() -> str\n\
11780\n\
11781Return a copy of S converted to uppercase.");
11782
11783static PyObject*
11784unicode_upper(PyUnicodeObject *self)
11785{
11786    return fixup(self, fixupper);
11787}
11788
11789PyDoc_STRVAR(zfill__doc__,
11790             "S.zfill(width) -> str\n\
11791\n\
11792Pad a numeric string S with zeros on the left, to fill a field\n\
11793of the specified width. The string S is never truncated.");
11794
11795static PyObject *
11796unicode_zfill(PyUnicodeObject *self, PyObject *args)
11797{
11798    Py_ssize_t fill;
11799    PyUnicodeObject *u;
11800    Py_ssize_t width;
11801    int kind;
11802    void *data;
11803    Py_UCS4 chr;
11804
11805    if (PyUnicode_READY(self) == -1)
11806        return NULL;
11807
11808    if (!PyArg_ParseTuple(args, "n:zfill", &width))
11809        return NULL;
11810
11811    if (PyUnicode_GET_LENGTH(self) >= width) {
11812        if (PyUnicode_CheckExact(self)) {
11813            Py_INCREF(self);
11814            return (PyObject*) self;
11815        }
11816        else
11817            return PyUnicode_Copy((PyObject*)self);
11818    }
11819
11820    fill = width - _PyUnicode_LENGTH(self);
11821
11822    u = pad(self, fill, 0, '0');
11823
11824    if (u == NULL)
11825        return NULL;
11826
11827    kind = PyUnicode_KIND(u);
11828    data = PyUnicode_DATA(u);
11829    chr = PyUnicode_READ(kind, data, fill);
11830
11831    if (chr == '+' || chr == '-') {
11832        /* move sign to beginning of string */
11833        PyUnicode_WRITE(kind, data, 0, chr);
11834        PyUnicode_WRITE(kind, data, fill, '0');
11835    }
11836
11837    return (PyObject*) u;
11838}
11839
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11847
11848PyDoc_STRVAR(startswith__doc__,
11849             "S.startswith(prefix[, start[, end]]) -> bool\n\
11850\n\
11851Return True if S starts with the specified prefix, False otherwise.\n\
11852With optional start, test S beginning at that position.\n\
11853With optional end, stop comparing S at that position.\n\
11854prefix can also be a tuple of strings to try.");
11855
11856static PyObject *
11857unicode_startswith(PyUnicodeObject *self,
11858                   PyObject *args)
11859{
11860    PyObject *subobj;
11861    PyUnicodeObject *substring;
11862    Py_ssize_t start = 0;
11863    Py_ssize_t end = PY_SSIZE_T_MAX;
11864    int result;
11865
11866    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
11867        return NULL;
11868    if (PyTuple_Check(subobj)) {
11869        Py_ssize_t i;
11870        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11871            substring = (PyUnicodeObject *)PyUnicode_FromObject(
11872                PyTuple_GET_ITEM(subobj, i));
11873            if (substring == NULL)
11874                return NULL;
11875            result = tailmatch(self, substring, start, end, -1);
11876            Py_DECREF(substring);
11877            if (result) {
11878                Py_RETURN_TRUE;
11879            }
11880        }
11881        /* nothing matched */
11882        Py_RETURN_FALSE;
11883    }
11884    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
11885    if (substring == NULL) {
11886        if (PyErr_ExceptionMatches(PyExc_TypeError))
11887            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11888                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
11889        return NULL;
11890    }
11891    result = tailmatch(self, substring, start, end, -1);
11892    Py_DECREF(substring);
11893    return PyBool_FromLong(result);
11894}
11895
11896
/* Docstring used by the "endswith" entry of the unicode_methods table. */
11897PyDoc_STRVAR(endswith__doc__,
11898             "S.endswith(suffix[, start[, end]]) -> bool\n\
11899\n\
11900Return True if S ends with the specified suffix, False otherwise.\n\
11901With optional start, test S beginning at that position.\n\
11902With optional end, stop comparing S at that position.\n\
11903suffix can also be a tuple of strings to try.");
11904
11905static PyObject *
11906unicode_endswith(PyUnicodeObject *self,
11907                 PyObject *args)
11908{
11909    PyObject *subobj;
11910    PyUnicodeObject *substring;
11911    Py_ssize_t start = 0;
11912    Py_ssize_t end = PY_SSIZE_T_MAX;
11913    int result;
11914
11915    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
11916        return NULL;
11917    if (PyTuple_Check(subobj)) {
11918        Py_ssize_t i;
11919        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11920            substring = (PyUnicodeObject *)PyUnicode_FromObject(
11921                PyTuple_GET_ITEM(subobj, i));
11922            if (substring == NULL)
11923                return NULL;
11924            result = tailmatch(self, substring, start, end, +1);
11925            Py_DECREF(substring);
11926            if (result) {
11927                Py_RETURN_TRUE;
11928            }
11929        }
11930        Py_RETURN_FALSE;
11931    }
11932    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
11933    if (substring == NULL) {
11934        if (PyErr_ExceptionMatches(PyExc_TypeError))
11935            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11936                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
11937        return NULL;
11938    }
11939    result = tailmatch(self, substring, start, end, +1);
11940    Py_DECREF(substring);
11941    return PyBool_FromLong(result);
11942}
11943
11944#include "stringlib/unicode_format.h"
11945
/* Docstrings used by the "format" and "format_map" entries of the
   unicode_methods table, which bind them to do_string_format and
   do_string_format_map respectively. */
11946PyDoc_STRVAR(format__doc__,
11947             "S.format(*args, **kwargs) -> str\n\
11948\n\
11949Return a formatted version of S, using substitutions from args and kwargs.\n\
11950The substitutions are identified by braces ('{' and '}').");
11951
11952PyDoc_STRVAR(format_map__doc__,
11953             "S.format_map(mapping) -> str\n\
11954\n\
11955Return a formatted version of S, using substitutions from mapping.\n\
11956The substitutions are identified by braces ('{' and '}').");
11957
11958static PyObject *
11959unicode__format__(PyObject* self, PyObject* args)
11960{
11961    PyObject *format_spec;
11962
11963    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11964        return NULL;
11965
11966    return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11967                                     PyUnicode_GET_LENGTH(format_spec));
11968}
11969
/* Docstring used by the "__format__" entry of the unicode_methods table. */
11970PyDoc_STRVAR(p_format__doc__,
11971             "S.__format__(format_spec) -> str\n\
11972\n\
11973Return a formatted version of S as described by format_spec.");
11974
11975static PyObject *
11976unicode__sizeof__(PyUnicodeObject *v)
11977{
11978    Py_ssize_t size;
11979
11980    /* If it's a compact object, account for base structure +
11981       character data. */
11982    if (PyUnicode_IS_COMPACT_ASCII(v))
11983        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11984    else if (PyUnicode_IS_COMPACT(v))
11985        size = sizeof(PyCompactUnicodeObject) +
11986            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11987    else {
11988        /* If it is a two-block object, account for base object, and
11989           for character block if present. */
11990        size = sizeof(PyUnicodeObject);
11991        if (_PyUnicode_DATA_ANY(v))
11992            size += (PyUnicode_GET_LENGTH(v) + 1) *
11993                PyUnicode_CHARACTER_SIZE(v);
11994    }
11995    /* If the wstr pointer is present, account for it unless it is shared
11996       with the data pointer. Check if the data is not shared. */
11997    if (_PyUnicode_WSTR(v) &&
11998        (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
11999        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12000    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12001        size += PyUnicode_UTF8_LENGTH(v) + 1;
12002
12003    return PyLong_FromSsize_t(size);
12004}
12005
/* Docstring used by the "__sizeof__" entry of the unicode_methods table. */
12006PyDoc_STRVAR(sizeof__doc__,
12007             "S.__sizeof__() -> size of S in memory, in bytes");
12008
12009static PyObject *
12010unicode_getnewargs(PyObject *v)
12011{
12012    PyObject *copy = PyUnicode_Copy(v);
12013    if (!copy)
12014        return NULL;
12015    return Py_BuildValue("(N)", copy);
12016}
12017
/* Method table for unicode objects.  Each entry is
   {method name, C implementation, METH_* calling-convention flags,
   docstring}; the list ends with a {NULL, NULL} sentinel.  Entries inside
   "#if 0" are compiled out. */
12018static PyMethodDef unicode_methods[] = {
12019
12020    /* Order is according to common usage: often used methods should
12021       appear first, since lookup is done sequentially. */
12022
12023    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
12024    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12025    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
12026    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
12027    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12028    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12029    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12030    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12031    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12032    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12033    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
12034    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
12035    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12036    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12037    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
12038    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
12039    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12040    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12041    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
12042    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
12043    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
12044    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
12045    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
12046    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12047    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12048    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12049    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12050    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12051    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12052    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12053    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12054    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12055    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12056    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12057    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12058    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12059    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
12060    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
12061    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
12062    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
12063    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
12064    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
12065    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry flagged METH_STATIC. */
12066    {"maketrans", (PyCFunction) unicode_maketrans,
12067     METH_VARARGS | METH_STATIC, maketrans__doc__},
12068    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
12069#if 0
12070    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
12071#endif
12072
12073#if 0
12074    /* These methods are just used for debugging the implementation. */
12075    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
12076#endif
12077
    /* __getnewargs__ is registered without a docstring. */
12078    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    /* End-of-table sentinel. */
12079    {NULL, NULL}
12080};
12081
12082static PyObject *
12083unicode_mod(PyObject *v, PyObject *w)
12084{
12085    if (!PyUnicode_Check(v))
12086        Py_RETURN_NOTIMPLEMENTED;
12087    return PyUnicode_Format(v, w);
12088}
12089
/* Number-protocol slots for unicode objects.  Only nb_remainder is filled
   in (with unicode_mod, the '%' formatting operator); the three slots
   before it are explicitly 0 and the initializer stops after nb_remainder,
   so any remaining members of the struct are zero-initialized. */
12090static PyNumberMethods unicode_as_number = {
12091    0,              /*nb_add*/
12092    0,              /*nb_subtract*/
12093    0,              /*nb_multiply*/
12094    unicode_mod,            /*nb_remainder*/
12095};
12096
/* Sequence-protocol slots for unicode objects: length, concatenation,
   repetition, item access and containment are provided.  The slice and
   assignment slots (sq_slice, sq_ass_item, sq_ass_slice) are left 0. */
12097static PySequenceMethods unicode_as_sequence = {
12098    (lenfunc) unicode_length,       /* sq_length */
12099    PyUnicode_Concat,           /* sq_concat */
12100    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
12101    (ssizeargfunc) unicode_getitem,     /* sq_item */
12102    0,                  /* sq_slice */
12103    0,                  /* sq_ass_item */
12104    0,                  /* sq_ass_slice */
12105    PyUnicode_Contains,         /* sq_contains */
12106};
12107
12108static PyObject*
12109unicode_subscript(PyUnicodeObject* self, PyObject* item)
12110{
12111    if (PyUnicode_READY(self) == -1)
12112        return NULL;
12113
12114    if (PyIndex_Check(item)) {
12115        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
12116        if (i == -1 && PyErr_Occurred())
12117            return NULL;
12118        if (i < 0)
12119            i += PyUnicode_GET_LENGTH(self);
12120        return unicode_getitem((PyObject*)self, i);
12121    } else if (PySlice_Check(item)) {
12122        Py_ssize_t start, stop, step, slicelength, cur, i;
12123        const Py_UNICODE* source_buf;
12124        Py_UNICODE* result_buf;
12125        PyObject* result;
12126
12127        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
12128                                 &start, &stop, &step, &slicelength) < 0) {
12129            return NULL;
12130        }
12131
12132        if (slicelength <= 0) {
12133            return PyUnicode_New(0, 0);
12134        } else if (start == 0 && step == 1 &&
12135                   slicelength == PyUnicode_GET_LENGTH(self) &&
12136                   PyUnicode_CheckExact(self)) {
12137            Py_INCREF(self);
12138            return (PyObject *)self;
12139        } else if (step == 1) {
12140            return PyUnicode_Substring((PyObject*)self,
12141                                       start, start + slicelength);
12142        } else {
12143            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
12144            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12145                                                       sizeof(Py_UNICODE));
12146
12147            if (result_buf == NULL)
12148                return PyErr_NoMemory();
12149
12150            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12151                result_buf[i] = source_buf[cur];
12152            }
12153
12154            result = PyUnicode_FromUnicode(result_buf, slicelength);
12155            PyObject_FREE(result_buf);
12156            return result;
12157        }
12158    } else {
12159        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12160        return NULL;
12161    }
12162}
12163
/* Mapping-protocol slots for unicode objects: length and subscripting
   (unicode_subscript handles both index objects and slices).
   mp_ass_subscript is 0, so no item assignment is provided here. */
12164static PyMappingMethods unicode_as_mapping = {
12165    (lenfunc)unicode_length,        /* mp_length */
12166    (binaryfunc)unicode_subscript,  /* mp_subscript */
12167    (objobjargproc)0,           /* mp_ass_subscript */
12168};
12169
12170
12171/* Helpers for PyUnicode_Format() */
12172
12173static PyObject *
12174getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
12175{
12176    Py_ssize_t argidx = *p_argidx;
12177    if (argidx < arglen) {
12178        (*p_argidx)++;
12179        if (arglen < 0)
12180            return args;
12181        else
12182            return PyTuple_GetItem(args, argidx);
12183    }
12184    PyErr_SetString(PyExc_TypeError,
12185                    "not enough arguments for format string");
12186    return NULL;
12187}
12188
12189/* Returns a new reference to a PyUnicode object, or NULL on failure. */
12190
12191static PyObject *
12192formatfloat(PyObject *v, int flags, int prec, int type)
12193{
12194    char *p;
12195    PyObject *result;
12196    double x;
12197
12198    x = PyFloat_AsDouble(v);
12199    if (x == -1.0 && PyErr_Occurred())
12200        return NULL;
12201
12202    if (prec < 0)
12203        prec = 6;
12204
12205    p = PyOS_double_to_string(x, type, prec,
12206                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
12207    if (p == NULL)
12208        return NULL;
12209    result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
12210    PyMem_Free(p);
12211    return result;
12212}
12213
12214static PyObject*
12215formatlong(PyObject *val, int flags, int prec, int type)
12216{
12217    char *buf;
12218    int len;
12219    PyObject *str; /* temporary string object. */
12220    PyObject *result;
12221
12222    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12223    if (!str)
12224        return NULL;
12225    result = PyUnicode_DecodeASCII(buf, len, NULL);
12226    Py_DECREF(str);
12227    return result;
12228}
12229
12230static int
12231formatchar(Py_UCS4 *buf,
12232           size_t buflen,
12233           PyObject *v)
12234{
12235    /* presume that the buffer is at least 3 characters long */
12236    if (PyUnicode_Check(v)) {
12237        if (PyUnicode_GET_LENGTH(v) == 1) {
12238            buf[0] = PyUnicode_READ_CHAR(v, 0);
12239            buf[1] = '\0';
12240            return 1;
12241        }
12242        goto onError;
12243    }
12244    else {
12245        /* Integer input truncated to a character */
12246        long x;
12247        x = PyLong_AsLong(v);
12248        if (x == -1 && PyErr_Occurred())
12249            goto onError;
12250
12251        if (x < 0 || x > 0x10ffff) {
12252            PyErr_SetString(PyExc_OverflowError,
12253                            "%c arg not in range(0x110000)");
12254            return -1;
12255        }
12256
12257        buf[0] = (Py_UCS4) x;
12258        buf[1] = '\0';
12259        return 1;
12260    }
12261
12262  onError:
12263    PyErr_SetString(PyExc_TypeError,
12264                    "%c requires int or char");
12265    return -1;
12266}
12267
12268/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
12269   FORMATBUFLEN is the length of the buffer in which chars are formatted.
12270*/
12271#define FORMATBUFLEN (size_t)10
12272
12273PyObject *
12274PyUnicode_Format(PyObject *format, PyObject *args)
12275{
12276    void *fmt;
12277    int fmtkind;
12278    PyObject *result;
12279    Py_UCS4 *res, *res0;
12280    Py_UCS4 max;
12281    int kind;
12282    Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
12283    int args_owned = 0;
12284    PyObject *dict = NULL;
12285    PyUnicodeObject *uformat;
12286
12287    if (format == NULL || args == NULL) {
12288        PyErr_BadInternalCall();
12289        return NULL;
12290    }
12291    uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12292    if (uformat == NULL || PyUnicode_READY(uformat) == -1)
12293        return NULL;
12294    fmt = PyUnicode_DATA(uformat);
12295    fmtkind = PyUnicode_KIND(uformat);
12296    fmtcnt = PyUnicode_GET_LENGTH(uformat);
12297    fmtpos = 0;
12298
12299    reslen = rescnt = fmtcnt + 100;
12300    res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12301    if (res0 == NULL) {
12302        PyErr_NoMemory();
12303        goto onError;
12304    }
12305
12306    if (PyTuple_Check(args)) {
12307        arglen = PyTuple_Size(args);
12308        argidx = 0;
12309    }
12310    else {
12311        arglen = -1;
12312        argidx = -2;
12313    }
12314    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
12315        !PyUnicode_Check(args))
12316        dict = args;
12317
12318    while (--fmtcnt >= 0) {
12319        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12320            if (--rescnt < 0) {
12321                rescnt = fmtcnt + 100;
12322                reslen += rescnt;
12323                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12324                if (res0 == NULL){
12325                    PyErr_NoMemory();
12326                    goto onError;
12327                }
12328                res = res0 + reslen - rescnt;
12329                --rescnt;
12330            }
12331            *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12332        }
12333        else {
12334            /* Got a format specifier */
12335            int flags = 0;
12336            Py_ssize_t width = -1;
12337            int prec = -1;
12338            Py_UCS4 c = '\0';
12339            Py_UCS4 fill;
12340            int isnumok;
12341            PyObject *v = NULL;
12342            PyObject *temp = NULL;
12343            void *pbuf;
12344            Py_ssize_t pindex;
12345            Py_UNICODE sign;
12346            Py_ssize_t len, len1;
12347            Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
12348
12349            fmtpos++;
12350            if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12351                Py_ssize_t keystart;
12352                Py_ssize_t keylen;
12353                PyObject *key;
12354                int pcount = 1;
12355
12356                if (dict == NULL) {
12357                    PyErr_SetString(PyExc_TypeError,
12358                                    "format requires a mapping");
12359                    goto onError;
12360                }
12361                ++fmtpos;
12362                --fmtcnt;
12363                keystart = fmtpos;
12364                /* Skip over balanced parentheses */
12365                while (pcount > 0 && --fmtcnt >= 0) {
12366                    if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
12367                        --pcount;
12368                    else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
12369                        ++pcount;
12370                    fmtpos++;
12371                }
12372                keylen = fmtpos - keystart - 1;
12373                if (fmtcnt < 0 || pcount > 0) {
12374                    PyErr_SetString(PyExc_ValueError,
12375                                    "incomplete format key");
12376                    goto onError;
12377                }
12378                key = PyUnicode_Substring((PyObject*)uformat,
12379                                          keystart, keystart + keylen);
12380                if (key == NULL)
12381                    goto onError;
12382                if (args_owned) {
12383                    Py_DECREF(args);
12384                    args_owned = 0;
12385                }
12386                args = PyObject_GetItem(dict, key);
12387                Py_DECREF(key);
12388                if (args == NULL) {
12389                    goto onError;
12390                }
12391                args_owned = 1;
12392                arglen = -1;
12393                argidx = -2;
12394            }
12395            while (--fmtcnt >= 0) {
12396                switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
12397                case '-': flags |= F_LJUST; continue;
12398                case '+': flags |= F_SIGN; continue;
12399                case ' ': flags |= F_BLANK; continue;
12400                case '#': flags |= F_ALT; continue;
12401                case '0': flags |= F_ZERO; continue;
12402                }
12403                break;
12404            }
12405            if (c == '*') {
12406                v = getnextarg(args, arglen, &argidx);
12407                if (v == NULL)
12408                    goto onError;
12409                if (!PyLong_Check(v)) {
12410                    PyErr_SetString(PyExc_TypeError,
12411                                    "* wants int");
12412                    goto onError;
12413                }
12414                width = PyLong_AsLong(v);
12415                if (width == -1 && PyErr_Occurred())
12416                    goto onError;
12417                if (width < 0) {
12418                    flags |= F_LJUST;
12419                    width = -width;
12420                }
12421                if (--fmtcnt >= 0)
12422                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12423            }
12424            else if (c >= '0' && c <= '9') {
12425                width = c - '0';
12426                while (--fmtcnt >= 0) {
12427                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12428                    if (c < '0' || c > '9')
12429                        break;
12430                    if ((width*10) / 10 != width) {
12431                        PyErr_SetString(PyExc_ValueError,
12432                                        "width too big");
12433                        goto onError;
12434                    }
12435                    width = width*10 + (c - '0');
12436                }
12437            }
12438            if (c == '.') {
12439                prec = 0;
12440                if (--fmtcnt >= 0)
12441                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12442                if (c == '*') {
12443                    v = getnextarg(args, arglen, &argidx);
12444                    if (v == NULL)
12445                        goto onError;
12446                    if (!PyLong_Check(v)) {
12447                        PyErr_SetString(PyExc_TypeError,
12448                                        "* wants int");
12449                        goto onError;
12450                    }
12451                    prec = PyLong_AsLong(v);
12452                    if (prec == -1 && PyErr_Occurred())
12453                        goto onError;
12454                    if (prec < 0)
12455                        prec = 0;
12456                    if (--fmtcnt >= 0)
12457                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12458                }
12459                else if (c >= '0' && c <= '9') {
12460                    prec = c - '0';
12461                    while (--fmtcnt >= 0) {
12462                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12463                        if (c < '0' || c > '9')
12464                            break;
12465                        if ((prec*10) / 10 != prec) {
12466                            PyErr_SetString(PyExc_ValueError,
12467                                            "prec too big");
12468                            goto onError;
12469                        }
12470                        prec = prec*10 + (c - '0');
12471                    }
12472                }
12473            } /* prec */
12474            if (fmtcnt >= 0) {
12475                if (c == 'h' || c == 'l' || c == 'L') {
12476                    if (--fmtcnt >= 0)
12477                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12478                }
12479            }
12480            if (fmtcnt < 0) {
12481                PyErr_SetString(PyExc_ValueError,
12482                                "incomplete format");
12483                goto onError;
12484            }
12485            if (c != '%') {
12486                v = getnextarg(args, arglen, &argidx);
12487                if (v == NULL)
12488                    goto onError;
12489            }
12490            sign = 0;
12491            fill = ' ';
12492            switch (c) {
12493
12494            case '%':
12495                pbuf = formatbuf;
12496                kind = PyUnicode_4BYTE_KIND;
12497                /* presume that buffer length is at least 1 */
12498                PyUnicode_WRITE(kind, pbuf, 0, '%');
12499                len = 1;
12500                break;
12501
12502            case 's':
12503            case 'r':
12504            case 'a':
12505                if (PyUnicode_CheckExact(v) && c == 's') {
12506                    temp = v;
12507                    Py_INCREF(temp);
12508                }
12509                else {
12510                    if (c == 's')
12511                        temp = PyObject_Str(v);
12512                    else if (c == 'r')
12513                        temp = PyObject_Repr(v);
12514                    else
12515                        temp = PyObject_ASCII(v);
12516                    if (temp == NULL)
12517                        goto onError;
12518                    if (PyUnicode_Check(temp))
12519                        /* nothing to do */;
12520                    else {
12521                        Py_DECREF(temp);
12522                        PyErr_SetString(PyExc_TypeError,
12523                                        "%s argument has non-string str()");
12524                        goto onError;
12525                    }
12526                }
12527                if (PyUnicode_READY(temp) == -1) {
12528                    Py_CLEAR(temp);
12529                    goto onError;
12530                }
12531                pbuf = PyUnicode_DATA(temp);
12532                kind = PyUnicode_KIND(temp);
12533                len = PyUnicode_GET_LENGTH(temp);
12534                if (prec >= 0 && len > prec)
12535                    len = prec;
12536                break;
12537
12538            case 'i':
12539            case 'd':
12540            case 'u':
12541            case 'o':
12542            case 'x':
12543            case 'X':
12544                isnumok = 0;
12545                if (PyNumber_Check(v)) {
12546                    PyObject *iobj=NULL;
12547
12548                    if (PyLong_Check(v)) {
12549                        iobj = v;
12550                        Py_INCREF(iobj);
12551                    }
12552                    else {
12553                        iobj = PyNumber_Long(v);
12554                    }
12555                    if (iobj!=NULL) {
12556                        if (PyLong_Check(iobj)) {
12557                            isnumok = 1;
12558                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
12559                            Py_DECREF(iobj);
12560                            if (!temp)
12561                                goto onError;
12562                            if (PyUnicode_READY(temp) == -1) {
12563                                Py_CLEAR(temp);
12564                                goto onError;
12565                            }
12566                            pbuf = PyUnicode_DATA(temp);
12567                            kind = PyUnicode_KIND(temp);
12568                            len = PyUnicode_GET_LENGTH(temp);
12569                            sign = 1;
12570                        }
12571                        else {
12572                            Py_DECREF(iobj);
12573                        }
12574                    }
12575                }
12576                if (!isnumok) {
12577                    PyErr_Format(PyExc_TypeError,
12578                                 "%%%c format: a number is required, "
12579                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12580                    goto onError;
12581                }
12582                if (flags & F_ZERO)
12583                    fill = '0';
12584                break;
12585
12586            case 'e':
12587            case 'E':
12588            case 'f':
12589            case 'F':
12590            case 'g':
12591            case 'G':
12592                temp = formatfloat(v, flags, prec, c);
12593                if (!temp)
12594                    goto onError;
12595                if (PyUnicode_READY(temp) == -1) {
12596                    Py_CLEAR(temp);
12597                    goto onError;
12598                }
12599                pbuf = PyUnicode_DATA(temp);
12600                kind = PyUnicode_KIND(temp);
12601                len = PyUnicode_GET_LENGTH(temp);
12602                sign = 1;
12603                if (flags & F_ZERO)
12604                    fill = '0';
12605                break;
12606
12607            case 'c':
12608                pbuf = formatbuf;
12609                kind = PyUnicode_4BYTE_KIND;
12610                len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
12611                if (len < 0)
12612                    goto onError;
12613                break;
12614
12615            default:
12616                PyErr_Format(PyExc_ValueError,
12617                             "unsupported format character '%c' (0x%x) "
12618                             "at index %zd",
12619                             (31<=c && c<=126) ? (char)c : '?',
12620                             (int)c,
12621                             fmtpos - 1);
12622                goto onError;
12623            }
12624            /* pbuf is initialized here. */
12625            pindex = 0;
12626            if (sign) {
12627                if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12628                    PyUnicode_READ(kind, pbuf, pindex) == '+') {
12629                    sign = PyUnicode_READ(kind, pbuf, pindex++);
12630                    len--;
12631                }
12632                else if (flags & F_SIGN)
12633                    sign = '+';
12634                else if (flags & F_BLANK)
12635                    sign = ' ';
12636                else
12637                    sign = 0;
12638            }
12639            if (width < len)
12640                width = len;
12641            if (rescnt - (sign != 0) < width) {
12642                reslen -= rescnt;
12643                rescnt = width + fmtcnt + 100;
12644                reslen += rescnt;
12645                if (reslen < 0) {
12646                    Py_XDECREF(temp);
12647                    PyErr_NoMemory();
12648                    goto onError;
12649                }
12650                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12651                if (res0 == 0) {
12652                    PyErr_NoMemory();
12653                    Py_XDECREF(temp);
12654                    goto onError;
12655                }
12656                res = res0 + reslen - rescnt;
12657            }
12658            if (sign) {
12659                if (fill != ' ')
12660                    *res++ = sign;
12661                rescnt--;
12662                if (width > len)
12663                    width--;
12664            }
12665            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
12666                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12667                assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12668                if (fill != ' ') {
12669                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12670                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12671                }
12672                rescnt -= 2;
12673                width -= 2;
12674                if (width < 0)
12675                    width = 0;
12676                len -= 2;
12677            }
12678            if (width > len && !(flags & F_LJUST)) {
12679                do {
12680                    --rescnt;
12681                    *res++ = fill;
12682                } while (--width > len);
12683            }
12684            if (fill == ' ') {
12685                if (sign)
12686                    *res++ = sign;
12687                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
12688                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12689                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12690                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12691                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12692                }
12693            }
12694            /* Copy all characters, preserving len */
12695            len1 = len;
12696            while (len1--) {
12697                *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12698                rescnt--;
12699            }
12700            while (--width >= len) {
12701                --rescnt;
12702                *res++ = ' ';
12703            }
12704            if (dict && (argidx < arglen) && c != '%') {
12705                PyErr_SetString(PyExc_TypeError,
12706                                "not all arguments converted during string formatting");
12707                Py_XDECREF(temp);
12708                goto onError;
12709            }
12710            Py_XDECREF(temp);
12711        } /* '%' */
12712    } /* until end */
12713    if (argidx < arglen && !dict) {
12714        PyErr_SetString(PyExc_TypeError,
12715                        "not all arguments converted during string formatting");
12716        goto onError;
12717    }
12718
12719
12720    for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12721        if (*res > max)
12722            max = *res;
12723    result = PyUnicode_New(reslen - rescnt, max);
12724    if (!result)
12725        goto onError;
12726    kind = PyUnicode_KIND(result);
12727    for (res = res0; res < res0+reslen-rescnt; res++)
12728        PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12729    PyMem_Free(res0);
12730    if (args_owned) {
12731        Py_DECREF(args);
12732    }
12733    Py_DECREF(uformat);
12734    return (PyObject *)result;
12735
12736  onError:
12737    PyMem_Free(res0);
12738    Py_DECREF(uformat);
12739    if (args_owned) {
12740        Py_DECREF(args);
12741    }
12742    return NULL;
12743}
12744
12745static PyObject *
12746unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12747
12748static PyObject *
12749unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12750{
12751    PyObject *x = NULL;
12752    static char *kwlist[] = {"object", "encoding", "errors", 0};
12753    char *encoding = NULL;
12754    char *errors = NULL;
12755
12756    if (type != &PyUnicode_Type)
12757        return unicode_subtype_new(type, args, kwds);
12758    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
12759                                     kwlist, &x, &encoding, &errors))
12760        return NULL;
12761    if (x == NULL)
12762        return (PyObject *)PyUnicode_New(0, 0);
12763    if (encoding == NULL && errors == NULL)
12764        return PyObject_Str(x);
12765    else
12766        return PyUnicode_FromEncodedObject(x, encoding, errors);
12767}
12768
12769static PyObject *
12770unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12771{
12772    PyUnicodeObject *unicode, *self;
12773    Py_ssize_t length, char_size;
12774    int share_wstr, share_utf8;
12775    unsigned int kind;
12776    void *data;
12777
12778    assert(PyType_IsSubtype(type, &PyUnicode_Type));
12779
12780    unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12781    if (unicode == NULL)
12782        return NULL;
12783    assert(_PyUnicode_CHECK(unicode));
12784    if (PyUnicode_READY(unicode))
12785        return NULL;
12786
12787    self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12788    if (self == NULL) {
12789        Py_DECREF(unicode);
12790        return NULL;
12791    }
12792    kind = PyUnicode_KIND(unicode);
12793    length = PyUnicode_GET_LENGTH(unicode);
12794
12795    _PyUnicode_LENGTH(self) = length;
12796    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12797    _PyUnicode_STATE(self).interned = 0;
12798    _PyUnicode_STATE(self).kind = kind;
12799    _PyUnicode_STATE(self).compact = 0;
12800    _PyUnicode_STATE(self).ascii = 0;
12801    _PyUnicode_STATE(self).ready = 1;
12802    _PyUnicode_WSTR(self) = NULL;
12803    _PyUnicode_UTF8_LENGTH(self) = 0;
12804    _PyUnicode_UTF8(self) = NULL;
12805    _PyUnicode_WSTR_LENGTH(self) = 0;
12806    _PyUnicode_DATA_ANY(self) = NULL;
12807
12808    share_utf8 = 0;
12809    share_wstr = 0;
12810    if (kind == PyUnicode_1BYTE_KIND) {
12811        char_size = 1;
12812        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12813            share_utf8 = 1;
12814    }
12815    else if (kind == PyUnicode_2BYTE_KIND) {
12816        char_size = 2;
12817        if (sizeof(wchar_t) == 2)
12818            share_wstr = 1;
12819    }
12820    else {
12821        assert(kind == PyUnicode_4BYTE_KIND);
12822        char_size = 4;
12823        if (sizeof(wchar_t) == 4)
12824            share_wstr = 1;
12825    }
12826
12827    /* Ensure we won't overflow the length. */
12828    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12829        PyErr_NoMemory();
12830        goto onError;
12831    }
12832    data = PyObject_MALLOC((length + 1) * char_size);
12833    if (data == NULL) {
12834        PyErr_NoMemory();
12835        goto onError;
12836    }
12837
12838    _PyUnicode_DATA_ANY(self) = data;
12839    if (share_utf8) {
12840        _PyUnicode_UTF8_LENGTH(self) = length;
12841        _PyUnicode_UTF8(self) = data;
12842    }
12843    if (share_wstr) {
12844        _PyUnicode_WSTR_LENGTH(self) = length;
12845        _PyUnicode_WSTR(self) = (wchar_t *)data;
12846    }
12847
12848    Py_MEMCPY(data, PyUnicode_DATA(unicode),
12849              PyUnicode_KIND_SIZE(kind, length + 1));
12850    Py_DECREF(unicode);
12851    return (PyObject *)self;
12852
12853onError:
12854    Py_DECREF(unicode);
12855    Py_DECREF(self);
12856    return NULL;
12857}
12858
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
12865
12866static PyObject *unicode_iter(PyObject *seq);
12867
/* Type object for str.  The type can be subclassed (Py_TPFLAGS_BASETYPE);
   instances are created through unicode_new() and released with
   PyObject_Del. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
12911
12912/* Initialize the Unicode implementation */
12913
12914void _PyUnicode_Init(void)
12915{
12916    int i;
12917
12918    /* XXX - move this array to unicodectype.c ? */
12919    Py_UCS2 linebreak[] = {
12920        0x000A, /* LINE FEED */
12921        0x000D, /* CARRIAGE RETURN */
12922        0x001C, /* FILE SEPARATOR */
12923        0x001D, /* GROUP SEPARATOR */
12924        0x001E, /* RECORD SEPARATOR */
12925        0x0085, /* NEXT LINE */
12926        0x2028, /* LINE SEPARATOR */
12927        0x2029, /* PARAGRAPH SEPARATOR */
12928    };
12929
12930    /* Init the implementation */
12931    unicode_empty = PyUnicode_New(0, 0);
12932    if (!unicode_empty)
12933        Py_FatalError("Can't create empty string");
12934
12935    for (i = 0; i < 256; i++)
12936        unicode_latin1[i] = NULL;
12937    if (PyType_Ready(&PyUnicode_Type) < 0)
12938        Py_FatalError("Can't initialize 'unicode'");
12939
12940    /* initialize the linebreak bloom filter */
12941    bloom_linebreak = make_bloom_mask(
12942        PyUnicode_2BYTE_KIND, linebreak,
12943        Py_ARRAY_LENGTH(linebreak));
12944
12945    PyType_Ready(&EncodingMapType);
12946}
12947
12948/* Finalize the Unicode implementation */
12949
/* Nothing is released by this implementation; the reported count is
   always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
12955
12956void
12957_PyUnicode_Fini(void)
12958{
12959    int i;
12960
12961    Py_XDECREF(unicode_empty);
12962    unicode_empty = NULL;
12963
12964    for (i = 0; i < 256; i++) {
12965        if (unicode_latin1[i]) {
12966            Py_DECREF(unicode_latin1[i]);
12967            unicode_latin1[i] = NULL;
12968        }
12969    }
12970    (void)PyUnicode_ClearFreeList();
12971}
12972
12973void
12974PyUnicode_InternInPlace(PyObject **p)
12975{
12976    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12977    PyObject *t;
12978#ifdef Py_DEBUG
12979    assert(s != NULL);
12980    assert(_PyUnicode_CHECK(s));
12981#else
12982    if (s == NULL || !PyUnicode_Check(s))
12983        return;
12984#endif
12985    /* If it's a subclass, we don't really know what putting
12986       it in the interned dict might do. */
12987    if (!PyUnicode_CheckExact(s))
12988        return;
12989    if (PyUnicode_CHECK_INTERNED(s))
12990        return;
12991    if (PyUnicode_READY(s) == -1) {
12992        assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
12993        return;
12994    }
12995    if (interned == NULL) {
12996        interned = PyDict_New();
12997        if (interned == NULL) {
12998            PyErr_Clear(); /* Don't leave an exception */
12999            return;
13000        }
13001    }
13002    /* It might be that the GetItem call fails even
13003       though the key is present in the dictionary,
13004       namely when this happens during a stack overflow. */
13005    Py_ALLOW_RECURSION
13006        t = PyDict_GetItem(interned, (PyObject *)s);
13007    Py_END_ALLOW_RECURSION
13008
13009        if (t) {
13010            Py_INCREF(t);
13011            Py_DECREF(*p);
13012            *p = t;
13013            return;
13014        }
13015
13016    PyThreadState_GET()->recursion_critical = 1;
13017    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13018        PyErr_Clear();
13019        PyThreadState_GET()->recursion_critical = 0;
13020        return;
13021    }
13022    PyThreadState_GET()->recursion_critical = 0;
13023    /* The two references in interned are not counted by refcnt.
13024       The deallocator will take care of this */
13025    Py_REFCNT(s) -= 2;
13026    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
13027}
13028
13029void
13030PyUnicode_InternImmortal(PyObject **p)
13031{
13032    PyUnicodeObject *u = (PyUnicodeObject *)*p;
13033
13034    PyUnicode_InternInPlace(p);
13035    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
13036        _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
13037        Py_INCREF(*p);
13038    }
13039}
13040
13041PyObject *
13042PyUnicode_InternFromString(const char *cp)
13043{
13044    PyObject *s = PyUnicode_FromString(cp);
13045    if (s == NULL)
13046        return NULL;
13047    PyUnicode_InternInPlace(&s);
13048    return s;
13049}
13050
13051void
13052_Py_ReleaseInternedUnicodeStrings(void)
13053{
13054    PyObject *keys;
13055    PyUnicodeObject *s;
13056    Py_ssize_t i, n;
13057    Py_ssize_t immortal_size = 0, mortal_size = 0;
13058
13059    if (interned == NULL || !PyDict_Check(interned))
13060        return;
13061    keys = PyDict_Keys(interned);
13062    if (keys == NULL || !PyList_Check(keys)) {
13063        PyErr_Clear();
13064        return;
13065    }
13066
13067    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13068       detector, interned unicode strings are not forcibly deallocated;
13069       rather, we give them their stolen references back, and then clear
13070       and DECREF the interned dict. */
13071
13072    n = PyList_GET_SIZE(keys);
13073    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
13074            n);
13075    for (i = 0; i < n; i++) {
13076        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
13077        if (PyUnicode_READY(s) == -1)
13078            fprintf(stderr, "could not ready string\n");
13079        switch (PyUnicode_CHECK_INTERNED(s)) {
13080        case SSTATE_NOT_INTERNED:
13081            /* XXX Shouldn't happen */
13082            break;
13083        case SSTATE_INTERNED_IMMORTAL:
13084            Py_REFCNT(s) += 1;
13085            immortal_size += PyUnicode_GET_LENGTH(s);
13086            break;
13087        case SSTATE_INTERNED_MORTAL:
13088            Py_REFCNT(s) += 2;
13089            mortal_size += PyUnicode_GET_LENGTH(s);
13090            break;
13091        default:
13092            Py_FatalError("Inconsistent interned string state.");
13093        }
13094        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
13095    }
13096    fprintf(stderr, "total size of all interned strings: "
13097            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13098            "mortal/immortal\n", mortal_size, immortal_size);
13099    Py_DECREF(keys);
13100    PyDict_Clear(interned);
13101    Py_DECREF(interned);
13102    interned = NULL;
13103}
13104
13105
13106/********************* Unicode Iterator **************************/
13107
/* State of a str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13113
13114static void
13115unicodeiter_dealloc(unicodeiterobject *it)
13116{
13117    _PyObject_GC_UNTRACK(it);
13118    Py_XDECREF(it->it_seq);
13119    PyObject_GC_Del(it);
13120}
13121
13122static int
13123unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13124{
13125    Py_VISIT(it->it_seq);
13126    return 0;
13127}
13128
13129static PyObject *
13130unicodeiter_next(unicodeiterobject *it)
13131{
13132    PyUnicodeObject *seq;
13133    PyObject *item;
13134
13135    assert(it != NULL);
13136    seq = it->it_seq;
13137    if (seq == NULL)
13138        return NULL;
13139    assert(_PyUnicode_CHECK(seq));
13140
13141    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13142        int kind = PyUnicode_KIND(seq);
13143        void *data = PyUnicode_DATA(seq);
13144        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13145        item = PyUnicode_FromOrdinal(chr);
13146        if (item != NULL)
13147            ++it->it_index;
13148        return item;
13149    }
13150
13151    Py_DECREF(seq);
13152    it->it_seq = NULL;
13153    return NULL;
13154}
13155
13156static PyObject *
13157unicodeiter_len(unicodeiterobject *it)
13158{
13159    Py_ssize_t len = 0;
13160    if (it->it_seq)
13161        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13162    return PyLong_FromSsize_t(len);
13163}
13164
/* Docstring of the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13172
/* Type object of the iterator returned by unicode_iter().  Instances take
   part in garbage collection (Py_TPFLAGS_HAVE_GC). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
13205
13206static PyObject *
13207unicode_iter(PyObject *seq)
13208{
13209    unicodeiterobject *it;
13210
13211    if (!PyUnicode_Check(seq)) {
13212        PyErr_BadInternalCall();
13213        return NULL;
13214    }
13215    if (PyUnicode_READY(seq) == -1)
13216        return NULL;
13217    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13218    if (it == NULL)
13219        return NULL;
13220    it->it_index = 0;
13221    Py_INCREF(seq);
13222    it->it_seq = (PyUnicodeObject *)seq;
13223    _PyObject_GC_TRACK(it);
13224    return (PyObject *)it;
13225}
13226
/* uniops.h is included twice, each time with a different expansion of
   UNIOP() and UNIOP_t: first with the Py_UNICODE_ name prefix and the
   Py_UNICODE character type, then with the Py_UCS4_ prefix and the Py_UCS4
   type.
   NOTE(review): presumably the header defines one set of helper functions
   per inclusion, named through UNIOP(x) -- confirm against uniops.h. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
13237
13238Py_UNICODE*
13239PyUnicode_AsUnicodeCopy(PyObject *object)
13240{
13241    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13242    Py_UNICODE *copy;
13243    Py_ssize_t size;
13244
13245    if (!PyUnicode_Check(unicode)) {
13246        PyErr_BadArgument();
13247        return NULL;
13248    }
13249    /* Ensure we won't overflow the size. */
13250    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13251        PyErr_NoMemory();
13252        return NULL;
13253    }
13254    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13255    size *= sizeof(Py_UNICODE);
13256    copy = PyMem_Malloc(size);
13257    if (copy == NULL) {
13258        PyErr_NoMemory();
13259        return NULL;
13260    }
13261    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13262    return copy;
13263}
13264
13265/* A _string module, to export formatter_parser and formatter_field_name_split
13266   to the string.Formatter class implemented in Python. */
13267
/* Functions exported by the _string module.  Both take exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13275
/* Module definition of _string. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_reload */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
13287
13288PyMODINIT_FUNC
13289PyInit__string(void)
13290{
13291    return PyModule_Create(&_string_module);
13292}
13293
13294
13295#ifdef __cplusplus
13296}
13297#endif
13298