unicodeobject.c revision fb9ea8c57eeab6837c830613524c1250488baed1
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44
45#ifdef MS_WINDOWS
46#include <windows.h>
47#endif
48
49#ifdef Py_DEBUG
50#  define DONT_MAKE_RESULT_READY
51#endif
52
53/* Limit for the Unicode object free list */
54
55#define PyUnicode_MAXFREELIST       1024
56
57/* Limit for the Unicode object free list stay alive optimization.
58
59   The implementation will keep allocated Unicode memory intact for
60   all objects on the free list having a size less than this
61   limit. This reduces malloc() overhead for small Unicode objects.
62
63   At worst this will result in PyUnicode_MAXFREELIST *
64   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
65   malloc()-overhead) bytes of unused garbage.
66
67   Setting the limit to 0 effectively turns the feature off.
68
69   Note: This is an experimental feature ! If you get core dumps when
70   using Unicode objects, turn this feature off.
71
72*/
73
74#define KEEPALIVE_SIZE_LIMIT       9
75
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
84/* --- Globals ------------------------------------------------------------
85
86   The globals are initialized by the _PyUnicode_Init() API and should
87   not be used before calling that API.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Type check performed by the accessor macros below: debug builds call
   _PyUnicode_CheckConsistency(), other builds only call PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field of the compact header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked access: a compact ASCII object yields the address right after
   its PyASCIIObject header, any other object yields its utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field of the compact header. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked access: a compact ASCII object yields its length field, any
   other object yields its utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; all of them can be used as lvalues. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Accessors that first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
136
/* Local override of PyUnicode_READY: evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY, but takes a pointer to the object pointer and
   calls _PyUnicode_ReadyReplace() on it when the object is not ready.
   The argument is parenthesized before being dereferenced so that
   expressions such as "p + 1" are dereferenced as a whole. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* true if the utf8 pointer of the object is its data pointer
   (must not be used on compact ASCII objects) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer of the object is its data pointer.
   The macro only refers to its own argument; it does not depend on a
   variable named "unicode" being in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* True when the object is not compact ASCII, its utf8 pointer is set,
   and that pointer differs from the data pointer (i.e. the UTF-8 block
   is not shared with other data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True when the wstr pointer is set and either the object is not ready
   or the pointer differs from the data pointer (i.e. the wstr block is
   not shared with other data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Element-wise copy with a cast between two character types.

   from_type / to_type: valid type names of the source and destination
                        elements.
   begin, end:          "from_type *" pointers delimiting the source
                        range [begin, end); "end" is re-evaluated on
                        every iteration.
   to:                  pointer to the destination buffer; it is cast to
                        "to_type *" and receives one element per source
                        element. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        const from_type *iter_; \
        to_type *to_; \
        iter_ = (begin); \
        to_ = (to_type *)(to); \
        while (iter_ < (end)) { \
            *to_ = (to_type)*iter_; \
            ++iter_; \
            ++to_; \
        } \
    } while (0)
187
/* The Unicode string has been modified: store -1 in its hash field. */
#define _PyUnicode_DIRTY(op) \
    do { _PyUnicode_HASH(op) = -1; } while (0)
190
191/* This dictionary holds all interned unicode strings.  Note that references
192   to strings in this dictionary are *not* counted in the string's ob_refcnt.
193   When the interned string reaches a refcnt of 0 the string deallocation
194   function will delete the reference from this dictionary.
195
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
198*/
199static PyObject *interned;
200
201/* The empty Unicode object is shared to improve performance. */
202static PyObject *unicode_empty;
203
204/* Single character Unicode strings in the Latin-1 range are being
205   shared as well. */
206static PyObject *unicode_latin1[256];
207
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
238
239/* forward */
240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
241static PyObject* get_latin1_char(unsigned char ch);
242static void copy_characters(
243    PyObject *to, Py_ssize_t to_start,
244    PyObject *from, Py_ssize_t from_start,
245    Py_ssize_t how_many);
246static int unicode_is_singleton(PyObject *unicode);
247
248static PyObject *
249unicode_encode_call_errorhandler(const char *errors,
250       PyObject **errorHandler,const char *encoding, const char *reason,
251       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
252       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
253
254static void
255raise_encode_exception(PyObject **exceptionObject,
256                       const char *encoding,
257                       const Py_UNICODE *unicode, Py_ssize_t size,
258                       Py_ssize_t startpos, Py_ssize_t endpos,
259                       const char *reason);
260
/* Fast detection of the ASCII line break characters.
   Indexed by code point 0..127; an entry is 1 for a line break.
   The table is only ever read, so it is declared const like
   _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
288
289/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
290   This function is kept for backward compatibility with the old API. */
291Py_UNICODE
292PyUnicode_GetMax(void)
293{
294#ifdef Py_UNICODE_WIDE
295    return 0x10FFFF;
296#else
297    /* This is actually an illegal character, so it should
298       not be passed to unichr. */
299    return 0xFFFF;
300#endif
301}
302
303#ifdef Py_DEBUG
304int
305/* FIXME: use PyObject* type for op */
306_PyUnicode_CheckConsistency(void *op, int check_content)
307{
308    PyASCIIObject *ascii;
309    unsigned int kind;
310
311    assert(PyUnicode_Check(op));
312
313    ascii = (PyASCIIObject *)op;
314    kind = ascii->state.kind;
315
316    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
317        assert(kind == PyUnicode_1BYTE_KIND);
318        assert(ascii->state.ready == 1);
319    }
320    else {
321        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
322        void *data;
323
324        if (ascii->state.compact == 1) {
325            data = compact + 1;
326            assert(kind == PyUnicode_1BYTE_KIND
327                   || kind == PyUnicode_2BYTE_KIND
328                   || kind == PyUnicode_4BYTE_KIND);
329            assert(ascii->state.ascii == 0);
330            assert(ascii->state.ready == 1);
331            assert (compact->utf8 != data);
332        } else {
333            PyUnicodeObject *unicode = (PyUnicodeObject *)op;
334
335            data = unicode->data.any;
336            if (kind == PyUnicode_WCHAR_KIND) {
337                assert(ascii->state.compact == 0);
338                assert(ascii->state.ascii == 0);
339                assert(ascii->state.ready == 0);
340                assert(ascii->wstr != NULL);
341                assert(data == NULL);
342                assert(compact->utf8 == NULL);
343                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
344            }
345            else {
346                assert(kind == PyUnicode_1BYTE_KIND
347                       || kind == PyUnicode_2BYTE_KIND
348                       || kind == PyUnicode_4BYTE_KIND);
349                assert(ascii->state.compact == 0);
350                assert(ascii->state.ready == 1);
351                assert(data != NULL);
352                if (ascii->state.ascii) {
353                    assert (compact->utf8 == data);
354                    assert (compact->utf8_length == ascii->length);
355                }
356                else
357                    assert (compact->utf8 != data);
358            }
359        }
360        if (kind != PyUnicode_WCHAR_KIND) {
361            if (
362#if SIZEOF_WCHAR_T == 2
363                kind == PyUnicode_2BYTE_KIND
364#else
365                kind == PyUnicode_4BYTE_KIND
366#endif
367               )
368            {
369                assert(ascii->wstr == data);
370                assert(compact->wstr_length == ascii->length);
371            } else
372                assert(ascii->wstr != data);
373        }
374
375        if (compact->utf8 == NULL)
376            assert(compact->utf8_length == 0);
377        if (ascii->wstr == NULL)
378            assert(compact->wstr_length == 0);
379    }
380    /* check that the best kind is used */
381    if (check_content && kind != PyUnicode_WCHAR_KIND)
382    {
383        Py_ssize_t i;
384        Py_UCS4 maxchar = 0;
385        void *data = PyUnicode_DATA(ascii);
386        for (i=0; i < ascii->length; i++)
387        {
388            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
389            if (ch > maxchar)
390                maxchar = ch;
391        }
392        if (kind == PyUnicode_1BYTE_KIND) {
393            if (ascii->state.ascii == 0)
394                assert(maxchar >= 128);
395            else
396                assert(maxchar < 128);
397        }
398        else if (kind == PyUnicode_2BYTE_KIND)
399            assert(maxchar >= 0x100);
400        else
401            assert(maxchar >= 0x10000);
402    }
403    if (check_content && !unicode_is_singleton((PyObject*)ascii))
404        assert(ascii->hash == -1);
405    return 1;
406}
407#endif
408
409/* --- Bloom Filters ----------------------------------------------------- */
410
411/* stuff to implement simple "bloom filters" for Unicode characters.
412   to keep things simple, we use a single bitmask, using the least 5
413   bits from each unicode characters as the bit index. */
414
415/* the linebreak mask is set up by Unicode_Init below */
416
417#if LONG_BIT >= 128
418#define BLOOM_WIDTH 128
419#elif LONG_BIT >= 64
420#define BLOOM_WIDTH 64
421#elif LONG_BIT >= 32
422#define BLOOM_WIDTH 32
423#else
424#error "LONG_BIT is smaller than 32"
425#endif
426
427#define BLOOM_MASK unsigned long
428
429static BLOOM_MASK bloom_linebreak;
430
431#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
432#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
433
434#define BLOOM_LINEBREAK(ch)                                             \
435    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
436     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
437
438Py_LOCAL_INLINE(BLOOM_MASK)
439make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
440{
441    /* calculate simple bloom-style bitmask for a given unicode string */
442
443    BLOOM_MASK mask;
444    Py_ssize_t i;
445
446    mask = 0;
447    for (i = 0; i < len; i++)
448        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
449
450    return mask;
451}
452
/* True if chr occurs in str: the bloom mask is tested first and
   PyUnicode_FindChar() is only called when the mask bit is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
456
457/* --- Unicode Object ----------------------------------------------------- */
458
459static PyObject *
460fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
461
462Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
463                                 Py_ssize_t size, Py_UCS4 ch,
464                                 int direction)
465{
466    /* like wcschr, but doesn't stop at NULL characters */
467    Py_ssize_t i;
468    if (direction == 1) {
469        for(i = 0; i < size; i++)
470            if (PyUnicode_READ(kind, s, i) == ch)
471                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
472    }
473    else {
474        for(i = size-1; i >= 0; i--)
475            if (PyUnicode_READ(kind, s, i) == ch)
476                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
477    }
478    return NULL;
479}
480
481static PyObject*
482resize_compact(PyObject *unicode, Py_ssize_t length)
483{
484    Py_ssize_t char_size;
485    Py_ssize_t struct_size;
486    Py_ssize_t new_size;
487    int share_wstr;
488
489    assert(PyUnicode_IS_READY(unicode));
490    char_size = PyUnicode_CHARACTER_SIZE(unicode);
491    if (PyUnicode_IS_COMPACT_ASCII(unicode))
492        struct_size = sizeof(PyASCIIObject);
493    else
494        struct_size = sizeof(PyCompactUnicodeObject);
495    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
496
497    _Py_DEC_REFTOTAL;
498    _Py_ForgetReference(unicode);
499
500    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
501        PyErr_NoMemory();
502        return NULL;
503    }
504    new_size = (struct_size + (length + 1) * char_size);
505
506    unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
507    if (unicode == NULL) {
508        PyObject_Del(unicode);
509        PyErr_NoMemory();
510        return NULL;
511    }
512    _Py_NewReference(unicode);
513    _PyUnicode_LENGTH(unicode) = length;
514    if (share_wstr) {
515        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
516        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
517            _PyUnicode_WSTR_LENGTH(unicode) = length;
518    }
519    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
520                    length, 0);
521    return unicode;
522}
523
524static int
525resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
526{
527    wchar_t *wstr;
528    assert(!PyUnicode_IS_COMPACT(unicode));
529    assert(Py_REFCNT(unicode) == 1);
530
531    _PyUnicode_DIRTY(unicode);
532
533    if (PyUnicode_IS_READY(unicode)) {
534        Py_ssize_t char_size;
535        Py_ssize_t new_size;
536        int share_wstr, share_utf8;
537        void *data;
538
539        data = _PyUnicode_DATA_ANY(unicode);
540        assert(data != NULL);
541        char_size = PyUnicode_CHARACTER_SIZE(unicode);
542        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
543        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
544        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
545        {
546            PyObject_DEL(_PyUnicode_UTF8(unicode));
547            _PyUnicode_UTF8(unicode) = NULL;
548            _PyUnicode_UTF8_LENGTH(unicode) = 0;
549        }
550
551        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
552            PyErr_NoMemory();
553            return -1;
554        }
555        new_size = (length + 1) * char_size;
556
557        data = (PyObject *)PyObject_REALLOC(data, new_size);
558        if (data == NULL) {
559            PyErr_NoMemory();
560            return -1;
561        }
562        _PyUnicode_DATA_ANY(unicode) = data;
563        if (share_wstr) {
564            _PyUnicode_WSTR(unicode) = data;
565            _PyUnicode_WSTR_LENGTH(unicode) = length;
566        }
567        if (share_utf8) {
568            _PyUnicode_UTF8(unicode) = data;
569            _PyUnicode_UTF8_LENGTH(unicode) = length;
570        }
571        _PyUnicode_LENGTH(unicode) = length;
572        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
573        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
574            assert(_PyUnicode_CheckConsistency(unicode, 0));
575            return 0;
576        }
577    }
578    assert(_PyUnicode_WSTR(unicode) != NULL);
579
580    /* check for integer overflow */
581    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
582        PyErr_NoMemory();
583        return -1;
584    }
585    wstr =  _PyUnicode_WSTR(unicode);
586    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
587    if (!wstr) {
588        PyErr_NoMemory();
589        return -1;
590    }
591    _PyUnicode_WSTR(unicode) = wstr;
592    _PyUnicode_WSTR(unicode)[length] = 0;
593    _PyUnicode_WSTR_LENGTH(unicode) = length;
594    assert(_PyUnicode_CheckConsistency(unicode, 0));
595    return 0;
596}
597
598static PyObject*
599resize_copy(PyObject *unicode, Py_ssize_t length)
600{
601    Py_ssize_t copy_length;
602    if (PyUnicode_IS_COMPACT(unicode)) {
603        PyObject *copy;
604        assert(PyUnicode_IS_READY(unicode));
605
606        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
607        if (copy == NULL)
608            return NULL;
609
610        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
611        copy_characters(copy, 0, unicode, 0, copy_length);
612        return copy;
613    }
614    else {
615        PyUnicodeObject *w;
616        assert(_PyUnicode_WSTR(unicode) != NULL);
617        assert(_PyUnicode_DATA_ANY(unicode) == NULL);
618        w = _PyUnicode_New(length);
619        if (w == NULL)
620            return NULL;
621        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
622        copy_length = Py_MIN(copy_length, length);
623        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
624                        copy_length);
625        return (PyObject*)w;
626    }
627}
628
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
632
633   XXX This allocator could further be enhanced by assuring that the
634   free list never reduces its size below 1.
635
636*/
637
638#ifdef Py_DEBUG
639int unicode_old_new_calls = 0;
640#endif
641
642static PyUnicodeObject *
643_PyUnicode_New(Py_ssize_t length)
644{
645    register PyUnicodeObject *unicode;
646    size_t new_size;
647
648    /* Optimization for empty strings */
649    if (length == 0 && unicode_empty != NULL) {
650        Py_INCREF(unicode_empty);
651        return (PyUnicodeObject*)unicode_empty;
652    }
653
654    /* Ensure we won't overflow the size. */
655    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
656        return (PyUnicodeObject *)PyErr_NoMemory();
657    }
658    if (length < 0) {
659        PyErr_SetString(PyExc_SystemError,
660                        "Negative size passed to _PyUnicode_New");
661        return NULL;
662    }
663
664#ifdef Py_DEBUG
665    ++unicode_old_new_calls;
666#endif
667
668    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
669    if (unicode == NULL)
670        return NULL;
671    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
672    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
673    if (!_PyUnicode_WSTR(unicode)) {
674        PyErr_NoMemory();
675        goto onError;
676    }
677
678    /* Initialize the first element to guard against cases where
679     * the caller fails before initializing str -- unicode_resize()
680     * reads str[0], and the Keep-Alive optimization can keep memory
681     * allocated for str alive across a call to unicode_dealloc(unicode).
682     * We don't want unicode_resize to read uninitialized memory in
683     * that case.
684     */
685    _PyUnicode_WSTR(unicode)[0] = 0;
686    _PyUnicode_WSTR(unicode)[length] = 0;
687    _PyUnicode_WSTR_LENGTH(unicode) = length;
688    _PyUnicode_HASH(unicode) = -1;
689    _PyUnicode_STATE(unicode).interned = 0;
690    _PyUnicode_STATE(unicode).kind = 0;
691    _PyUnicode_STATE(unicode).compact = 0;
692    _PyUnicode_STATE(unicode).ready = 0;
693    _PyUnicode_STATE(unicode).ascii = 0;
694    _PyUnicode_DATA_ANY(unicode) = NULL;
695    _PyUnicode_LENGTH(unicode) = 0;
696    _PyUnicode_UTF8(unicode) = NULL;
697    _PyUnicode_UTF8_LENGTH(unicode) = 0;
698    return unicode;
699
700  onError:
701    /* XXX UNREF/NEWREF interface should be more symmetrical */
702    _Py_DEC_REFTOTAL;
703    _Py_ForgetReference((PyObject *)unicode);
704    PyObject_Del(unicode);
705    return NULL;
706}
707
708static const char*
709unicode_kind_name(PyObject *unicode)
710{
711    /* don't check consistency: unicode_kind_name() is called from
712       _PyUnicode_Dump() */
713    if (!PyUnicode_IS_COMPACT(unicode))
714    {
715        if (!PyUnicode_IS_READY(unicode))
716            return "wstr";
717        switch(PyUnicode_KIND(unicode))
718        {
719        case PyUnicode_1BYTE_KIND:
720            if (PyUnicode_IS_ASCII(unicode))
721                return "legacy ascii";
722            else
723                return "legacy latin1";
724        case PyUnicode_2BYTE_KIND:
725            return "legacy UCS2";
726        case PyUnicode_4BYTE_KIND:
727            return "legacy UCS4";
728        default:
729            return "<legacy invalid kind>";
730        }
731    }
732    assert(PyUnicode_IS_READY(unicode));
733    switch(PyUnicode_KIND(unicode))
734    {
735    case PyUnicode_1BYTE_KIND:
736        if (PyUnicode_IS_ASCII(unicode))
737            return "ascii";
738        else
739            return "latin1";
740    case PyUnicode_2BYTE_KIND:
741        return "UCS2";
742    case PyUnicode_4BYTE_KIND:
743        return "UCS4";
744    default:
745        return "<invalid compact kind>";
746    }
747}
748
749#ifdef Py_DEBUG
750int unicode_new_new_calls = 0;
751
752/* Functions wrapping macros for use in debugger */
753char *_PyUnicode_utf8(void *unicode){
754    return PyUnicode_UTF8(unicode);
755}
756
757void *_PyUnicode_compact_data(void *unicode) {
758    return _PyUnicode_COMPACT_DATA(unicode);
759}
760void *_PyUnicode_data(void *unicode){
761    printf("obj %p\n", unicode);
762    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
763    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
764    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
765    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
766    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
767    return PyUnicode_DATA(unicode);
768}
769
770void
771_PyUnicode_Dump(PyObject *op)
772{
773    PyASCIIObject *ascii = (PyASCIIObject *)op;
774    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
775    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
776    void *data;
777    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
778    if (ascii->state.compact)
779        data = (compact + 1);
780    else
781        data = unicode->data.any;
782    if (ascii->wstr == data)
783        printf("shared ");
784    printf("wstr=%p", ascii->wstr);
785    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
786        printf(" (%zu), ", compact->wstr_length);
787        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
788            printf("shared ");
789        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
790    }
791    printf(", data=%p\n", data);
792}
793#endif
794
795PyObject *
796PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
797{
798    PyObject *obj;
799    PyCompactUnicodeObject *unicode;
800    void *data;
801    int kind_state;
802    int is_sharing, is_ascii;
803    Py_ssize_t char_size;
804    Py_ssize_t struct_size;
805
806    /* Optimization for empty strings */
807    if (size == 0 && unicode_empty != NULL) {
808        Py_INCREF(unicode_empty);
809        return unicode_empty;
810    }
811
812#ifdef Py_DEBUG
813    ++unicode_new_new_calls;
814#endif
815
816    is_ascii = 0;
817    is_sharing = 0;
818    struct_size = sizeof(PyCompactUnicodeObject);
819    if (maxchar < 128) {
820        kind_state = PyUnicode_1BYTE_KIND;
821        char_size = 1;
822        is_ascii = 1;
823        struct_size = sizeof(PyASCIIObject);
824    }
825    else if (maxchar < 256) {
826        kind_state = PyUnicode_1BYTE_KIND;
827        char_size = 1;
828    }
829    else if (maxchar < 65536) {
830        kind_state = PyUnicode_2BYTE_KIND;
831        char_size = 2;
832        if (sizeof(wchar_t) == 2)
833            is_sharing = 1;
834    }
835    else {
836        kind_state = PyUnicode_4BYTE_KIND;
837        char_size = 4;
838        if (sizeof(wchar_t) == 4)
839            is_sharing = 1;
840    }
841
842    /* Ensure we won't overflow the size. */
843    if (size < 0) {
844        PyErr_SetString(PyExc_SystemError,
845                        "Negative size passed to PyUnicode_New");
846        return NULL;
847    }
848    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
849        return PyErr_NoMemory();
850
851    /* Duplicated allocation code from _PyObject_New() instead of a call to
852     * PyObject_New() so we are able to allocate space for the object and
853     * it's data buffer.
854     */
855    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
856    if (obj == NULL)
857        return PyErr_NoMemory();
858    obj = PyObject_INIT(obj, &PyUnicode_Type);
859    if (obj == NULL)
860        return NULL;
861
862    unicode = (PyCompactUnicodeObject *)obj;
863    if (is_ascii)
864        data = ((PyASCIIObject*)obj) + 1;
865    else
866        data = unicode + 1;
867    _PyUnicode_LENGTH(unicode) = size;
868    _PyUnicode_HASH(unicode) = -1;
869    _PyUnicode_STATE(unicode).interned = 0;
870    _PyUnicode_STATE(unicode).kind = kind_state;
871    _PyUnicode_STATE(unicode).compact = 1;
872    _PyUnicode_STATE(unicode).ready = 1;
873    _PyUnicode_STATE(unicode).ascii = is_ascii;
874    if (is_ascii) {
875        ((char*)data)[size] = 0;
876        _PyUnicode_WSTR(unicode) = NULL;
877    }
878    else if (kind_state == PyUnicode_1BYTE_KIND) {
879        ((char*)data)[size] = 0;
880        _PyUnicode_WSTR(unicode) = NULL;
881        _PyUnicode_WSTR_LENGTH(unicode) = 0;
882        unicode->utf8 = NULL;
883        unicode->utf8_length = 0;
884        }
885    else {
886        unicode->utf8 = NULL;
887        unicode->utf8_length = 0;
888        if (kind_state == PyUnicode_2BYTE_KIND)
889            ((Py_UCS2*)data)[size] = 0;
890        else /* kind_state == PyUnicode_4BYTE_KIND */
891            ((Py_UCS4*)data)[size] = 0;
892        if (is_sharing) {
893            _PyUnicode_WSTR_LENGTH(unicode) = size;
894            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
895        }
896        else {
897            _PyUnicode_WSTR_LENGTH(unicode) = 0;
898            _PyUnicode_WSTR(unicode) = NULL;
899        }
900    }
901    assert(_PyUnicode_CheckConsistency(unicode, 0));
902    return obj;
903}
904
#if SIZEOF_WCHAR_T == 2
/* Expand a 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   `unicode`.

   A high surrogate (0xD800..0xDBFF) immediately followed by a low surrogate
   (0xDC00..0xDFFF) is combined into a single code point >= 0x10000; every
   other unit, including an unpaired surrogate, is copied unchanged.

   `unicode` must already be a 4-byte kind string (asserted) whose length
   equals the number of code points produced; this is checked by assertions
   only.  No terminating null character is written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *ucs4_out;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;

        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end
            && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* valid surrogate pair: consume both units */
            ch = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            ch = *src;
            src += 1;
        }
        *ucs4_out++ = ch;
    }
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));
}
#endif
943
944static int
945_PyUnicode_Dirty(PyObject *unicode)
946{
947    assert(_PyUnicode_CHECK(unicode));
948    if (Py_REFCNT(unicode) != 1) {
949        PyErr_SetString(PyExc_SystemError,
950                        "Cannot modify a string having more than 1 reference");
951        return -1;
952    }
953    _PyUnicode_DIRTY(unicode);
954    return 0;
955}
956
957static int
958_copy_characters(PyObject *to, Py_ssize_t to_start,
959                 PyObject *from, Py_ssize_t from_start,
960                 Py_ssize_t how_many, int check_maxchar)
961{
962    unsigned int from_kind, to_kind;
963    void *from_data, *to_data;
964    int fast;
965
966    assert(PyUnicode_Check(from));
967    assert(PyUnicode_Check(to));
968    assert(PyUnicode_IS_READY(from));
969    assert(PyUnicode_IS_READY(to));
970
971    assert(PyUnicode_GET_LENGTH(from) >= how_many);
972    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
973    assert(0 <= how_many);
974
975    if (how_many == 0)
976        return 0;
977
978    from_kind = PyUnicode_KIND(from);
979    from_data = PyUnicode_DATA(from);
980    to_kind = PyUnicode_KIND(to);
981    to_data = PyUnicode_DATA(to);
982
983#ifdef Py_DEBUG
984    if (!check_maxchar
985        && (from_kind > to_kind
986            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
987    {
988        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
989        Py_UCS4 ch;
990        Py_ssize_t i;
991        for (i=0; i < how_many; i++) {
992            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
993            assert(ch <= to_maxchar);
994        }
995    }
996#endif
997    fast = (from_kind == to_kind);
998    if (check_maxchar
999        && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1000    {
1001        /* deny latin1 => ascii */
1002        fast = 0;
1003    }
1004
1005    if (fast) {
1006        Py_MEMCPY((char*)to_data
1007                      + PyUnicode_KIND_SIZE(to_kind, to_start),
1008                  (char*)from_data
1009                      + PyUnicode_KIND_SIZE(from_kind, from_start),
1010                  PyUnicode_KIND_SIZE(to_kind, how_many));
1011    }
1012    else if (from_kind == PyUnicode_1BYTE_KIND
1013             && to_kind == PyUnicode_2BYTE_KIND)
1014    {
1015        _PyUnicode_CONVERT_BYTES(
1016            Py_UCS1, Py_UCS2,
1017            PyUnicode_1BYTE_DATA(from) + from_start,
1018            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1019            PyUnicode_2BYTE_DATA(to) + to_start
1020            );
1021    }
1022    else if (from_kind == PyUnicode_1BYTE_KIND
1023             && to_kind == PyUnicode_4BYTE_KIND)
1024    {
1025        _PyUnicode_CONVERT_BYTES(
1026            Py_UCS1, Py_UCS4,
1027            PyUnicode_1BYTE_DATA(from) + from_start,
1028            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1029            PyUnicode_4BYTE_DATA(to) + to_start
1030            );
1031    }
1032    else if (from_kind == PyUnicode_2BYTE_KIND
1033             && to_kind == PyUnicode_4BYTE_KIND)
1034    {
1035        _PyUnicode_CONVERT_BYTES(
1036            Py_UCS2, Py_UCS4,
1037            PyUnicode_2BYTE_DATA(from) + from_start,
1038            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1039            PyUnicode_4BYTE_DATA(to) + to_start
1040            );
1041    }
1042    else {
1043        /* check if max_char(from substring) <= max_char(to) */
1044        if (from_kind > to_kind
1045                /* latin1 => ascii */
1046            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1047        {
1048            /* slow path to check for character overflow */
1049            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1050            Py_UCS4 ch;
1051            Py_ssize_t i;
1052
1053            for (i=0; i < how_many; i++) {
1054                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1055                if (check_maxchar) {
1056                    if (ch > to_maxchar)
1057                        return 1;
1058                }
1059                else {
1060                    assert(ch <= to_maxchar);
1061                }
1062                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1063            }
1064        }
1065        else {
1066            return -1;
1067        }
1068    }
1069    return 0;
1070}
1071
1072static void
1073copy_characters(PyObject *to, Py_ssize_t to_start,
1074                       PyObject *from, Py_ssize_t from_start,
1075                       Py_ssize_t how_many)
1076{
1077    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1078}
1079
1080Py_ssize_t
1081PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1082                         PyObject *from, Py_ssize_t from_start,
1083                         Py_ssize_t how_many)
1084{
1085    int err;
1086
1087    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1088        PyErr_BadInternalCall();
1089        return -1;
1090    }
1091
1092    if (PyUnicode_READY(from))
1093        return -1;
1094    if (PyUnicode_READY(to))
1095        return -1;
1096
1097    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1098    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1099        PyErr_Format(PyExc_SystemError,
1100                     "Cannot write %zi characters at %zi "
1101                     "in a string of %zi characters",
1102                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1103        return -1;
1104    }
1105
1106    if (how_many == 0)
1107        return 0;
1108
1109    if (_PyUnicode_Dirty(to))
1110        return -1;
1111
1112    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1113    if (err) {
1114        PyErr_Format(PyExc_SystemError,
1115                     "Cannot copy %s characters "
1116                     "into a string of %s characters",
1117                     unicode_kind_name(from),
1118                     unicode_kind_name(to));
1119        return -1;
1120    }
1121    return how_many;
1122}
1123
1124/* Find the maximum code point and count the number of surrogate pairs so a
1125   correct string length can be computed before converting a string to UCS4.
1126   This function counts single surrogates as a character and not as a pair.
1127
1128   Return 0 on success, or -1 on error. */
1129static int
1130find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1131                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1132{
1133    const wchar_t *iter;
1134
1135    assert(num_surrogates != NULL && maxchar != NULL);
1136    *num_surrogates = 0;
1137    *maxchar = 0;
1138
1139    for (iter = begin; iter < end; ) {
1140        if (*iter > *maxchar) {
1141            *maxchar = *iter;
1142#if SIZEOF_WCHAR_T != 2
1143            if (*maxchar >= 0x10000)
1144                return 0;
1145#endif
1146        }
1147#if SIZEOF_WCHAR_T == 2
1148        if (*iter >= 0xD800 && *iter <= 0xDBFF
1149            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1150        {
1151            Py_UCS4 surrogate_val;
1152            surrogate_val = (((iter[0] & 0x3FF)<<10)
1153                             | (iter[1] & 0x3FF)) + 0x10000;
1154            ++(*num_surrogates);
1155            if (surrogate_val > *maxchar)
1156                *maxchar = surrogate_val;
1157            iter += 2;
1158        }
1159        else
1160            iter++;
1161#else
1162        iter++;
1163#endif
1164    }
1165    return 0;
1166}
1167
1168#ifdef Py_DEBUG
1169int unicode_ready_calls = 0;
1170#endif
1171
1172static int
1173unicode_ready(PyObject **p_obj, int replace)
1174{
1175    PyUnicodeObject *unicode;
1176    wchar_t *end;
1177    Py_UCS4 maxchar = 0;
1178    Py_ssize_t num_surrogates;
1179#if SIZEOF_WCHAR_T == 2
1180    Py_ssize_t length_wo_surrogates;
1181#endif
1182
1183    assert(p_obj != NULL);
1184    unicode = (PyUnicodeObject *)*p_obj;
1185
1186    /* _PyUnicode_Ready() is only intended for old-style API usage where
1187       strings were created using _PyObject_New() and where no canonical
1188       representation (the str field) has been set yet aka strings
1189       which are not yet ready. */
1190    assert(_PyUnicode_CHECK(unicode));
1191    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1192    assert(_PyUnicode_WSTR(unicode) != NULL);
1193    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1194    assert(_PyUnicode_UTF8(unicode) == NULL);
1195    /* Actually, it should neither be interned nor be anything else: */
1196    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1197
1198#ifdef Py_DEBUG
1199    ++unicode_ready_calls;
1200#endif
1201
1202#ifdef Py_DEBUG
1203    assert(!replace || Py_REFCNT(unicode) == 1);
1204#else
1205    if (replace && Py_REFCNT(unicode) != 1)
1206        replace = 0;
1207#endif
1208    if (replace) {
1209        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1210        wchar_t *wstr = _PyUnicode_WSTR(unicode);
1211        /* Optimization for empty strings */
1212        if (len == 0) {
1213            Py_INCREF(unicode_empty);
1214            Py_DECREF(*p_obj);
1215            *p_obj = unicode_empty;
1216            return 0;
1217        }
1218        if (len == 1 && wstr[0] < 256) {
1219            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1220            if (latin1_char == NULL)
1221                return -1;
1222            Py_DECREF(*p_obj);
1223            *p_obj = latin1_char;
1224            return 0;
1225        }
1226    }
1227
1228    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1229    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1230                                &maxchar, &num_surrogates) == -1)
1231        return -1;
1232
1233    if (maxchar < 256) {
1234        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1235        if (!_PyUnicode_DATA_ANY(unicode)) {
1236            PyErr_NoMemory();
1237            return -1;
1238        }
1239        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1240                                _PyUnicode_WSTR(unicode), end,
1241                                PyUnicode_1BYTE_DATA(unicode));
1242        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1243        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1244        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1245        if (maxchar < 128) {
1246            _PyUnicode_STATE(unicode).ascii = 1;
1247            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1248            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1249        }
1250        else {
1251            _PyUnicode_STATE(unicode).ascii = 0;
1252            _PyUnicode_UTF8(unicode) = NULL;
1253            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1254        }
1255        PyObject_FREE(_PyUnicode_WSTR(unicode));
1256        _PyUnicode_WSTR(unicode) = NULL;
1257        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1258    }
1259    /* In this case we might have to convert down from 4-byte native
1260       wchar_t to 2-byte unicode. */
1261    else if (maxchar < 65536) {
1262        assert(num_surrogates == 0 &&
1263               "FindMaxCharAndNumSurrogatePairs() messed up");
1264
1265#if SIZEOF_WCHAR_T == 2
1266        /* We can share representations and are done. */
1267        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1268        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1269        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1270        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1271        _PyUnicode_UTF8(unicode) = NULL;
1272        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1273#else
1274        /* sizeof(wchar_t) == 4 */
1275        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1276            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1277        if (!_PyUnicode_DATA_ANY(unicode)) {
1278            PyErr_NoMemory();
1279            return -1;
1280        }
1281        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1282                                _PyUnicode_WSTR(unicode), end,
1283                                PyUnicode_2BYTE_DATA(unicode));
1284        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1285        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1286        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1287        _PyUnicode_UTF8(unicode) = NULL;
1288        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1289        PyObject_FREE(_PyUnicode_WSTR(unicode));
1290        _PyUnicode_WSTR(unicode) = NULL;
1291        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1292#endif
1293    }
1294    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1295    else {
1296#if SIZEOF_WCHAR_T == 2
1297        /* in case the native representation is 2-bytes, we need to allocate a
1298           new normalized 4-byte version. */
1299        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1300        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1301        if (!_PyUnicode_DATA_ANY(unicode)) {
1302            PyErr_NoMemory();
1303            return -1;
1304        }
1305        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1306        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1307        _PyUnicode_UTF8(unicode) = NULL;
1308        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1309        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1310        _PyUnicode_STATE(unicode).ready = 1;
1311        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1312        PyObject_FREE(_PyUnicode_WSTR(unicode));
1313        _PyUnicode_WSTR(unicode) = NULL;
1314        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1315#else
1316        assert(num_surrogates == 0);
1317
1318        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1319        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1320        _PyUnicode_UTF8(unicode) = NULL;
1321        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1322        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1323#endif
1324        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1325    }
1326    _PyUnicode_STATE(unicode).ready = 1;
1327    assert(_PyUnicode_CheckConsistency(unicode, 1));
1328    return 0;
1329}
1330
1331int
1332_PyUnicode_ReadyReplace(PyObject **op)
1333{
1334    return unicode_ready(op, 1);
1335}
1336
1337int
1338_PyUnicode_Ready(PyObject *op)
1339{
1340    return unicode_ready(&op, 0);
1341}
1342
1343static void
1344unicode_dealloc(register PyUnicodeObject *unicode)
1345{
1346    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1347    case SSTATE_NOT_INTERNED:
1348        break;
1349
1350    case SSTATE_INTERNED_MORTAL:
1351        /* revive dead object temporarily for DelItem */
1352        Py_REFCNT(unicode) = 3;
1353        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1354            Py_FatalError(
1355                "deletion of interned string failed");
1356        break;
1357
1358    case SSTATE_INTERNED_IMMORTAL:
1359        Py_FatalError("Immortal interned string died.");
1360
1361    default:
1362        Py_FatalError("Inconsistent interned string state.");
1363    }
1364
1365    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1366        PyObject_DEL(_PyUnicode_WSTR(unicode));
1367    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1368        PyObject_DEL(_PyUnicode_UTF8(unicode));
1369
1370    if (PyUnicode_IS_COMPACT(unicode)) {
1371        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1372    }
1373    else {
1374        if (_PyUnicode_DATA_ANY(unicode))
1375            PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1376        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1377    }
1378}
1379
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string or a cached single character below 256), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *base = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;
    /* Only ready strings of length 1 can be a cached character. */
    if (base->state.kind == PyUnicode_WCHAR_KIND || base->length != 1)
        return 0;
    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode);
}
#endif
1396
1397static int
1398unicode_resizable(PyObject *unicode)
1399{
1400    if (Py_REFCNT(unicode) != 1)
1401        return 0;
1402    if (PyUnicode_CHECK_INTERNED(unicode))
1403        return 0;
1404#ifdef Py_DEBUG
1405    /* singleton refcount is greater than 1 */
1406    assert(!unicode_is_singleton(unicode));
1407#endif
1408    return 1;
1409}
1410
1411static int
1412unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1413{
1414    PyObject *unicode;
1415    Py_ssize_t old_length;
1416
1417    assert(p_unicode != NULL);
1418    unicode = *p_unicode;
1419
1420    assert(unicode != NULL);
1421    assert(PyUnicode_Check(unicode));
1422    assert(0 <= length);
1423
1424    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1425        old_length = PyUnicode_WSTR_LENGTH(unicode);
1426    else
1427        old_length = PyUnicode_GET_LENGTH(unicode);
1428    if (old_length == length)
1429        return 0;
1430
1431    if (!unicode_resizable(unicode)) {
1432        PyObject *copy = resize_copy(unicode, length);
1433        if (copy == NULL)
1434            return -1;
1435        Py_DECREF(*p_unicode);
1436        *p_unicode = copy;
1437        return 0;
1438    }
1439
1440    if (PyUnicode_IS_COMPACT(unicode)) {
1441        *p_unicode = resize_compact(unicode, length);
1442        if (*p_unicode == NULL)
1443            return -1;
1444        assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
1445        return 0;
1446    }
1447    return resize_inplace((PyUnicodeObject*)unicode, length);
1448}
1449
1450int
1451PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1452{
1453    PyObject *unicode;
1454    if (p_unicode == NULL) {
1455        PyErr_BadInternalCall();
1456        return -1;
1457    }
1458    unicode = *p_unicode;
1459    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1460        || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1461    {
1462        PyErr_BadInternalCall();
1463        return -1;
1464    }
1465    return unicode_resize(p_unicode, length);
1466}
1467
1468static PyObject*
1469get_latin1_char(unsigned char ch)
1470{
1471    PyObject *unicode = unicode_latin1[ch];
1472    if (!unicode) {
1473        unicode = PyUnicode_New(1, ch);
1474        if (!unicode)
1475            return NULL;
1476        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1477        assert(_PyUnicode_CheckConsistency(unicode, 1));
1478        unicode_latin1[ch] = unicode;
1479    }
1480    Py_INCREF(unicode);
1481    return unicode;
1482}
1483
1484PyObject *
1485PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1486{
1487    PyUnicodeObject *unicode;
1488    Py_UCS4 maxchar = 0;
1489    Py_ssize_t num_surrogates;
1490
1491    if (u == NULL)
1492        return (PyObject*)_PyUnicode_New(size);
1493
1494    /* If the Unicode data is known at construction time, we can apply
1495       some optimizations which share commonly used objects. */
1496
1497    /* Optimization for empty strings */
1498    if (size == 0 && unicode_empty != NULL) {
1499        Py_INCREF(unicode_empty);
1500        return unicode_empty;
1501    }
1502
1503    /* Single character Unicode objects in the Latin-1 range are
1504       shared when using this constructor */
1505    if (size == 1 && *u < 256)
1506        return get_latin1_char((unsigned char)*u);
1507
1508    /* If not empty and not single character, copy the Unicode data
1509       into the new object */
1510    if (find_maxchar_surrogates(u, u + size,
1511                                &maxchar, &num_surrogates) == -1)
1512        return NULL;
1513
1514    unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1515                                                maxchar);
1516    if (!unicode)
1517        return NULL;
1518
1519    switch (PyUnicode_KIND(unicode)) {
1520    case PyUnicode_1BYTE_KIND:
1521        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1522                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1523        break;
1524    case PyUnicode_2BYTE_KIND:
1525#if Py_UNICODE_SIZE == 2
1526        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1527#else
1528        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1529                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1530#endif
1531        break;
1532    case PyUnicode_4BYTE_KIND:
1533#if SIZEOF_WCHAR_T == 2
1534        /* This is the only case which has to process surrogates, thus
1535           a simple copy loop is not enough and we need a function. */
1536        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1537#else
1538        assert(num_surrogates == 0);
1539        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1540#endif
1541        break;
1542    default:
1543        assert(0 && "Impossible state");
1544    }
1545
1546    assert(_PyUnicode_CheckConsistency(unicode, 1));
1547    return (PyObject *)unicode;
1548}
1549
1550PyObject *
1551PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1552{
1553    PyUnicodeObject *unicode;
1554
1555    if (size < 0) {
1556        PyErr_SetString(PyExc_SystemError,
1557                        "Negative size passed to PyUnicode_FromStringAndSize");
1558        return NULL;
1559    }
1560
1561    /* If the Unicode data is known at construction time, we can apply
1562       some optimizations which share commonly used objects.
1563       Also, this means the input must be UTF-8, so fall back to the
1564       UTF-8 decoder at the end. */
1565    if (u != NULL) {
1566
1567        /* Optimization for empty strings */
1568        if (size == 0 && unicode_empty != NULL) {
1569            Py_INCREF(unicode_empty);
1570            return unicode_empty;
1571        }
1572
1573        /* Single characters are shared when using this constructor.
1574           Restrict to ASCII, since the input must be UTF-8. */
1575        if (size == 1 && Py_CHARMASK(*u) < 128)
1576            return get_latin1_char(Py_CHARMASK(*u));
1577
1578        return PyUnicode_DecodeUTF8(u, size, NULL);
1579    }
1580
1581    unicode = _PyUnicode_New(size);
1582    if (!unicode)
1583        return NULL;
1584
1585    return (PyObject *)unicode;
1586}
1587
1588PyObject *
1589PyUnicode_FromString(const char *u)
1590{
1591    size_t size = strlen(u);
1592    if (size > PY_SSIZE_T_MAX) {
1593        PyErr_SetString(PyExc_OverflowError, "input too long");
1594        return NULL;
1595    }
1596
1597    return PyUnicode_FromStringAndSize(u, size);
1598}
1599
1600static PyObject*
1601unicode_fromascii(const unsigned char* s, Py_ssize_t size)
1602{
1603    PyObject *res;
1604#ifdef Py_DEBUG
1605    const unsigned char *p;
1606    const unsigned char *end = s + size;
1607    for (p=s; p < end; p++) {
1608        assert(*p < 128);
1609    }
1610#endif
1611    res = PyUnicode_New(size, 127);
1612    if (!res)
1613        return NULL;
1614    memcpy(PyUnicode_1BYTE_DATA(res), s, size);
1615    return res;
1616}
1617
1618static Py_UCS4
1619kind_maxchar_limit(unsigned int kind)
1620{
1621    switch(kind) {
1622    case PyUnicode_1BYTE_KIND:
1623        return 0x80;
1624    case PyUnicode_2BYTE_KIND:
1625        return 0x100;
1626    case PyUnicode_4BYTE_KIND:
1627        return 0x10000;
1628    default:
1629        assert(0 && "invalid kind");
1630        return 0x10ffff;
1631    }
1632}
1633
1634static PyObject*
1635_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1636{
1637    PyObject *res;
1638    unsigned char max_char = 127;
1639    Py_ssize_t i;
1640
1641    assert(size >= 0);
1642    for (i = 0; i < size; i++) {
1643        if (u[i] & 0x80) {
1644            max_char = 255;
1645            break;
1646        }
1647    }
1648    res = PyUnicode_New(size, max_char);
1649    if (!res)
1650        return NULL;
1651    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1652    assert(_PyUnicode_CheckConsistency(res, 1));
1653    return res;
1654}
1655
1656static PyObject*
1657_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1658{
1659    PyObject *res;
1660    Py_UCS2 max_char = 0;
1661    Py_ssize_t i;
1662
1663    assert(size >= 0);
1664    for (i = 0; i < size; i++) {
1665        if (u[i] > max_char) {
1666            max_char = u[i];
1667            if (max_char >= 256)
1668                break;
1669        }
1670    }
1671    res = PyUnicode_New(size, max_char);
1672    if (!res)
1673        return NULL;
1674    if (max_char >= 256)
1675        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1676    else
1677        for (i = 0; i < size; i++)
1678            PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1679    assert(_PyUnicode_CheckConsistency(res, 1));
1680    return res;
1681}
1682
1683static PyObject*
1684_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1685{
1686    PyObject *res;
1687    Py_UCS4 max_char = 0;
1688    Py_ssize_t i;
1689
1690    assert(size >= 0);
1691    for (i = 0; i < size; i++) {
1692        if (u[i] > max_char) {
1693            max_char = u[i];
1694            if (max_char >= 0x10000)
1695                break;
1696        }
1697    }
1698    res = PyUnicode_New(size, max_char);
1699    if (!res)
1700        return NULL;
1701    if (max_char >= 0x10000)
1702        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1703    else {
1704        int kind = PyUnicode_KIND(res);
1705        void *data = PyUnicode_DATA(res);
1706        for (i = 0; i < size; i++)
1707            PyUnicode_WRITE(kind, data, i, u[i]);
1708    }
1709    assert(_PyUnicode_CheckConsistency(res, 1));
1710    return res;
1711}
1712
1713PyObject*
1714PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1715{
1716    switch(kind) {
1717    case PyUnicode_1BYTE_KIND:
1718        return _PyUnicode_FromUCS1(buffer, size);
1719    case PyUnicode_2BYTE_KIND:
1720        return _PyUnicode_FromUCS2(buffer, size);
1721    case PyUnicode_4BYTE_KIND:
1722        return _PyUnicode_FromUCS4(buffer, size);
1723    default:
1724        assert(0 && "invalid kind");
1725        PyErr_SetString(PyExc_SystemError, "invalid kind");
1726        return NULL;
1727    }
1728}
1729
1730PyObject*
1731PyUnicode_Copy(PyObject *unicode)
1732{
1733    Py_ssize_t size;
1734    PyObject *copy;
1735    void *data;
1736
1737    if (!PyUnicode_Check(unicode)) {
1738        PyErr_BadInternalCall();
1739        return NULL;
1740    }
1741    if (PyUnicode_READY(unicode))
1742        return NULL;
1743
1744    size = PyUnicode_GET_LENGTH(unicode);
1745    copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1746    if (!copy)
1747        return NULL;
1748    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1749
1750    data = PyUnicode_DATA(unicode);
1751    switch (PyUnicode_KIND(unicode))
1752    {
1753    case PyUnicode_1BYTE_KIND:
1754        memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1755        break;
1756    case PyUnicode_2BYTE_KIND:
1757        memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1758        break;
1759    case PyUnicode_4BYTE_KIND:
1760        memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1761        break;
1762    default:
1763        assert(0);
1764        break;
1765    }
1766    assert(_PyUnicode_CheckConsistency(copy, 1));
1767    return copy;
1768}
1769
1770
1771/* Widen Unicode objects to larger buffers. Don't write terminating null
1772   character. Return NULL on error. */
1773
1774void*
1775_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1776{
1777    Py_ssize_t len;
1778    void *result;
1779    unsigned int skind;
1780
1781    if (PyUnicode_READY(s))
1782        return NULL;
1783
1784    len = PyUnicode_GET_LENGTH(s);
1785    skind = PyUnicode_KIND(s);
1786    if (skind >= kind) {
1787        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
1788        return NULL;
1789    }
1790    switch(kind) {
1791    case PyUnicode_2BYTE_KIND:
1792        result = PyMem_Malloc(len * sizeof(Py_UCS2));
1793        if (!result)
1794            return PyErr_NoMemory();
1795        assert(skind == PyUnicode_1BYTE_KIND);
1796        _PyUnicode_CONVERT_BYTES(
1797            Py_UCS1, Py_UCS2,
1798            PyUnicode_1BYTE_DATA(s),
1799            PyUnicode_1BYTE_DATA(s) + len,
1800            result);
1801        return result;
1802    case PyUnicode_4BYTE_KIND:
1803        result = PyMem_Malloc(len * sizeof(Py_UCS4));
1804        if (!result)
1805            return PyErr_NoMemory();
1806        if (skind == PyUnicode_2BYTE_KIND) {
1807            _PyUnicode_CONVERT_BYTES(
1808                Py_UCS2, Py_UCS4,
1809                PyUnicode_2BYTE_DATA(s),
1810                PyUnicode_2BYTE_DATA(s) + len,
1811                result);
1812        }
1813        else {
1814            assert(skind == PyUnicode_1BYTE_KIND);
1815            _PyUnicode_CONVERT_BYTES(
1816                Py_UCS1, Py_UCS4,
1817                PyUnicode_1BYTE_DATA(s),
1818                PyUnicode_1BYTE_DATA(s) + len,
1819                result);
1820        }
1821        return result;
1822    default:
1823        break;
1824    }
1825    PyErr_SetString(PyExc_SystemError, "invalid kind");
1826    return NULL;
1827}
1828
1829static Py_UCS4*
1830as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1831        int copy_null)
1832{
1833    int kind;
1834    void *data;
1835    Py_ssize_t len, targetlen;
1836    if (PyUnicode_READY(string) == -1)
1837        return NULL;
1838    kind = PyUnicode_KIND(string);
1839    data = PyUnicode_DATA(string);
1840    len = PyUnicode_GET_LENGTH(string);
1841    targetlen = len;
1842    if (copy_null)
1843        targetlen++;
1844    if (!target) {
1845        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1846            PyErr_NoMemory();
1847            return NULL;
1848        }
1849        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1850        if (!target) {
1851            PyErr_NoMemory();
1852            return NULL;
1853        }
1854    }
1855    else {
1856        if (targetsize < targetlen) {
1857            PyErr_Format(PyExc_SystemError,
1858                         "string is longer than the buffer");
1859            if (copy_null && 0 < targetsize)
1860                target[0] = 0;
1861            return NULL;
1862        }
1863    }
1864    if (kind != PyUnicode_4BYTE_KIND) {
1865        Py_ssize_t i;
1866        for (i = 0; i < len; i++)
1867            target[i] = PyUnicode_READ(kind, data, i);
1868    }
1869    else
1870        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1871    if (copy_null)
1872        target[len] = 0;
1873    return target;
1874}
1875
1876Py_UCS4*
1877PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1878                 int copy_null)
1879{
1880    if (target == NULL || targetsize < 1) {
1881        PyErr_BadInternalCall();
1882        return NULL;
1883    }
1884    return as_ucs4(string, target, targetsize, copy_null);
1885}
1886
1887Py_UCS4*
1888PyUnicode_AsUCS4Copy(PyObject *string)
1889{
1890    return as_ucs4(string, NULL, 0, 1);
1891}
1892
1893#ifdef HAVE_WCHAR_H
1894
1895PyObject *
1896PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
1897{
1898    if (w == NULL) {
1899        if (size == 0)
1900            return PyUnicode_New(0, 0);
1901        PyErr_BadInternalCall();
1902        return NULL;
1903    }
1904
1905    if (size == -1) {
1906        size = wcslen(w);
1907    }
1908
1909    return PyUnicode_FromUnicode(w, size);
1910}
1911
1912#endif /* HAVE_WCHAR_H */
1913
1914static void
1915makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1916        int zeropad, int width, int precision, char c)
1917{
1918    *fmt++ = '%';
1919    if (width) {
1920        if (zeropad)
1921            *fmt++ = '0';
1922        fmt += sprintf(fmt, "%d", width);
1923    }
1924    if (precision)
1925        fmt += sprintf(fmt, ".%d", precision);
1926    if (longflag)
1927        *fmt++ = 'l';
1928    else if (longlongflag) {
1929        /* longlongflag should only ever be nonzero on machines with
1930           HAVE_LONG_LONG defined */
1931#ifdef HAVE_LONG_LONG
1932        char *f = PY_FORMAT_LONG_LONG;
1933        while (*f)
1934            *fmt++ = *f++;
1935#else
1936        /* we shouldn't ever get here */
1937        assert(0);
1938        *fmt++ = 'l';
1939#endif
1940    }
1941    else if (size_tflag) {
1942        char *f = PY_FORMAT_SIZE_T;
1943        while (*f)
1944            *fmt++ = *f++;
1945    }
1946    *fmt++ = c;
1947    *fmt = '\0';
1948}
1949
/* helper for PyUnicode_FromFormatV()

   `f` must point at a '%'.  Parse the optional "width", ".precision" and
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z") that
   follow it.  A length modifier is only recognised when it is directly
   followed by 'd', 'u' or 'i'.

   Each p_* argument may be NULL, in which case that value is not stored.

   Return a pointer to the character the caller should dispatch on: normally
   the conversion character.  For malformed input the pointer is moved one
   character backward (see the comments below) so that the caller's own f++
   does not run past the end of the string. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld, %llu and the size_t forms %zd, %zu. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2014
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign.
   PyUnicode_FromFormatV() uses this as the minimum room it reserves for a
   formatted number that is not a long long. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   PyUnicode_FromFormatV() uses this as the minimum room it reserves for a
   formatted long long. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2022
2023PyObject *
2024PyUnicode_FromFormatV(const char *format, va_list vargs)
2025{
2026    va_list count;
2027    Py_ssize_t callcount = 0;
2028    PyObject **callresults = NULL;
2029    PyObject **callresult = NULL;
2030    Py_ssize_t n = 0;
2031    int width = 0;
2032    int precision = 0;
2033    int zeropad;
2034    const char* f;
2035    PyObject *string;
2036    /* used by sprintf */
2037    char fmt[61]; /* should be enough for %0width.precisionlld */
2038    Py_UCS4 maxchar = 127; /* result is ASCII by default */
2039    Py_UCS4 argmaxchar;
2040    Py_ssize_t numbersize = 0;
2041    char *numberresults = NULL;
2042    char *numberresult = NULL;
2043    Py_ssize_t i;
2044    int kind;
2045    void *data;
2046
2047    Py_VA_COPY(count, vargs);
2048    /* step 1: count the number of %S/%R/%A/%s format specifications
2049     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2050     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2051     * result in an array)
2052     * also estimate a upper bound for all the number formats in the string,
2053     * numbers will be formatted in step 3 and be kept in a '\0'-separated
2054     * buffer before putting everything together. */
2055    for (f = format; *f; f++) {
2056        if (*f == '%') {
2057            int longlongflag;
2058            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2059            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2060            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2061                ++callcount;
2062
2063            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
2064#ifdef HAVE_LONG_LONG
2065                if (longlongflag) {
2066                    if (width < MAX_LONG_LONG_CHARS)
2067                        width = MAX_LONG_LONG_CHARS;
2068                }
2069                else
2070#endif
2071                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2072                       including sign.  Decimal takes the most space.  This
2073                       isn't enough for octal.  If a width is specified we
2074                       need more (which we allocate later). */
2075                    if (width < MAX_LONG_CHARS)
2076                        width = MAX_LONG_CHARS;
2077
2078                /* account for the size + '\0' to separate numbers
2079                   inside of the numberresults buffer */
2080                numbersize += (width + 1);
2081            }
2082        }
2083        else if ((unsigned char)*f > 127) {
2084            PyErr_Format(PyExc_ValueError,
2085                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2086                "string, got a non-ASCII byte: 0x%02x",
2087                (unsigned char)*f);
2088            return NULL;
2089        }
2090    }
2091    /* step 2: allocate memory for the results of
2092     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2093    if (callcount) {
2094        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2095        if (!callresults) {
2096            PyErr_NoMemory();
2097            return NULL;
2098        }
2099        callresult = callresults;
2100    }
2101    /* step 2.5: allocate memory for the results of formating numbers */
2102    if (numbersize) {
2103        numberresults = PyObject_Malloc(numbersize);
2104        if (!numberresults) {
2105            PyErr_NoMemory();
2106            goto fail;
2107        }
2108        numberresult = numberresults;
2109    }
2110
2111    /* step 3: format numbers and figure out how large a buffer we need */
2112    for (f = format; *f; f++) {
2113        if (*f == '%') {
2114            const char* p;
2115            int longflag;
2116            int longlongflag;
2117            int size_tflag;
2118            int numprinted;
2119
2120            p = f;
2121            zeropad = (f[1] == '0');
2122            f = parse_format_flags(f, &width, &precision,
2123                                   &longflag, &longlongflag, &size_tflag);
2124            switch (*f) {
2125            case 'c':
2126            {
2127                Py_UCS4 ordinal = va_arg(count, int);
2128                maxchar = Py_MAX(maxchar, ordinal);
2129                n++;
2130                break;
2131            }
2132            case '%':
2133                n++;
2134                break;
2135            case 'i':
2136            case 'd':
2137                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2138                        width, precision, *f);
2139                if (longflag)
2140                    numprinted = sprintf(numberresult, fmt,
2141                                         va_arg(count, long));
2142#ifdef HAVE_LONG_LONG
2143                else if (longlongflag)
2144                    numprinted = sprintf(numberresult, fmt,
2145                                         va_arg(count, PY_LONG_LONG));
2146#endif
2147                else if (size_tflag)
2148                    numprinted = sprintf(numberresult, fmt,
2149                                         va_arg(count, Py_ssize_t));
2150                else
2151                    numprinted = sprintf(numberresult, fmt,
2152                                         va_arg(count, int));
2153                n += numprinted;
2154                /* advance by +1 to skip over the '\0' */
2155                numberresult += (numprinted + 1);
2156                assert(*(numberresult - 1) == '\0');
2157                assert(*(numberresult - 2) != '\0');
2158                assert(numprinted >= 0);
2159                assert(numberresult <= numberresults + numbersize);
2160                break;
2161            case 'u':
2162                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2163                        width, precision, 'u');
2164                if (longflag)
2165                    numprinted = sprintf(numberresult, fmt,
2166                                         va_arg(count, unsigned long));
2167#ifdef HAVE_LONG_LONG
2168                else if (longlongflag)
2169                    numprinted = sprintf(numberresult, fmt,
2170                                         va_arg(count, unsigned PY_LONG_LONG));
2171#endif
2172                else if (size_tflag)
2173                    numprinted = sprintf(numberresult, fmt,
2174                                         va_arg(count, size_t));
2175                else
2176                    numprinted = sprintf(numberresult, fmt,
2177                                         va_arg(count, unsigned int));
2178                n += numprinted;
2179                numberresult += (numprinted + 1);
2180                assert(*(numberresult - 1) == '\0');
2181                assert(*(numberresult - 2) != '\0');
2182                assert(numprinted >= 0);
2183                assert(numberresult <= numberresults + numbersize);
2184                break;
2185            case 'x':
2186                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2187                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2188                n += numprinted;
2189                numberresult += (numprinted + 1);
2190                assert(*(numberresult - 1) == '\0');
2191                assert(*(numberresult - 2) != '\0');
2192                assert(numprinted >= 0);
2193                assert(numberresult <= numberresults + numbersize);
2194                break;
2195            case 'p':
2196                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2197                /* %p is ill-defined:  ensure leading 0x. */
2198                if (numberresult[1] == 'X')
2199                    numberresult[1] = 'x';
2200                else if (numberresult[1] != 'x') {
2201                    memmove(numberresult + 2, numberresult,
2202                            strlen(numberresult) + 1);
2203                    numberresult[0] = '0';
2204                    numberresult[1] = 'x';
2205                    numprinted += 2;
2206                }
2207                n += numprinted;
2208                numberresult += (numprinted + 1);
2209                assert(*(numberresult - 1) == '\0');
2210                assert(*(numberresult - 2) != '\0');
2211                assert(numprinted >= 0);
2212                assert(numberresult <= numberresults + numbersize);
2213                break;
2214            case 's':
2215            {
2216                /* UTF-8 */
2217                const char *s = va_arg(count, const char*);
2218                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2219                if (!str)
2220                    goto fail;
2221                /* since PyUnicode_DecodeUTF8 returns already flexible
2222                   unicode objects, there is no need to call ready on them */
2223                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2224                maxchar = Py_MAX(maxchar, argmaxchar);
2225                n += PyUnicode_GET_LENGTH(str);
2226                /* Remember the str and switch to the next slot */
2227                *callresult++ = str;
2228                break;
2229            }
2230            case 'U':
2231            {
2232                PyObject *obj = va_arg(count, PyObject *);
2233                assert(obj && _PyUnicode_CHECK(obj));
2234                if (PyUnicode_READY(obj) == -1)
2235                    goto fail;
2236                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2237                maxchar = Py_MAX(maxchar, argmaxchar);
2238                n += PyUnicode_GET_LENGTH(obj);
2239                break;
2240            }
2241            case 'V':
2242            {
2243                PyObject *obj = va_arg(count, PyObject *);
2244                const char *str = va_arg(count, const char *);
2245                PyObject *str_obj;
2246                assert(obj || str);
2247                assert(!obj || _PyUnicode_CHECK(obj));
2248                if (obj) {
2249                    if (PyUnicode_READY(obj) == -1)
2250                        goto fail;
2251                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2252                    maxchar = Py_MAX(maxchar, argmaxchar);
2253                    n += PyUnicode_GET_LENGTH(obj);
2254                    *callresult++ = NULL;
2255                }
2256                else {
2257                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2258                    if (!str_obj)
2259                        goto fail;
2260                    if (PyUnicode_READY(str_obj)) {
2261                        Py_DECREF(str_obj);
2262                        goto fail;
2263                    }
2264                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2265                    maxchar = Py_MAX(maxchar, argmaxchar);
2266                    n += PyUnicode_GET_LENGTH(str_obj);
2267                    *callresult++ = str_obj;
2268                }
2269                break;
2270            }
2271            case 'S':
2272            {
2273                PyObject *obj = va_arg(count, PyObject *);
2274                PyObject *str;
2275                assert(obj);
2276                str = PyObject_Str(obj);
2277                if (!str || PyUnicode_READY(str) == -1)
2278                    goto fail;
2279                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2280                maxchar = Py_MAX(maxchar, argmaxchar);
2281                n += PyUnicode_GET_LENGTH(str);
2282                /* Remember the str and switch to the next slot */
2283                *callresult++ = str;
2284                break;
2285            }
2286            case 'R':
2287            {
2288                PyObject *obj = va_arg(count, PyObject *);
2289                PyObject *repr;
2290                assert(obj);
2291                repr = PyObject_Repr(obj);
2292                if (!repr || PyUnicode_READY(repr) == -1)
2293                    goto fail;
2294                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2295                maxchar = Py_MAX(maxchar, argmaxchar);
2296                n += PyUnicode_GET_LENGTH(repr);
2297                /* Remember the repr and switch to the next slot */
2298                *callresult++ = repr;
2299                break;
2300            }
2301            case 'A':
2302            {
2303                PyObject *obj = va_arg(count, PyObject *);
2304                PyObject *ascii;
2305                assert(obj);
2306                ascii = PyObject_ASCII(obj);
2307                if (!ascii || PyUnicode_READY(ascii) == -1)
2308                    goto fail;
2309                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2310                maxchar = Py_MAX(maxchar, argmaxchar);
2311                n += PyUnicode_GET_LENGTH(ascii);
2312                /* Remember the repr and switch to the next slot */
2313                *callresult++ = ascii;
2314                break;
2315            }
2316            default:
2317                /* if we stumble upon an unknown
2318                   formatting code, copy the rest of
2319                   the format string to the output
2320                   string. (we cannot just skip the
2321                   code, since there's no way to know
2322                   what's in the argument list) */
2323                n += strlen(p);
2324                goto expand;
2325            }
2326        } else
2327            n++;
2328    }
2329  expand:
2330    /* step 4: fill the buffer */
2331    /* Since we've analyzed how much space we need,
2332       we don't have to resize the string.
2333       There can be no errors beyond this point. */
2334    string = PyUnicode_New(n, maxchar);
2335    if (!string)
2336        goto fail;
2337    kind = PyUnicode_KIND(string);
2338    data = PyUnicode_DATA(string);
2339    callresult = callresults;
2340    numberresult = numberresults;
2341
2342    for (i = 0, f = format; *f; f++) {
2343        if (*f == '%') {
2344            const char* p;
2345
2346            p = f;
2347            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2348            /* checking for == because the last argument could be a empty
2349               string, which causes i to point to end, the assert at the end of
2350               the loop */
2351            assert(i <= PyUnicode_GET_LENGTH(string));
2352
2353            switch (*f) {
2354            case 'c':
2355            {
2356                const int ordinal = va_arg(vargs, int);
2357                PyUnicode_WRITE(kind, data, i++, ordinal);
2358                break;
2359            }
2360            case 'i':
2361            case 'd':
2362            case 'u':
2363            case 'x':
2364            case 'p':
2365                /* unused, since we already have the result */
2366                if (*f == 'p')
2367                    (void) va_arg(vargs, void *);
2368                else
2369                    (void) va_arg(vargs, int);
2370                /* extract the result from numberresults and append. */
2371                for (; *numberresult; ++i, ++numberresult)
2372                    PyUnicode_WRITE(kind, data, i, *numberresult);
2373                /* skip over the separating '\0' */
2374                assert(*numberresult == '\0');
2375                numberresult++;
2376                assert(numberresult <= numberresults + numbersize);
2377                break;
2378            case 's':
2379            {
2380                /* unused, since we already have the result */
2381                Py_ssize_t size;
2382                (void) va_arg(vargs, char *);
2383                size = PyUnicode_GET_LENGTH(*callresult);
2384                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2385                copy_characters(string, i, *callresult, 0, size);
2386                i += size;
2387                /* We're done with the unicode()/repr() => forget it */
2388                Py_DECREF(*callresult);
2389                /* switch to next unicode()/repr() result */
2390                ++callresult;
2391                break;
2392            }
2393            case 'U':
2394            {
2395                PyObject *obj = va_arg(vargs, PyObject *);
2396                Py_ssize_t size;
2397                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2398                size = PyUnicode_GET_LENGTH(obj);
2399                copy_characters(string, i, obj, 0, size);
2400                i += size;
2401                break;
2402            }
2403            case 'V':
2404            {
2405                Py_ssize_t size;
2406                PyObject *obj = va_arg(vargs, PyObject *);
2407                va_arg(vargs, const char *);
2408                if (obj) {
2409                    size = PyUnicode_GET_LENGTH(obj);
2410                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2411                    copy_characters(string, i, obj, 0, size);
2412                    i += size;
2413                } else {
2414                    size = PyUnicode_GET_LENGTH(*callresult);
2415                    assert(PyUnicode_KIND(*callresult) <=
2416                           PyUnicode_KIND(string));
2417                    copy_characters(string, i, *callresult, 0, size);
2418                    i += size;
2419                    Py_DECREF(*callresult);
2420                }
2421                ++callresult;
2422                break;
2423            }
2424            case 'S':
2425            case 'R':
2426            case 'A':
2427            {
2428                Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2429                /* unused, since we already have the result */
2430                (void) va_arg(vargs, PyObject *);
2431                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2432                copy_characters(string, i, *callresult, 0,  size);
2433                i += size;
2434                /* We're done with the unicode()/repr() => forget it */
2435                Py_DECREF(*callresult);
2436                /* switch to next unicode()/repr() result */
2437                ++callresult;
2438                break;
2439            }
2440            case '%':
2441                PyUnicode_WRITE(kind, data, i++, '%');
2442                break;
2443            default:
2444                for (; *p; ++p, ++i)
2445                    PyUnicode_WRITE(kind, data, i, *p);
2446                assert(i == PyUnicode_GET_LENGTH(string));
2447                goto end;
2448            }
2449        }
2450        else {
2451            assert(i < PyUnicode_GET_LENGTH(string));
2452            PyUnicode_WRITE(kind, data, i++, *f);
2453        }
2454    }
2455    assert(i == PyUnicode_GET_LENGTH(string));
2456
2457  end:
2458    if (callresults)
2459        PyObject_Free(callresults);
2460    if (numberresults)
2461        PyObject_Free(numberresults);
2462    assert(_PyUnicode_CheckConsistency(string, 1));
2463    return (PyObject *)string;
2464  fail:
2465    if (callresults) {
2466        PyObject **callresult2 = callresults;
2467        while (callresult2 < callresult) {
2468            Py_XDECREF(*callresult2);
2469            ++callresult2;
2470        }
2471        PyObject_Free(callresults);
2472    }
2473    if (numberresults)
2474        PyObject_Free(numberresults);
2475    return NULL;
2476}
2477
2478PyObject *
2479PyUnicode_FromFormat(const char *format, ...)
2480{
2481    PyObject* ret;
2482    va_list vargs;
2483
2484#ifdef HAVE_STDARG_PROTOTYPES
2485    va_start(vargs, format);
2486#else
2487    va_start(vargs);
2488#endif
2489    ret = PyUnicode_FromFormatV(format, vargs);
2490    va_end(vargs);
2491    return ret;
2492}
2493
2494#ifdef HAVE_WCHAR_H
2495
2496/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2497   convert a Unicode object to a wide character string.
2498
2499   - If w is NULL: return the number of wide characters (including the null
2500     character) required to convert the unicode object. Ignore size argument.
2501
2502   - Otherwise: return the number of wide characters (excluding the null
2503     character) written into w. Write at most size wide characters (including
2504     the null character). */
2505static Py_ssize_t
2506unicode_aswidechar(PyUnicodeObject *unicode,
2507                   wchar_t *w,
2508                   Py_ssize_t size)
2509{
2510    Py_ssize_t res;
2511    const wchar_t *wstr;
2512
2513    wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2514    if (wstr == NULL)
2515        return -1;
2516
2517    if (w != NULL) {
2518        if (size > res)
2519            size = res + 1;
2520        else
2521            res = size;
2522        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2523        return res;
2524    }
2525    else
2526        return res + 1;
2527}
2528
2529Py_ssize_t
2530PyUnicode_AsWideChar(PyObject *unicode,
2531                     wchar_t *w,
2532                     Py_ssize_t size)
2533{
2534    if (unicode == NULL) {
2535        PyErr_BadInternalCall();
2536        return -1;
2537    }
2538    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
2539}
2540
2541wchar_t*
2542PyUnicode_AsWideCharString(PyObject *unicode,
2543                           Py_ssize_t *size)
2544{
2545    wchar_t* buffer;
2546    Py_ssize_t buflen;
2547
2548    if (unicode == NULL) {
2549        PyErr_BadInternalCall();
2550        return NULL;
2551    }
2552
2553    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
2554    if (buflen == -1)
2555        return NULL;
2556    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2557        PyErr_NoMemory();
2558        return NULL;
2559    }
2560
2561    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2562    if (buffer == NULL) {
2563        PyErr_NoMemory();
2564        return NULL;
2565    }
2566    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
2567    if (buflen == -1)
2568        return NULL;
2569    if (size != NULL)
2570        *size = buflen;
2571    return buffer;
2572}
2573
2574#endif /* HAVE_WCHAR_H */
2575
2576PyObject *
2577PyUnicode_FromOrdinal(int ordinal)
2578{
2579    PyObject *v;
2580    if (ordinal < 0 || ordinal > 0x10ffff) {
2581        PyErr_SetString(PyExc_ValueError,
2582                        "chr() arg not in range(0x110000)");
2583        return NULL;
2584    }
2585
2586    if (ordinal < 256)
2587        return get_latin1_char(ordinal);
2588
2589    v = PyUnicode_New(1, ordinal);
2590    if (v == NULL)
2591        return NULL;
2592    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2593    assert(_PyUnicode_CheckConsistency(v, 1));
2594    return v;
2595}
2596
2597PyObject *
2598PyUnicode_FromObject(register PyObject *obj)
2599{
2600    /* XXX Perhaps we should make this API an alias of
2601       PyObject_Str() instead ?! */
2602    if (PyUnicode_CheckExact(obj)) {
2603        if (PyUnicode_READY(obj))
2604            return NULL;
2605        Py_INCREF(obj);
2606        return obj;
2607    }
2608    if (PyUnicode_Check(obj)) {
2609        /* For a Unicode subtype that's not a Unicode object,
2610           return a true Unicode object with the same data. */
2611        return PyUnicode_Copy(obj);
2612    }
2613    PyErr_Format(PyExc_TypeError,
2614                 "Can't convert '%.100s' object to str implicitly",
2615                 Py_TYPE(obj)->tp_name);
2616    return NULL;
2617}
2618
2619PyObject *
2620PyUnicode_FromEncodedObject(register PyObject *obj,
2621                            const char *encoding,
2622                            const char *errors)
2623{
2624    Py_buffer buffer;
2625    PyObject *v;
2626
2627    if (obj == NULL) {
2628        PyErr_BadInternalCall();
2629        return NULL;
2630    }
2631
2632    /* Decoding bytes objects is the most common case and should be fast */
2633    if (PyBytes_Check(obj)) {
2634        if (PyBytes_GET_SIZE(obj) == 0) {
2635            Py_INCREF(unicode_empty);
2636            v = unicode_empty;
2637        }
2638        else {
2639            v = PyUnicode_Decode(
2640                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2641                    encoding, errors);
2642        }
2643        return v;
2644    }
2645
2646    if (PyUnicode_Check(obj)) {
2647        PyErr_SetString(PyExc_TypeError,
2648                        "decoding str is not supported");
2649        return NULL;
2650    }
2651
2652    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2653    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2654        PyErr_Format(PyExc_TypeError,
2655                     "coercing to str: need bytes, bytearray "
2656                     "or buffer-like object, %.80s found",
2657                     Py_TYPE(obj)->tp_name);
2658        return NULL;
2659    }
2660
2661    if (buffer.len == 0) {
2662        Py_INCREF(unicode_empty);
2663        v = unicode_empty;
2664    }
2665    else
2666        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2667
2668    PyBuffer_Release(&buffer);
2669    return v;
2670}
2671
/* Write a normalized copy of `encoding` into `lower`, a buffer of
   `lower_len` bytes: upper case letters are converted to lower case and
   '_' is replaced with '-', in order to catch e.g. UTF_8.
   Return 0 on error (encoding is longer than lower_len-1; `lower` is then
   left without a terminating null byte), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last byte of the buffer, reserved for the terminating null byte */
    char *const last = &lower[lower_len - 1];

    for (; *src; src++) {
        char ch = *src;

        if (dst == last)
            return 0;
        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2704
2705PyObject *
2706PyUnicode_Decode(const char *s,
2707                 Py_ssize_t size,
2708                 const char *encoding,
2709                 const char *errors)
2710{
2711    PyObject *buffer = NULL, *unicode;
2712    Py_buffer info;
2713    char lower[11];  /* Enough for any encoding shortcut */
2714
2715    if (encoding == NULL)
2716        return PyUnicode_DecodeUTF8(s, size, errors);
2717
2718    /* Shortcuts for common default encodings */
2719    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2720        if ((strcmp(lower, "utf-8") == 0) ||
2721            (strcmp(lower, "utf8") == 0))
2722            return PyUnicode_DecodeUTF8(s, size, errors);
2723        else if ((strcmp(lower, "latin-1") == 0) ||
2724                 (strcmp(lower, "latin1") == 0) ||
2725                 (strcmp(lower, "iso-8859-1") == 0))
2726            return PyUnicode_DecodeLatin1(s, size, errors);
2727#ifdef HAVE_MBCS
2728        else if (strcmp(lower, "mbcs") == 0)
2729            return PyUnicode_DecodeMBCS(s, size, errors);
2730#endif
2731        else if (strcmp(lower, "ascii") == 0)
2732            return PyUnicode_DecodeASCII(s, size, errors);
2733        else if (strcmp(lower, "utf-16") == 0)
2734            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2735        else if (strcmp(lower, "utf-32") == 0)
2736            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2737    }
2738
2739    /* Decode via the codec registry */
2740    buffer = NULL;
2741    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2742        goto onError;
2743    buffer = PyMemoryView_FromBuffer(&info);
2744    if (buffer == NULL)
2745        goto onError;
2746    unicode = PyCodec_Decode(buffer, encoding, errors);
2747    if (unicode == NULL)
2748        goto onError;
2749    if (!PyUnicode_Check(unicode)) {
2750        PyErr_Format(PyExc_TypeError,
2751                     "decoder did not return a str object (type=%.400s)",
2752                     Py_TYPE(unicode)->tp_name);
2753        Py_DECREF(unicode);
2754        goto onError;
2755    }
2756    Py_DECREF(buffer);
2757#ifndef DONT_MAKE_RESULT_READY
2758    if (_PyUnicode_READY_REPLACE(&unicode)) {
2759        Py_DECREF(unicode);
2760        return NULL;
2761    }
2762#endif
2763    assert(_PyUnicode_CheckConsistency(unicode, 1));
2764    return unicode;
2765
2766  onError:
2767    Py_XDECREF(buffer);
2768    return NULL;
2769}
2770
2771PyObject *
2772PyUnicode_AsDecodedObject(PyObject *unicode,
2773                          const char *encoding,
2774                          const char *errors)
2775{
2776    PyObject *v;
2777
2778    if (!PyUnicode_Check(unicode)) {
2779        PyErr_BadArgument();
2780        goto onError;
2781    }
2782
2783    if (encoding == NULL)
2784        encoding = PyUnicode_GetDefaultEncoding();
2785
2786    /* Decode via the codec registry */
2787    v = PyCodec_Decode(unicode, encoding, errors);
2788    if (v == NULL)
2789        goto onError;
2790    assert(_PyUnicode_CheckConsistency(v, 1));
2791    return v;
2792
2793  onError:
2794    return NULL;
2795}
2796
2797PyObject *
2798PyUnicode_AsDecodedUnicode(PyObject *unicode,
2799                           const char *encoding,
2800                           const char *errors)
2801{
2802    PyObject *v;
2803
2804    if (!PyUnicode_Check(unicode)) {
2805        PyErr_BadArgument();
2806        goto onError;
2807    }
2808
2809    if (encoding == NULL)
2810        encoding = PyUnicode_GetDefaultEncoding();
2811
2812    /* Decode via the codec registry */
2813    v = PyCodec_Decode(unicode, encoding, errors);
2814    if (v == NULL)
2815        goto onError;
2816    if (!PyUnicode_Check(v)) {
2817        PyErr_Format(PyExc_TypeError,
2818                     "decoder did not return a str object (type=%.400s)",
2819                     Py_TYPE(v)->tp_name);
2820        Py_DECREF(v);
2821        goto onError;
2822    }
2823    assert(_PyUnicode_CheckConsistency(v, 1));
2824    return v;
2825
2826  onError:
2827    return NULL;
2828}
2829
2830PyObject *
2831PyUnicode_Encode(const Py_UNICODE *s,
2832                 Py_ssize_t size,
2833                 const char *encoding,
2834                 const char *errors)
2835{
2836    PyObject *v, *unicode;
2837
2838    unicode = PyUnicode_FromUnicode(s, size);
2839    if (unicode == NULL)
2840        return NULL;
2841    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2842    Py_DECREF(unicode);
2843    return v;
2844}
2845
2846PyObject *
2847PyUnicode_AsEncodedObject(PyObject *unicode,
2848                          const char *encoding,
2849                          const char *errors)
2850{
2851    PyObject *v;
2852
2853    if (!PyUnicode_Check(unicode)) {
2854        PyErr_BadArgument();
2855        goto onError;
2856    }
2857
2858    if (encoding == NULL)
2859        encoding = PyUnicode_GetDefaultEncoding();
2860
2861    /* Encode via the codec registry */
2862    v = PyCodec_Encode(unicode, encoding, errors);
2863    if (v == NULL)
2864        goto onError;
2865    return v;
2866
2867  onError:
2868    return NULL;
2869}
2870
/* Encode the str object `unicode` to a bytes object using the filesystem
   encoding.

   Which encoder is used is decided at compile time and, on the generic
   path, at run time:
   - HAVE_MBCS: the MBCS encoder, with a NULL error handler name;
   - __APPLE__: UTF-8 with the "surrogateescape" error handler;
   - otherwise: the Py_FileSystemDefaultEncoding codec with
     "surrogateescape" once the interpreter's filesystem codec is
     initialized, else the C locale conversion (_Py_wchar2char).

   Return the encoded object, or NULL with an exception set. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
#ifdef HAVE_MBCS
    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
#elif defined(__APPLE__)
    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
#else
    PyInterpreterState *interp = PyThreadState_GET()->interp;
    /* Bootstrap check: if the filesystem codec is implemented in Python, we
       cannot use it to encode and decode filenames before it is loaded.
       Loading the Python codec requires encoding at least its own filename.
       Use the C version of the locale codec until the codec registry is
       initialized and the Python codec is loaded.

       Py_FileSystemDefaultEncoding is shared between all interpreters, so we
       cannot rely on it alone: also check interp->fscodec_initialized for
       subinterpreters. */
    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
        return PyUnicode_AsEncodedString(unicode,
                                         Py_FileSystemDefaultEncoding,
                                         "surrogateescape");
    }
    else {
        /* locale encoding with surrogateescape */
        wchar_t *wchar;
        char *bytes;
        PyObject *bytes_obj;
        size_t error_pos;

        wchar = PyUnicode_AsWideCharString(unicode, NULL);
        if (wchar == NULL)
            return NULL;
        bytes = _Py_wchar2char(wchar, &error_pos);
        if (bytes == NULL) {
            /* An error_pos other than (size_t)-1 is reported as an encode
               error at that character index; (size_t)-1 is reported as an
               out-of-memory condition. */
            if (error_pos != (size_t)-1) {
                /* Read errno right away, before any other call gets a
                   chance to overwrite it. */
                char *errmsg = strerror(errno);
                PyObject *exc = NULL;
                if (errmsg == NULL)
                    errmsg = "Py_wchar2char() failed";
                raise_encode_exception(&exc,
                    "filesystemencoding",
                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
                    error_pos, error_pos+1,
                    errmsg);
                Py_XDECREF(exc);
            }
            else
                PyErr_NoMemory();
            PyMem_Free(wchar);
            return NULL;
        }
        /* The wide-character copy is only needed for the conversion. */
        PyMem_Free(wchar);

        /* Copy the NUL-terminated result into a bytes object, then release
           the temporary buffer. */
        bytes_obj = PyBytes_FromString(bytes);
        PyMem_Free(bytes);
        return bytes_obj;
    }
#endif
}
2933
2934PyObject *
2935PyUnicode_AsEncodedString(PyObject *unicode,
2936                          const char *encoding,
2937                          const char *errors)
2938{
2939    PyObject *v;
2940    char lower[11];  /* Enough for any encoding shortcut */
2941
2942    if (!PyUnicode_Check(unicode)) {
2943        PyErr_BadArgument();
2944        return NULL;
2945    }
2946
2947    if (encoding == NULL) {
2948        if (errors == NULL || strcmp(errors, "strict") == 0)
2949            return _PyUnicode_AsUTF8String(unicode, NULL);
2950        else
2951            return _PyUnicode_AsUTF8String(unicode, errors);
2952    }
2953
2954    /* Shortcuts for common default encodings */
2955    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2956        if ((strcmp(lower, "utf-8") == 0) ||
2957            (strcmp(lower, "utf8") == 0))
2958        {
2959            if (errors == NULL || strcmp(errors, "strict") == 0)
2960                return _PyUnicode_AsUTF8String(unicode, NULL);
2961            else
2962                return _PyUnicode_AsUTF8String(unicode, errors);
2963        }
2964        else if ((strcmp(lower, "latin-1") == 0) ||
2965                 (strcmp(lower, "latin1") == 0) ||
2966                 (strcmp(lower, "iso-8859-1") == 0))
2967            return _PyUnicode_AsLatin1String(unicode, errors);
2968#ifdef HAVE_MBCS
2969        else if (strcmp(lower, "mbcs") == 0)
2970            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2971                                        PyUnicode_GET_SIZE(unicode),
2972                                        errors);
2973#endif
2974        else if (strcmp(lower, "ascii") == 0)
2975            return _PyUnicode_AsASCIIString(unicode, errors);
2976    }
2977
2978    /* Encode via the codec registry */
2979    v = PyCodec_Encode(unicode, encoding, errors);
2980    if (v == NULL)
2981        return NULL;
2982
2983    /* The normal path */
2984    if (PyBytes_Check(v))
2985        return v;
2986
2987    /* If the codec returns a buffer, raise a warning and convert to bytes */
2988    if (PyByteArray_Check(v)) {
2989        int error;
2990        PyObject *b;
2991
2992        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2993            "encoder %s returned bytearray instead of bytes",
2994            encoding);
2995        if (error) {
2996            Py_DECREF(v);
2997            return NULL;
2998        }
2999
3000        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3001        Py_DECREF(v);
3002        return b;
3003    }
3004
3005    PyErr_Format(PyExc_TypeError,
3006                 "encoder did not return a bytes object (type=%.400s)",
3007                 Py_TYPE(v)->tp_name);
3008    Py_DECREF(v);
3009    return NULL;
3010}
3011
3012PyObject *
3013PyUnicode_AsEncodedUnicode(PyObject *unicode,
3014                           const char *encoding,
3015                           const char *errors)
3016{
3017    PyObject *v;
3018
3019    if (!PyUnicode_Check(unicode)) {
3020        PyErr_BadArgument();
3021        goto onError;
3022    }
3023
3024    if (encoding == NULL)
3025        encoding = PyUnicode_GetDefaultEncoding();
3026
3027    /* Encode via the codec registry */
3028    v = PyCodec_Encode(unicode, encoding, errors);
3029    if (v == NULL)
3030        goto onError;
3031    if (!PyUnicode_Check(v)) {
3032        PyErr_Format(PyExc_TypeError,
3033                     "encoder did not return an str object (type=%.400s)",
3034                     Py_TYPE(v)->tp_name);
3035        Py_DECREF(v);
3036        goto onError;
3037    }
3038    return v;
3039
3040  onError:
3041    return NULL;
3042}
3043
3044PyObject*
3045PyUnicode_DecodeFSDefault(const char *s) {
3046    Py_ssize_t size = (Py_ssize_t)strlen(s);
3047    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3048}
3049
3050PyObject*
3051PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3052{
3053#ifdef HAVE_MBCS
3054    return PyUnicode_DecodeMBCS(s, size, NULL);
3055#elif defined(__APPLE__)
3056    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3057#else
3058    PyInterpreterState *interp = PyThreadState_GET()->interp;
3059    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3060       cannot use it to encode and decode filenames before it is loaded. Load
3061       the Python codec requires to encode at least its own filename. Use the C
3062       version of the locale codec until the codec registry is initialized and
3063       the Python codec is loaded.
3064
3065       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3066       cannot only rely on it: check also interp->fscodec_initialized for
3067       subinterpreters. */
3068    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3069        return PyUnicode_Decode(s, size,
3070                                Py_FileSystemDefaultEncoding,
3071                                "surrogateescape");
3072    }
3073    else {
3074        /* locale encoding with surrogateescape */
3075        wchar_t *wchar;
3076        PyObject *unicode;
3077        size_t len;
3078
3079        if (s[size] != '\0' || size != strlen(s)) {
3080            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3081            return NULL;
3082        }
3083
3084        wchar = _Py_char2wchar(s, &len);
3085        if (wchar == NULL)
3086            return PyErr_NoMemory();
3087
3088        unicode = PyUnicode_FromWideChar(wchar, len);
3089        PyMem_Free(wchar);
3090        return unicode;
3091    }
3092#endif
3093}
3094
3095
3096int
3097PyUnicode_FSConverter(PyObject* arg, void* addr)
3098{
3099    PyObject *output = NULL;
3100    Py_ssize_t size;
3101    void *data;
3102    if (arg == NULL) {
3103        Py_DECREF(*(PyObject**)addr);
3104        return 1;
3105    }
3106    if (PyBytes_Check(arg)) {
3107        output = arg;
3108        Py_INCREF(output);
3109    }
3110    else {
3111        arg = PyUnicode_FromObject(arg);
3112        if (!arg)
3113            return 0;
3114        output = PyUnicode_EncodeFSDefault(arg);
3115        Py_DECREF(arg);
3116        if (!output)
3117            return 0;
3118        if (!PyBytes_Check(output)) {
3119            Py_DECREF(output);
3120            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3121            return 0;
3122        }
3123    }
3124    size = PyBytes_GET_SIZE(output);
3125    data = PyBytes_AS_STRING(output);
3126    if (size != strlen(data)) {
3127        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3128        Py_DECREF(output);
3129        return 0;
3130    }
3131    *(PyObject**)addr = output;
3132    return Py_CLEANUP_SUPPORTED;
3133}
3134
3135
3136int
3137PyUnicode_FSDecoder(PyObject* arg, void* addr)
3138{
3139    PyObject *output = NULL;
3140    if (arg == NULL) {
3141        Py_DECREF(*(PyObject**)addr);
3142        return 1;
3143    }
3144    if (PyUnicode_Check(arg)) {
3145        if (PyUnicode_READY(arg))
3146            return 0;
3147        output = arg;
3148        Py_INCREF(output);
3149    }
3150    else {
3151        arg = PyBytes_FromObject(arg);
3152        if (!arg)
3153            return 0;
3154        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3155                                                  PyBytes_GET_SIZE(arg));
3156        Py_DECREF(arg);
3157        if (!output)
3158            return 0;
3159        if (!PyUnicode_Check(output)) {
3160            Py_DECREF(output);
3161            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3162            return 0;
3163        }
3164    }
3165    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3166                 PyUnicode_GET_LENGTH(output), 0, 1)) {
3167        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3168        Py_DECREF(output);
3169        return 0;
3170    }
3171    *(PyObject**)addr = output;
3172    return Py_CLEANUP_SUPPORTED;
3173}
3174
3175
3176char*
3177PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3178{
3179    PyObject *bytes;
3180    PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3181
3182    if (!PyUnicode_Check(unicode)) {
3183        PyErr_BadArgument();
3184        return NULL;
3185    }
3186    if (PyUnicode_READY(u) == -1)
3187        return NULL;
3188
3189    if (PyUnicode_UTF8(unicode) == NULL) {
3190        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3191        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3192        if (bytes == NULL)
3193            return NULL;
3194        _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3195        if (_PyUnicode_UTF8(u) == NULL) {
3196            Py_DECREF(bytes);
3197            return NULL;
3198        }
3199        _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3200        Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
3201        Py_DECREF(bytes);
3202    }
3203
3204    if (psize)
3205        *psize = PyUnicode_UTF8_LENGTH(unicode);
3206    return PyUnicode_UTF8(unicode);
3207}
3208
3209char*
3210PyUnicode_AsUTF8(PyObject *unicode)
3211{
3212    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3213}
3214
#ifdef Py_DEBUG
/* Debug builds only: incremented by PyUnicode_AsUnicodeAndSize() each time
   it finds no wstr buffer on the object and has to build one. */
int unicode_as_unicode_calls = 0;
#endif
3218
3219
/* Return the wstr (wchar_t) buffer of the str object `unicode`, building
   it and storing it on the object if it does not exist yet.  When `size`
   is not NULL, the wstr length is stored in *size.

   How the buffer is built depends on the string kind and on
   SIZEOF_WCHAR_T:
   - 4-byte kind with a 2-byte wchar_t: characters above 0xFFFF are
     written as surrogate pairs;
   - 1-byte kind, and 2-byte kind with a 4-byte wchar_t: each character is
     widened to a wchar_t;
   - the remaining kind/size combinations are treated as impossible states
     (Py_FatalError), since wstr is expected to share memory with the
     string data in those cases.

   Return NULL with an exception set if `unicode` is not a str object or
   if memory runs out. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    PyUnicodeObject *u;
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    u = (PyUnicodeObject*)unicode;
    if (_PyUnicode_WSTR(u) == NULL) {
        /* No wstr buffer yet: build one from the kind-specific data.  The
           string is expected to be ready at this point. */
        assert(_PyUnicode_KIND(u) != 0);
        assert(PyUnicode_IS_READY(u));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the characters that need two wchar_t
               units, in order to size the buffer. */
            four_bytes = PyUnicode_4BYTE_DATA(u);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One unit per character, one more per surrogate pair, plus
               the terminating null. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;

            /* Second pass: copy the characters, splitting those above
               0xFFFF. */
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
            four_bytes = PyUnicode_4BYTE_DATA(u);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* Sanity check (debug builds): the write position must
                   stay within the computed length. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            /* null-terminate the wstr */
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte kind: one wchar_t per character, plus the
               terminating null. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(u) + 1));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only written for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(u))
                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_LENGTH(u);

            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(u);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(u);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before reporting the
                   impossible state. */
                PyObject_FREE(_PyUnicode_WSTR(u));
                _PyUnicode_WSTR(u) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(u);
    return _PyUnicode_WSTR(u);
}
3336
3337Py_UNICODE *
3338PyUnicode_AsUnicode(PyObject *unicode)
3339{
3340    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3341}
3342
3343
3344Py_ssize_t
3345PyUnicode_GetSize(PyObject *unicode)
3346{
3347    if (!PyUnicode_Check(unicode)) {
3348        PyErr_BadArgument();
3349        goto onError;
3350    }
3351    return PyUnicode_GET_SIZE(unicode);
3352
3353  onError:
3354    return -1;
3355}
3356
3357Py_ssize_t
3358PyUnicode_GetLength(PyObject *unicode)
3359{
3360    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3361        PyErr_BadArgument();
3362        return -1;
3363    }
3364
3365    return PyUnicode_GET_LENGTH(unicode);
3366}
3367
3368Py_UCS4
3369PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3370{
3371    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3372        PyErr_BadArgument();
3373        return (Py_UCS4)-1;
3374    }
3375    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3376        PyErr_SetString(PyExc_IndexError, "string index out of range");
3377        return (Py_UCS4)-1;
3378    }
3379    return PyUnicode_READ_CHAR(unicode, index);
3380}
3381
3382int
3383PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3384{
3385    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3386        PyErr_BadArgument();
3387        return -1;
3388    }
3389    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3390        PyErr_SetString(PyExc_IndexError, "string index out of range");
3391        return -1;
3392    }
3393    if (_PyUnicode_Dirty(unicode))
3394        return -1;
3395    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3396                    index, ch);
3397    return 0;
3398}
3399
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3405
3406/* create or adjust a UnicodeDecodeError */
3407static void
3408make_decode_exception(PyObject **exceptionObject,
3409                      const char *encoding,
3410                      const char *input, Py_ssize_t length,
3411                      Py_ssize_t startpos, Py_ssize_t endpos,
3412                      const char *reason)
3413{
3414    if (*exceptionObject == NULL) {
3415        *exceptionObject = PyUnicodeDecodeError_Create(
3416            encoding, input, length, startpos, endpos, reason);
3417    }
3418    else {
3419        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3420            goto onError;
3421        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3422            goto onError;
3423        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3424            goto onError;
3425    }
3426    return;
3427
3428onError:
3429    Py_DECREF(*exceptionObject);
3430    *exceptionObject = NULL;
3431}
3432
3433/* error handling callback helper:
3434   build arguments, call the callback and check the arguments,
3435   if no exception occurred, copy the replacement to the output
3436   and adjust various state variables.
3437   return 0 on success, -1 on error
3438*/
3439
3440static int
3441unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3442                                 const char *encoding, const char *reason,
3443                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3444                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3445                                 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
3446{
3447    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3448
3449    PyObject *restuple = NULL;
3450    PyObject *repunicode = NULL;
3451    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
3452    Py_ssize_t insize;
3453    Py_ssize_t requiredsize;
3454    Py_ssize_t newpos;
3455    const Py_UNICODE *repptr;
3456    PyObject *inputobj = NULL;
3457    Py_ssize_t repsize;
3458    int res = -1;
3459
3460    if (*errorHandler == NULL) {
3461        *errorHandler = PyCodec_LookupError(errors);
3462        if (*errorHandler == NULL)
3463            goto onError;
3464    }
3465
3466    make_decode_exception(exceptionObject,
3467        encoding,
3468        *input, *inend - *input,
3469        *startinpos, *endinpos,
3470        reason);
3471    if (*exceptionObject == NULL)
3472        goto onError;
3473
3474    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3475    if (restuple == NULL)
3476        goto onError;
3477    if (!PyTuple_Check(restuple)) {
3478        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3479        goto onError;
3480    }
3481    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3482        goto onError;
3483
3484    /* Copy back the bytes variables, which might have been modified by the
3485       callback */
3486    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3487    if (!inputobj)
3488        goto onError;
3489    if (!PyBytes_Check(inputobj)) {
3490        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3491    }
3492    *input = PyBytes_AS_STRING(inputobj);
3493    insize = PyBytes_GET_SIZE(inputobj);
3494    *inend = *input + insize;
3495    /* we can DECREF safely, as the exception has another reference,
3496       so the object won't go away. */
3497    Py_DECREF(inputobj);
3498
3499    if (newpos<0)
3500        newpos = insize+newpos;
3501    if (newpos<0 || newpos>insize) {
3502        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3503        goto onError;
3504    }
3505
3506    /* need more space? (at least enough for what we
3507       have+the replacement+the rest of the string (starting
3508       at the new input position), so we won't have to check space
3509       when there are no errors in the rest of the string) */
3510    repptr = PyUnicode_AS_UNICODE(repunicode);
3511    repsize = PyUnicode_GET_SIZE(repunicode);
3512    requiredsize = *outpos + repsize + insize-newpos;
3513    if (requiredsize > outsize) {
3514        if (requiredsize<2*outsize)
3515            requiredsize = 2*outsize;
3516        if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
3517            goto onError;
3518        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
3519    }
3520    *endinpos = newpos;
3521    *inptr = *input + newpos;
3522    Py_UNICODE_COPY(*outptr, repptr, repsize);
3523    *outptr += repsize;
3524    *outpos += repsize;
3525
3526    /* we made it! */
3527    res = 0;
3528
3529  onError:
3530    Py_XDECREF(restuple);
3531    return res;
3532}
3533
3534/* --- UTF-7 Codec -------------------------------------------------------- */
3535
3536/* See RFC2152 for details.  We encode conservatively and decode liberally. */
3537
/* Three simple macros defining base-64.  Note that IS_BASE64 and
   FROM_BASE64 evaluate their argument several times, so the argument must
   not have side effects. */

/* Is c a base-64 character?  True for 'A'-'Z', 'a'-'z', '0'-'9', '+'
   and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61 and '+' to 62.
   Any other value yields 63: this is the result for '/', and the macro
   does not detect characters outside the base-64 alphabet. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?  Higher bits
   of n are masked off, so any integer value is accepted. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 are never decoded directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3568
/* Category of each ASCII character for the UTF-7 encoder (see RFC2152).
 * The table is indexed by the character code (0-127) and holds:
 *
 *   0 : "Set D"       alphanumeric and '(),-./:?
 *   1 : "Set O"       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"  ht nl cr sp
 *   3 : special       everything else (i.e. +\~ and non-printing codes
 *                     0-8 11-12 14-31 127); must be base64 encoded
 *
 * Each row below covers eight consecutive codes, starting at the code
 * given in hexadecimal.
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel */  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x08: bs  ht  nl  vt  np  cr  so  si  */  3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb */  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x18: can em  sub esc fs  gs  rs  us  */  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   */  2, 1, 1, 1, 1, 1, 1, 0,
    /* 0x28: (   )   *   +   ,   -   .   /   */  0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38: 8   9   :   ;   <   =   >   ?   */  0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   */  1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48: H   I   J   K   L   M   N   O   */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58: X   Y   Z   [   \   ]   ^   _   */  0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   */  1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68: h   i   j   k   l   m   n   o   */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78: x   y   z   {   |   }   ~   del */  0, 0, 0, 1, 1, 1, 3, 3,
};
3602
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The result is true only for c in the range 1-127 whose utf7_category
 * entry is 0 (Set D), 2 (whitespace) when directWS is true, or 1 (Set O)
 * when directO is true.  NUL and characters of category 3 are never
 * encoded directly.  The argument c is evaluated several times, so it
 * must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
3614
3615PyObject *
3616PyUnicode_DecodeUTF7(const char *s,
3617                     Py_ssize_t size,
3618                     const char *errors)
3619{
3620    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3621}
3622
3623/* The decoder.  The only state we preserve is our read position,
3624 * i.e. how many characters we have consumed.  So if we end in the
3625 * middle of a shift sequence we have to back off the read position
3626 * and the output to the beginning of the sequence, otherwise we lose
3627 * all the shift state (seen bits, number of bits seen, high
3628 * surrogate). */
3629
3630PyObject *
3631PyUnicode_DecodeUTF7Stateful(const char *s,
3632                             Py_ssize_t size,
3633                             const char *errors,
3634                             Py_ssize_t *consumed)
3635{
3636    const char *starts = s;
3637    Py_ssize_t startinpos;
3638    Py_ssize_t endinpos;
3639    Py_ssize_t outpos;
3640    const char *e;
3641    PyUnicodeObject *unicode;
3642    Py_UNICODE *p;
3643    const char *errmsg = "";
3644    int inShift = 0;
3645    Py_UNICODE *shiftOutStart;
3646    unsigned int base64bits = 0;
3647    unsigned long base64buffer = 0;
3648    Py_UNICODE surrogate = 0;
3649    PyObject *errorHandler = NULL;
3650    PyObject *exc = NULL;
3651
3652    unicode = _PyUnicode_New(size);
3653    if (!unicode)
3654        return NULL;
3655    if (size == 0) {
3656        if (consumed)
3657            *consumed = 0;
3658        return (PyObject *)unicode;
3659    }
3660
3661    p = PyUnicode_AS_UNICODE(unicode);
3662    shiftOutStart = p;
3663    e = s + size;
3664
3665    while (s < e) {
3666        Py_UNICODE ch;
3667      restart:
3668        ch = (unsigned char) *s;
3669
3670        if (inShift) { /* in a base-64 section */
3671            if (IS_BASE64(ch)) { /* consume a base-64 character */
3672                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3673                base64bits += 6;
3674                s++;
3675                if (base64bits >= 16) {
3676                    /* we have enough bits for a UTF-16 value */
3677                    Py_UNICODE outCh = (Py_UNICODE)
3678                                       (base64buffer >> (base64bits-16));
3679                    base64bits -= 16;
3680                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3681                    if (surrogate) {
3682                        /* expecting a second surrogate */
3683                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3684#ifdef Py_UNICODE_WIDE
3685                            *p++ = (((surrogate & 0x3FF)<<10)
3686                                    | (outCh & 0x3FF)) + 0x10000;
3687#else
3688                            *p++ = surrogate;
3689                            *p++ = outCh;
3690#endif
3691                            surrogate = 0;
3692                        }
3693                        else {
3694                            surrogate = 0;
3695                            errmsg = "second surrogate missing";
3696                            goto utf7Error;
3697                        }
3698                    }
3699                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3700                        /* first surrogate */
3701                        surrogate = outCh;
3702                    }
3703                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3704                        errmsg = "unexpected second surrogate";
3705                        goto utf7Error;
3706                    }
3707                    else {
3708                        *p++ = outCh;
3709                    }
3710                }
3711            }
3712            else { /* now leaving a base-64 section */
3713                inShift = 0;
3714                s++;
3715                if (surrogate) {
3716                    errmsg = "second surrogate missing at end of shift sequence";
3717                    goto utf7Error;
3718                }
3719                if (base64bits > 0) { /* left-over bits */
3720                    if (base64bits >= 6) {
3721                        /* We've seen at least one base-64 character */
3722                        errmsg = "partial character in shift sequence";
3723                        goto utf7Error;
3724                    }
3725                    else {
3726                        /* Some bits remain; they should be zero */
3727                        if (base64buffer != 0) {
3728                            errmsg = "non-zero padding bits in shift sequence";
3729                            goto utf7Error;
3730                        }
3731                    }
3732                }
3733                if (ch != '-') {
3734                    /* '-' is absorbed; other terminating
3735                       characters are preserved */
3736                    *p++ = ch;
3737                }
3738            }
3739        }
3740        else if ( ch == '+' ) {
3741            startinpos = s-starts;
3742            s++; /* consume '+' */
3743            if (s < e && *s == '-') { /* '+-' encodes '+' */
3744                s++;
3745                *p++ = '+';
3746            }
3747            else { /* begin base64-encoded section */
3748                inShift = 1;
3749                shiftOutStart = p;
3750                base64bits = 0;
3751            }
3752        }
3753        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
3754            *p++ = ch;
3755            s++;
3756        }
3757        else {
3758            startinpos = s-starts;
3759            s++;
3760            errmsg = "unexpected special character";
3761            goto utf7Error;
3762        }
3763        continue;
3764utf7Error:
3765        outpos = p-PyUnicode_AS_UNICODE(unicode);
3766        endinpos = s-starts;
3767        if (unicode_decode_call_errorhandler(
3768                errors, &errorHandler,
3769                "utf7", errmsg,
3770                &starts, &e, &startinpos, &endinpos, &exc, &s,
3771                &unicode, &outpos, &p))
3772            goto onError;
3773    }
3774
3775    /* end of string */
3776
3777    if (inShift && !consumed) { /* in shift sequence, no more to follow */
3778        /* if we're in an inconsistent state, that's an error */
3779        if (surrogate ||
3780                (base64bits >= 6) ||
3781                (base64bits > 0 && base64buffer != 0)) {
3782            outpos = p-PyUnicode_AS_UNICODE(unicode);
3783            endinpos = size;
3784            if (unicode_decode_call_errorhandler(
3785                    errors, &errorHandler,
3786                    "utf7", "unterminated shift sequence",
3787                    &starts, &e, &startinpos, &endinpos, &exc, &s,
3788                    &unicode, &outpos, &p))
3789                goto onError;
3790            if (s < e)
3791                goto restart;
3792        }
3793    }
3794
3795    /* return state */
3796    if (consumed) {
3797        if (inShift) {
3798            p = shiftOutStart; /* back off output */
3799            *consumed = startinpos;
3800        }
3801        else {
3802            *consumed = s-starts;
3803        }
3804    }
3805
3806    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
3807        goto onError;
3808
3809    Py_XDECREF(errorHandler);
3810    Py_XDECREF(exc);
3811#ifndef DONT_MAKE_RESULT_READY
3812    if (_PyUnicode_READY_REPLACE(&unicode)) {
3813        Py_DECREF(unicode);
3814        return NULL;
3815    }
3816#endif
3817    assert(_PyUnicode_CheckConsistency(unicode, 1));
3818    return (PyObject *)unicode;
3819
3820  onError:
3821    Py_XDECREF(errorHandler);
3822    Py_XDECREF(exc);
3823    Py_DECREF(unicode);
3824    return NULL;
3825}
3826
3827
3828PyObject *
3829PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3830                     Py_ssize_t size,
3831                     int base64SetO,
3832                     int base64WhiteSpace,
3833                     const char *errors)
3834{
3835    PyObject *v;
3836    /* It might be possible to tighten this worst case */
3837    Py_ssize_t allocated = 8 * size;
3838    int inShift = 0;
3839    Py_ssize_t i = 0;
3840    unsigned int base64bits = 0;
3841    unsigned long base64buffer = 0;
3842    char * out;
3843    char * start;
3844
3845    if (size == 0)
3846        return PyBytes_FromStringAndSize(NULL, 0);
3847
3848    if (allocated / 8 != size)
3849        return PyErr_NoMemory();
3850
3851    v = PyBytes_FromStringAndSize(NULL, allocated);
3852    if (v == NULL)
3853        return NULL;
3854
3855    start = out = PyBytes_AS_STRING(v);
3856    for (;i < size; ++i) {
3857        Py_UNICODE ch = s[i];
3858
3859        if (inShift) {
3860            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3861                /* shifting out */
3862                if (base64bits) { /* output remaining bits */
3863                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
3864                    base64buffer = 0;
3865                    base64bits = 0;
3866                }
3867                inShift = 0;
3868                /* Characters not in the BASE64 set implicitly unshift the sequence
3869                   so no '-' is required, except if the character is itself a '-' */
3870                if (IS_BASE64(ch) || ch == '-') {
3871                    *out++ = '-';
3872                }
3873                *out++ = (char) ch;
3874            }
3875            else {
3876                goto encode_char;
3877            }
3878        }
3879        else { /* not in a shift sequence */
3880            if (ch == '+') {
3881                *out++ = '+';
3882                        *out++ = '-';
3883            }
3884            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3885                *out++ = (char) ch;
3886            }
3887            else {
3888                *out++ = '+';
3889                inShift = 1;
3890                goto encode_char;
3891            }
3892        }
3893        continue;
3894encode_char:
3895#ifdef Py_UNICODE_WIDE
3896        if (ch >= 0x10000) {
3897            /* code first surrogate */
3898            base64bits += 16;
3899            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3900            while (base64bits >= 6) {
3901                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3902                base64bits -= 6;
3903            }
3904            /* prepare second surrogate */
3905            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
3906        }
3907#endif
3908        base64bits += 16;
3909        base64buffer = (base64buffer << 16) | ch;
3910        while (base64bits >= 6) {
3911            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3912            base64bits -= 6;
3913        }
3914    }
3915    if (base64bits)
3916        *out++= TO_BASE64(base64buffer << (6-base64bits) );
3917    if (inShift)
3918        *out++ = '-';
3919    if (_PyBytes_Resize(&v, out - start) < 0)
3920        return NULL;
3921    return v;
3922}
3923
/* The UTF-7 codec ends here: drop its helper macros so that their
   names are not visible to the code that follows. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
3929
3930/* --- UTF-8 Codec -------------------------------------------------------- */
3931
/* Lookup table indexed by the first byte of a UTF-8 sequence.  The
   decoders in this file use the value as the total number of bytes in
   the sequence (lead byte included); 0 marks a byte that cannot start
   a sequence. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF have the bit pattern 10xxxxxx, which the decoders test for
       when validating continuation bytes; they never start a sequence. */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0 and C1 are rejected (length 0); C2-DF start 2-byte sequences. */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start 4-byte sequences; F5-FF are rejected (length 0). */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3953
3954PyObject *
3955PyUnicode_DecodeUTF8(const char *s,
3956                     Py_ssize_t size,
3957                     const char *errors)
3958{
3959    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3960}
3961
3962/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3963#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3964
3965/* Mask to quickly check whether a C 'long' contains a
3966   non-ASCII, UTF8-encoded char. */
3967#if (SIZEOF_LONG == 8)
3968# define ASCII_CHAR_MASK 0x8080808080808080L
3969#elif (SIZEOF_LONG == 4)
3970# define ASCII_CHAR_MASK 0x80808080L
3971#else
3972# error C 'long' size should be either 4 or 8!
3973#endif
3974
3975/* Scans a UTF-8 string and returns the maximum character to be expected,
3976   the size of the decoded unicode string and if any major errors were
3977   encountered.
3978
3979   This function does check basic UTF-8 sanity, it does however NOT CHECK
3980   if the string contains surrogates, and if all continuation bytes are
3981   within the correct ranges, these checks are performed in
3982   PyUnicode_DecodeUTF8Stateful.
3983
3984   If it sets has_errors to 1, it means the value of unicode_size and max_char
3985   will be bogus and you should not rely on useful information in them.
3986   */
3987static Py_UCS4
3988utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3989                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3990                                  int *has_errors)
3991{
3992    Py_ssize_t n;
3993    Py_ssize_t char_count = 0;
3994    Py_UCS4 max_char = 127, new_max;
3995    Py_UCS4 upper_bound;
3996    const unsigned char *p = (const unsigned char *)s;
3997    const unsigned char *end = p + string_size;
3998    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3999    int err = 0;
4000
4001    for (; p < end && !err; ++p, ++char_count) {
4002        /* Only check value if it's not a ASCII char... */
4003        if (*p < 0x80) {
4004            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4005               an explanation. */
4006            if (!((size_t) p & LONG_PTR_MASK)) {
4007                /* Help register allocation */
4008                register const unsigned char *_p = p;
4009                while (_p < aligned_end) {
4010                    unsigned long value = *(unsigned long *) _p;
4011                    if (value & ASCII_CHAR_MASK)
4012                        break;
4013                    _p += SIZEOF_LONG;
4014                    char_count += SIZEOF_LONG;
4015                }
4016                p = _p;
4017                if (p == end)
4018                    break;
4019            }
4020        }
4021        if (*p >= 0x80) {
4022            n = utf8_code_length[*p];
4023            new_max = max_char;
4024            switch (n) {
4025            /* invalid start byte */
4026            case 0:
4027                err = 1;
4028                break;
4029            case 2:
4030                /* Code points between 0x00FF and 0x07FF inclusive.
4031                   Approximate the upper bound of the code point,
4032                   if this flips over 255 we can be sure it will be more
4033                   than 255 and the string will need 2 bytes per code coint,
4034                   if it stays under or equal to 255, we can be sure 1 byte
4035                   is enough.
4036                   ((*p & 0b00011111) << 6) | 0b00111111 */
4037                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4038                if (max_char < upper_bound)
4039                    new_max = upper_bound;
4040                /* Ensure we track at least that we left ASCII space. */
4041                if (new_max < 128)
4042                    new_max = 128;
4043                break;
4044            case 3:
4045                /* Between 0x0FFF and 0xFFFF inclusive, so values are
4046                   always > 255 and <= 65535 and will always need 2 bytes. */
4047                if (max_char < 65535)
4048                    new_max = 65535;
4049                break;
4050            case 4:
4051                /* Code point will be above 0xFFFF for sure in this case. */
4052                new_max = 65537;
4053                break;
4054            /* Internal error, this should be caught by the first if */
4055            case 1:
4056            default:
4057                assert(0 && "Impossible case in utf8_max_char_and_size");
4058                err = 1;
4059            }
4060            /* Instead of number of overall bytes for this code point,
4061               n contains the number of following bytes: */
4062            --n;
4063            /* Check if the follow up chars are all valid continuation bytes */
4064            if (n >= 1) {
4065                const unsigned char *cont;
4066                if ((p + n) >= end) {
4067                    if (consumed == 0)
4068                        /* incomplete data, non-incremental decoding */
4069                        err = 1;
4070                    break;
4071                }
4072                for (cont = p + 1; cont < (p + n); ++cont) {
4073                    if ((*cont & 0xc0) != 0x80) {
4074                        err = 1;
4075                        break;
4076                    }
4077                }
4078                p += n;
4079            }
4080            else
4081                err = 1;
4082            max_char = new_max;
4083        }
4084    }
4085
4086    if (unicode_size)
4087        *unicode_size = char_count;
4088    if (has_errors)
4089        *has_errors = err;
4090    return max_char;
4091}
4092
/* Store `value` at position `index` of `buf`, picking the element type
   from `kind`.  Like PyUnicode_WRITE, but PyUnicode_WCHAR_KIND is also
   accepted, in which case `buf` is written as a Py_UNICODE array (the
   wstr field of the legacy unicode representation).  Any kind other
   than WCHAR, 1BYTE or 2BYTE is written as Py_UCS4.  Exactly one of
   the stores is executed, so `index` and `value` are evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int kind_ = (kind); \
        switch (kind_) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
4107
4108PyObject *
4109PyUnicode_DecodeUTF8Stateful(const char *s,
4110                             Py_ssize_t size,
4111                             const char *errors,
4112                             Py_ssize_t *consumed)
4113{
4114    const char *starts = s;
4115    int n;
4116    int k;
4117    Py_ssize_t startinpos;
4118    Py_ssize_t endinpos;
4119    const char *e, *aligned_end;
4120    PyUnicodeObject *unicode;
4121    const char *errmsg = "";
4122    PyObject *errorHandler = NULL;
4123    PyObject *exc = NULL;
4124    Py_UCS4 maxchar = 0;
4125    Py_ssize_t unicode_size;
4126    Py_ssize_t i;
4127    int kind;
4128    void *data;
4129    int has_errors;
4130    Py_UNICODE *error_outptr;
4131#if SIZEOF_WCHAR_T == 2
4132    Py_ssize_t wchar_offset = 0;
4133#endif
4134
4135    if (size == 0) {
4136        if (consumed)
4137            *consumed = 0;
4138        return (PyObject *)PyUnicode_New(0, 0);
4139    }
4140    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4141                                                consumed, &has_errors);
4142    if (has_errors) {
4143        unicode = _PyUnicode_New(size);
4144        if (!unicode)
4145            return NULL;
4146        kind = PyUnicode_WCHAR_KIND;
4147        data = PyUnicode_AS_UNICODE(unicode);
4148        assert(data != NULL);
4149    }
4150    else {
4151        unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4152        if (!unicode)
4153            return NULL;
4154        /* When the string is ASCII only, just use memcpy and return.
4155           unicode_size may be != size if there is an incomplete UTF-8
4156           sequence at the end of the ASCII block.  */
4157        if (maxchar < 128 && size == unicode_size) {
4158            Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4159            return (PyObject *)unicode;
4160        }
4161        kind = PyUnicode_KIND(unicode);
4162        data = PyUnicode_DATA(unicode);
4163    }
4164    /* Unpack UTF-8 encoded data */
4165    i = 0;
4166    e = s + size;
4167    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4168
4169    while (s < e) {
4170        Py_UCS4 ch = (unsigned char)*s;
4171
4172        if (ch < 0x80) {
4173            /* Fast path for runs of ASCII characters. Given that common UTF-8
4174               input will consist of an overwhelming majority of ASCII
4175               characters, we try to optimize for this case by checking
4176               as many characters as a C 'long' can contain.
4177               First, check if we can do an aligned read, as most CPUs have
4178               a penalty for unaligned reads.
4179            */
4180            if (!((size_t) s & LONG_PTR_MASK)) {
4181                /* Help register allocation */
4182                register const char *_s = s;
4183                register Py_ssize_t _i = i;
4184                while (_s < aligned_end) {
4185                    /* Read a whole long at a time (either 4 or 8 bytes),
4186                       and do a fast unrolled copy if it only contains ASCII
4187                       characters. */
4188                    unsigned long value = *(unsigned long *) _s;
4189                    if (value & ASCII_CHAR_MASK)
4190                        break;
4191                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4192                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4193                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4194                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
4195#if (SIZEOF_LONG == 8)
4196                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4197                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4198                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4199                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
4200#endif
4201                    _s += SIZEOF_LONG;
4202                    _i += SIZEOF_LONG;
4203                }
4204                s = _s;
4205                i = _i;
4206                if (s == e)
4207                    break;
4208                ch = (unsigned char)*s;
4209            }
4210        }
4211
4212        if (ch < 0x80) {
4213            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4214            s++;
4215            continue;
4216        }
4217
4218        n = utf8_code_length[ch];
4219
4220        if (s + n > e) {
4221            if (consumed)
4222                break;
4223            else {
4224                errmsg = "unexpected end of data";
4225                startinpos = s-starts;
4226                endinpos = startinpos+1;
4227                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4228                    endinpos++;
4229                goto utf8Error;
4230            }
4231        }
4232
4233        switch (n) {
4234
4235        case 0:
4236            errmsg = "invalid start byte";
4237            startinpos = s-starts;
4238            endinpos = startinpos+1;
4239            goto utf8Error;
4240
4241        case 1:
4242            errmsg = "internal error";
4243            startinpos = s-starts;
4244            endinpos = startinpos+1;
4245            goto utf8Error;
4246
4247        case 2:
4248            if ((s[1] & 0xc0) != 0x80) {
4249                errmsg = "invalid continuation byte";
4250                startinpos = s-starts;
4251                endinpos = startinpos + 1;
4252                goto utf8Error;
4253            }
4254            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4255            assert ((ch > 0x007F) && (ch <= 0x07FF));
4256            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4257            break;
4258
4259        case 3:
4260            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4261               will result in surrogates in range d800-dfff. Surrogates are
4262               not valid UTF-8 so they are rejected.
4263               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4264               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4265            if ((s[1] & 0xc0) != 0x80 ||
4266                (s[2] & 0xc0) != 0x80 ||
4267                ((unsigned char)s[0] == 0xE0 &&
4268                 (unsigned char)s[1] < 0xA0) ||
4269                ((unsigned char)s[0] == 0xED &&
4270                 (unsigned char)s[1] > 0x9F)) {
4271                errmsg = "invalid continuation byte";
4272                startinpos = s-starts;
4273                endinpos = startinpos + 1;
4274
4275                /* if s[1] first two bits are 1 and 0, then the invalid
4276                   continuation byte is s[2], so increment endinpos by 1,
4277                   if not, s[1] is invalid and endinpos doesn't need to
4278                   be incremented. */
4279                if ((s[1] & 0xC0) == 0x80)
4280                    endinpos++;
4281                goto utf8Error;
4282            }
4283            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4284            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4285            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4286            break;
4287
4288        case 4:
4289            if ((s[1] & 0xc0) != 0x80 ||
4290                (s[2] & 0xc0) != 0x80 ||
4291                (s[3] & 0xc0) != 0x80 ||
4292                ((unsigned char)s[0] == 0xF0 &&
4293                 (unsigned char)s[1] < 0x90) ||
4294                ((unsigned char)s[0] == 0xF4 &&
4295                 (unsigned char)s[1] > 0x8F)) {
4296                errmsg = "invalid continuation byte";
4297                startinpos = s-starts;
4298                endinpos = startinpos + 1;
4299                if ((s[1] & 0xC0) == 0x80) {
4300                    endinpos++;
4301                    if ((s[2] & 0xC0) == 0x80)
4302                        endinpos++;
4303                }
4304                goto utf8Error;
4305            }
4306            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4307                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4308            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4309
4310            /* If the string is flexible or we have native UCS-4, write
4311               directly.. */
4312            if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4313                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4314
4315            else {
4316                /* compute and append the two surrogates: */
4317
4318                /* translate from 10000..10FFFF to 0..FFFF */
4319                ch -= 0x10000;
4320
4321                /* high surrogate = top 10 bits added to D800 */
4322                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4323                                       (Py_UNICODE)(0xD800 + (ch >> 10)));
4324
4325                /* low surrogate = bottom 10 bits added to DC00 */
4326                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4327                                       (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4328            }
4329#if SIZEOF_WCHAR_T == 2
4330            wchar_offset++;
4331#endif
4332            break;
4333        }
4334        s += n;
4335        continue;
4336
4337      utf8Error:
4338        /* If this is not yet a resizable string, make it one.. */
4339        if (kind != PyUnicode_WCHAR_KIND) {
4340            const Py_UNICODE *u;
4341            PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4342            if (!new_unicode)
4343                goto onError;
4344            u = PyUnicode_AsUnicode((PyObject *)unicode);
4345            if (!u)
4346                goto onError;
4347#if SIZEOF_WCHAR_T == 2
4348            i += wchar_offset;
4349#endif
4350            Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4351            Py_DECREF(unicode);
4352            unicode = new_unicode;
4353            kind = 0;
4354            data = PyUnicode_AS_UNICODE(new_unicode);
4355            assert(data != NULL);
4356        }
4357        error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
4358        if (unicode_decode_call_errorhandler(
4359                errors, &errorHandler,
4360                "utf8", errmsg,
4361                &starts, &e, &startinpos, &endinpos, &exc, &s,
4362                &unicode, &i, &error_outptr))
4363            goto onError;
4364        /* Update data because unicode_decode_call_errorhandler might have
4365           re-created or resized the unicode object. */
4366        data = PyUnicode_AS_UNICODE(unicode);
4367        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4368    }
4369    /* Ensure the unicode_size calculation above was correct: */
4370    assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4371
4372    if (consumed)
4373        *consumed = s-starts;
4374
4375    /* Adjust length and ready string when it contained errors and
4376       is of the old resizable kind. */
4377    if (kind == PyUnicode_WCHAR_KIND) {
4378        if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
4379            goto onError;
4380    }
4381
4382    Py_XDECREF(errorHandler);
4383    Py_XDECREF(exc);
4384#ifndef DONT_MAKE_RESULT_READY
4385    if (_PyUnicode_READY_REPLACE(&unicode)) {
4386        Py_DECREF(unicode);
4387        return NULL;
4388    }
4389#endif
4390    assert(_PyUnicode_CheckConsistency(unicode, 1));
4391    return (PyObject *)unicode;
4392
4393  onError:
4394    Py_XDECREF(errorHandler);
4395    Py_XDECREF(exc);
4396    Py_DECREF(unicode);
4397    return NULL;
4398}
4399
4400#undef WRITE_FLEXIBLE_OR_WSTR
4401
4402#ifdef __APPLE__
4403
4404/* Simplified UTF-8 decoder using surrogateescape error handler,
4405   used to decode the command line arguments on Mac OS X. */
4406
4407wchar_t*
4408_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4409{
4410    int n;
4411    const char *e;
4412    wchar_t *unicode, *p;
4413
4414    /* Note: size will always be longer than the resulting Unicode
4415       character count */
4416    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4417        PyErr_NoMemory();
4418        return NULL;
4419    }
4420    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4421    if (!unicode)
4422        return NULL;
4423
4424    /* Unpack UTF-8 encoded data */
4425    p = unicode;
4426    e = s + size;
4427    while (s < e) {
4428        Py_UCS4 ch = (unsigned char)*s;
4429
4430        if (ch < 0x80) {
4431            *p++ = (wchar_t)ch;
4432            s++;
4433            continue;
4434        }
4435
4436        n = utf8_code_length[ch];
4437        if (s + n > e) {
4438            goto surrogateescape;
4439        }
4440
4441        switch (n) {
4442        case 0:
4443        case 1:
4444            goto surrogateescape;
4445
4446        case 2:
4447            if ((s[1] & 0xc0) != 0x80)
4448                goto surrogateescape;
4449            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4450            assert ((ch > 0x007F) && (ch <= 0x07FF));
4451            *p++ = (wchar_t)ch;
4452            break;
4453
4454        case 3:
4455            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4456               will result in surrogates in range d800-dfff. Surrogates are
4457               not valid UTF-8 so they are rejected.
4458               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4459               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4460            if ((s[1] & 0xc0) != 0x80 ||
4461                (s[2] & 0xc0) != 0x80 ||
4462                ((unsigned char)s[0] == 0xE0 &&
4463                 (unsigned char)s[1] < 0xA0) ||
4464                ((unsigned char)s[0] == 0xED &&
4465                 (unsigned char)s[1] > 0x9F)) {
4466
4467                goto surrogateescape;
4468            }
4469            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4470            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4471            *p++ = (wchar_t)ch;
4472            break;
4473
4474        case 4:
4475            if ((s[1] & 0xc0) != 0x80 ||
4476                (s[2] & 0xc0) != 0x80 ||
4477                (s[3] & 0xc0) != 0x80 ||
4478                ((unsigned char)s[0] == 0xF0 &&
4479                 (unsigned char)s[1] < 0x90) ||
4480                ((unsigned char)s[0] == 0xF4 &&
4481                 (unsigned char)s[1] > 0x8F)) {
4482                goto surrogateescape;
4483            }
4484            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4485                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4486            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4487
4488#if SIZEOF_WCHAR_T == 4
4489            *p++ = (wchar_t)ch;
4490#else
4491            /*  compute and append the two surrogates: */
4492
4493            /*  translate from 10000..10FFFF to 0..FFFF */
4494            ch -= 0x10000;
4495
4496            /*  high surrogate = top 10 bits added to D800 */
4497            *p++ = (wchar_t)(0xD800 + (ch >> 10));
4498
4499            /*  low surrogate = bottom 10 bits added to DC00 */
4500            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4501#endif
4502            break;
4503        }
4504        s += n;
4505        continue;
4506
4507      surrogateescape:
4508        *p++ = 0xDC00 + ch;
4509        s++;
4510    }
4511    *p = L'\0';
4512    return unicode;
4513}
4514
4515#endif /* __APPLE__ */
4516
4517/* Primary internal function which creates utf8 encoded bytes objects.
4518
4519   Allocation strategy:  if the string is short, convert into a stack buffer
4520   and allocate exactly as much space needed at the end.  Else allocate the
4521   maximum possible needed (4 result bytes per Unicode character), and return
4522   the excess memory at the end.
4523*/
4524PyObject *
4525_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
4526{
4527#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
4528
4529    Py_ssize_t i;                /* index into s of next input byte */
4530    PyObject *result;            /* result string object */
4531    char *p;                     /* next free byte in output buffer */
4532    Py_ssize_t nallocated;      /* number of result bytes allocated */
4533    Py_ssize_t nneeded;            /* number of result bytes needed */
4534    char stackbuf[MAX_SHORT_UNICHARS * 4];
4535    PyObject *errorHandler = NULL;
4536    PyObject *exc = NULL;
4537    int kind;
4538    void *data;
4539    Py_ssize_t size;
4540    PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4541#if SIZEOF_WCHAR_T == 2
4542    Py_ssize_t wchar_offset = 0;
4543#endif
4544
4545    if (!PyUnicode_Check(unicode)) {
4546        PyErr_BadArgument();
4547        return NULL;
4548    }
4549
4550    if (PyUnicode_READY(unicode) == -1)
4551        return NULL;
4552
4553    if (PyUnicode_UTF8(unicode))
4554        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4555                                         PyUnicode_UTF8_LENGTH(unicode));
4556
4557    kind = PyUnicode_KIND(unicode);
4558    data = PyUnicode_DATA(unicode);
4559    size = PyUnicode_GET_LENGTH(unicode);
4560
4561    assert(size >= 0);
4562
4563    if (size <= MAX_SHORT_UNICHARS) {
4564        /* Write into the stack buffer; nallocated can't overflow.
4565         * At the end, we'll allocate exactly as much heap space as it
4566         * turns out we need.
4567         */
4568        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4569        result = NULL;   /* will allocate after we're done */
4570        p = stackbuf;
4571    }
4572    else {
4573        /* Overallocate on the heap, and give the excess back at the end. */
4574        nallocated = size * 4;
4575        if (nallocated / 4 != size)  /* overflow! */
4576            return PyErr_NoMemory();
4577        result = PyBytes_FromStringAndSize(NULL, nallocated);
4578        if (result == NULL)
4579            return NULL;
4580        p = PyBytes_AS_STRING(result);
4581    }
4582
4583    for (i = 0; i < size;) {
4584        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4585
4586        if (ch < 0x80)
4587            /* Encode ASCII */
4588            *p++ = (char) ch;
4589
4590        else if (ch < 0x0800) {
4591            /* Encode Latin-1 */
4592            *p++ = (char)(0xc0 | (ch >> 6));
4593            *p++ = (char)(0x80 | (ch & 0x3f));
4594        } else if (0xD800 <= ch && ch <= 0xDFFF) {
4595            Py_ssize_t newpos;
4596            PyObject *rep;
4597            Py_ssize_t repsize, k, startpos;
4598            startpos = i-1;
4599#if SIZEOF_WCHAR_T == 2
4600            startpos += wchar_offset;
4601#endif
4602            rep = unicode_encode_call_errorhandler(
4603                  errors, &errorHandler, "utf-8", "surrogates not allowed",
4604                  PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4605                  &exc, startpos, startpos+1, &newpos);
4606            if (!rep)
4607                goto error;
4608
4609            if (PyBytes_Check(rep))
4610                repsize = PyBytes_GET_SIZE(rep);
4611            else
4612                repsize = PyUnicode_GET_SIZE(rep);
4613
4614            if (repsize > 4) {
4615                Py_ssize_t offset;
4616
4617                if (result == NULL)
4618                    offset = p - stackbuf;
4619                else
4620                    offset = p - PyBytes_AS_STRING(result);
4621
4622                if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4623                    /* integer overflow */
4624                    PyErr_NoMemory();
4625                    goto error;
4626                }
4627                nallocated += repsize - 4;
4628                if (result != NULL) {
4629                    if (_PyBytes_Resize(&result, nallocated) < 0)
4630                        goto error;
4631                } else {
4632                    result = PyBytes_FromStringAndSize(NULL, nallocated);
4633                    if (result == NULL)
4634                        goto error;
4635                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4636                }
4637                p = PyBytes_AS_STRING(result) + offset;
4638            }
4639
4640            if (PyBytes_Check(rep)) {
4641                char *prep = PyBytes_AS_STRING(rep);
4642                for(k = repsize; k > 0; k--)
4643                    *p++ = *prep++;
4644            } else /* rep is unicode */ {
4645                const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4646                Py_UNICODE c;
4647
4648                for(k=0; k<repsize; k++) {
4649                    c = prep[k];
4650                    if (0x80 <= c) {
4651                        raise_encode_exception(&exc, "utf-8",
4652                                               PyUnicode_AS_UNICODE(unicode),
4653                                               size, i-1, i,
4654                                               "surrogates not allowed");
4655                        goto error;
4656                    }
4657                    *p++ = (char)prep[k];
4658                }
4659            }
4660            Py_DECREF(rep);
4661        } else if (ch < 0x10000) {
4662            *p++ = (char)(0xe0 | (ch >> 12));
4663            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4664            *p++ = (char)(0x80 | (ch & 0x3f));
4665        } else /* ch >= 0x10000 */ {
4666            /* Encode UCS4 Unicode ordinals */
4667            *p++ = (char)(0xf0 | (ch >> 18));
4668            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4669            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4670            *p++ = (char)(0x80 | (ch & 0x3f));
4671#if SIZEOF_WCHAR_T == 2
4672            wchar_offset++;
4673#endif
4674        }
4675    }
4676
4677    if (result == NULL) {
4678        /* This was stack allocated. */
4679        nneeded = p - stackbuf;
4680        assert(nneeded <= nallocated);
4681        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4682    }
4683    else {
4684        /* Cut back to size actually needed. */
4685        nneeded = p - PyBytes_AS_STRING(result);
4686        assert(nneeded <= nallocated);
4687        _PyBytes_Resize(&result, nneeded);
4688    }
4689
4690    Py_XDECREF(errorHandler);
4691    Py_XDECREF(exc);
4692    return result;
4693 error:
4694    Py_XDECREF(errorHandler);
4695    Py_XDECREF(exc);
4696    Py_XDECREF(result);
4697    return NULL;
4698
4699#undef MAX_SHORT_UNICHARS
4700}
4701
4702PyObject *
4703PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4704                     Py_ssize_t size,
4705                     const char *errors)
4706{
4707    PyObject *v, *unicode;
4708
4709    unicode = PyUnicode_FromUnicode(s, size);
4710    if (unicode == NULL)
4711        return NULL;
4712    v = _PyUnicode_AsUTF8String(unicode, errors);
4713    Py_DECREF(unicode);
4714    return v;
4715}
4716
4717PyObject *
4718PyUnicode_AsUTF8String(PyObject *unicode)
4719{
4720    return _PyUnicode_AsUTF8String(unicode, NULL);
4721}
4722
4723/* --- UTF-32 Codec ------------------------------------------------------- */
4724
4725PyObject *
4726PyUnicode_DecodeUTF32(const char *s,
4727                      Py_ssize_t size,
4728                      const char *errors,
4729                      int *byteorder)
4730{
4731    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4732}
4733
4734PyObject *
4735PyUnicode_DecodeUTF32Stateful(const char *s,
4736                              Py_ssize_t size,
4737                              const char *errors,
4738                              int *byteorder,
4739                              Py_ssize_t *consumed)
4740{
4741    const char *starts = s;
4742    Py_ssize_t startinpos;
4743    Py_ssize_t endinpos;
4744    Py_ssize_t outpos;
4745    PyUnicodeObject *unicode;
4746    Py_UNICODE *p;
4747#ifndef Py_UNICODE_WIDE
4748    int pairs = 0;
4749    const unsigned char *qq;
4750#else
4751    const int pairs = 0;
4752#endif
4753    const unsigned char *q, *e;
4754    int bo = 0;       /* assume native ordering by default */
4755    const char *errmsg = "";
4756    /* Offsets from q for retrieving bytes in the right order. */
4757#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4758    int iorder[] = {0, 1, 2, 3};
4759#else
4760    int iorder[] = {3, 2, 1, 0};
4761#endif
4762    PyObject *errorHandler = NULL;
4763    PyObject *exc = NULL;
4764
4765    q = (unsigned char *)s;
4766    e = q + size;
4767
4768    if (byteorder)
4769        bo = *byteorder;
4770
4771    /* Check for BOM marks (U+FEFF) in the input and adjust current
4772       byte order setting accordingly. In native mode, the leading BOM
4773       mark is skipped, in all other modes, it is copied to the output
4774       stream as-is (giving a ZWNBSP character). */
4775    if (bo == 0) {
4776        if (size >= 4) {
4777            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4778                (q[iorder[1]] << 8) | q[iorder[0]];
4779#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4780            if (bom == 0x0000FEFF) {
4781                q += 4;
4782                bo = -1;
4783            }
4784            else if (bom == 0xFFFE0000) {
4785                q += 4;
4786                bo = 1;
4787            }
4788#else
4789            if (bom == 0x0000FEFF) {
4790                q += 4;
4791                bo = 1;
4792            }
4793            else if (bom == 0xFFFE0000) {
4794                q += 4;
4795                bo = -1;
4796            }
4797#endif
4798        }
4799    }
4800
4801    if (bo == -1) {
4802        /* force LE */
4803        iorder[0] = 0;
4804        iorder[1] = 1;
4805        iorder[2] = 2;
4806        iorder[3] = 3;
4807    }
4808    else if (bo == 1) {
4809        /* force BE */
4810        iorder[0] = 3;
4811        iorder[1] = 2;
4812        iorder[2] = 1;
4813        iorder[3] = 0;
4814    }
4815
4816    /* On narrow builds we split characters outside the BMP into two
4817       codepoints => count how much extra space we need. */
4818#ifndef Py_UNICODE_WIDE
4819    for (qq = q; qq < e; qq += 4)
4820        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4821            pairs++;
4822#endif
4823
4824    /* This might be one to much, because of a BOM */
4825    unicode = _PyUnicode_New((size+3)/4+pairs);
4826    if (!unicode)
4827        return NULL;
4828    if (size == 0)
4829        return (PyObject *)unicode;
4830
4831    /* Unpack UTF-32 encoded data */
4832    p = PyUnicode_AS_UNICODE(unicode);
4833
4834    while (q < e) {
4835        Py_UCS4 ch;
4836        /* remaining bytes at the end? (size should be divisible by 4) */
4837        if (e-q<4) {
4838            if (consumed)
4839                break;
4840            errmsg = "truncated data";
4841            startinpos = ((const char *)q)-starts;
4842            endinpos = ((const char *)e)-starts;
4843            goto utf32Error;
4844            /* The remaining input chars are ignored if the callback
4845               chooses to skip the input */
4846        }
4847        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4848            (q[iorder[1]] << 8) | q[iorder[0]];
4849
4850        if (ch >= 0x110000)
4851        {
4852            errmsg = "codepoint not in range(0x110000)";
4853            startinpos = ((const char *)q)-starts;
4854            endinpos = startinpos+4;
4855            goto utf32Error;
4856        }
4857#ifndef Py_UNICODE_WIDE
4858        if (ch >= 0x10000)
4859        {
4860            *p++ = 0xD800 | ((ch-0x10000) >> 10);
4861            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4862        }
4863        else
4864#endif
4865            *p++ = ch;
4866        q += 4;
4867        continue;
4868      utf32Error:
4869        outpos = p-PyUnicode_AS_UNICODE(unicode);
4870        if (unicode_decode_call_errorhandler(
4871                errors, &errorHandler,
4872                "utf32", errmsg,
4873                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4874                &unicode, &outpos, &p))
4875            goto onError;
4876    }
4877
4878    if (byteorder)
4879        *byteorder = bo;
4880
4881    if (consumed)
4882        *consumed = (const char *)q-starts;
4883
4884    /* Adjust length */
4885    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
4886        goto onError;
4887
4888    Py_XDECREF(errorHandler);
4889    Py_XDECREF(exc);
4890#ifndef DONT_MAKE_RESULT_READY
4891    if (_PyUnicode_READY_REPLACE(&unicode)) {
4892        Py_DECREF(unicode);
4893        return NULL;
4894    }
4895#endif
4896    assert(_PyUnicode_CheckConsistency(unicode, 1));
4897    return (PyObject *)unicode;
4898
4899  onError:
4900    Py_DECREF(unicode);
4901    Py_XDECREF(errorHandler);
4902    Py_XDECREF(exc);
4903    return NULL;
4904}
4905
4906PyObject *
4907PyUnicode_EncodeUTF32(const Py_UNICODE *s,
4908                      Py_ssize_t size,
4909                      const char *errors,
4910                      int byteorder)
4911{
4912    PyObject *v;
4913    unsigned char *p;
4914    Py_ssize_t nsize, bytesize;
4915#ifndef Py_UNICODE_WIDE
4916    Py_ssize_t i, pairs;
4917#else
4918    const int pairs = 0;
4919#endif
4920    /* Offsets from p for storing byte pairs in the right order. */
4921#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4922    int iorder[] = {0, 1, 2, 3};
4923#else
4924    int iorder[] = {3, 2, 1, 0};
4925#endif
4926
4927#define STORECHAR(CH)                           \
4928    do {                                        \
4929        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
4930        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
4931        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
4932        p[iorder[0]] = (CH) & 0xff;             \
4933        p += 4;                                 \
4934    } while(0)
4935
4936    /* In narrow builds we can output surrogate pairs as one codepoint,
4937       so we need less space. */
4938#ifndef Py_UNICODE_WIDE
4939    for (i = pairs = 0; i < size-1; i++)
4940        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4941            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4942            pairs++;
4943#endif
4944    nsize = (size - pairs + (byteorder == 0));
4945    bytesize = nsize * 4;
4946    if (bytesize / 4 != nsize)
4947        return PyErr_NoMemory();
4948    v = PyBytes_FromStringAndSize(NULL, bytesize);
4949    if (v == NULL)
4950        return NULL;
4951
4952    p = (unsigned char *)PyBytes_AS_STRING(v);
4953    if (byteorder == 0)
4954        STORECHAR(0xFEFF);
4955    if (size == 0)
4956        goto done;
4957
4958    if (byteorder == -1) {
4959        /* force LE */
4960        iorder[0] = 0;
4961        iorder[1] = 1;
4962        iorder[2] = 2;
4963        iorder[3] = 3;
4964    }
4965    else if (byteorder == 1) {
4966        /* force BE */
4967        iorder[0] = 3;
4968        iorder[1] = 2;
4969        iorder[2] = 1;
4970        iorder[3] = 0;
4971    }
4972
4973    while (size-- > 0) {
4974        Py_UCS4 ch = *s++;
4975#ifndef Py_UNICODE_WIDE
4976        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4977            Py_UCS4 ch2 = *s;
4978            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4979                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4980                s++;
4981                size--;
4982            }
4983        }
4984#endif
4985        STORECHAR(ch);
4986    }
4987
4988  done:
4989    return v;
4990#undef STORECHAR
4991}
4992
4993PyObject *
4994PyUnicode_AsUTF32String(PyObject *unicode)
4995{
4996    if (!PyUnicode_Check(unicode)) {
4997        PyErr_BadArgument();
4998        return NULL;
4999    }
5000    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
5001                                 PyUnicode_GET_SIZE(unicode),
5002                                 NULL,
5003                                 0);
5004}
5005
5006/* --- UTF-16 Codec ------------------------------------------------------- */
5007
5008PyObject *
5009PyUnicode_DecodeUTF16(const char *s,
5010                      Py_ssize_t size,
5011                      const char *errors,
5012                      int *byteorder)
5013{
5014    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5015}
5016
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   Each mask selects one bit per 16-bit unit of the 'long': bit 15 (0x8000)
   for FAST_CHAR_MASK and bit 7 (0x0080) for SWAPPED_FAST_CHAR_MASK.  The
   UTF-16 decoder leaves its fast path as soon as a word ANDed with the
   applicable mask is non-zero.  Only 4- and 8-byte 'long' are supported;
   any other size is a compile-time error.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
5033
5034PyObject *
5035PyUnicode_DecodeUTF16Stateful(const char *s,
5036                              Py_ssize_t size,
5037                              const char *errors,
5038                              int *byteorder,
5039                              Py_ssize_t *consumed)
5040{
5041    const char *starts = s;
5042    Py_ssize_t startinpos;
5043    Py_ssize_t endinpos;
5044    Py_ssize_t outpos;
5045    PyUnicodeObject *unicode;
5046    Py_UNICODE *p;
5047    const unsigned char *q, *e, *aligned_end;
5048    int bo = 0;       /* assume native ordering by default */
5049    int native_ordering = 0;
5050    const char *errmsg = "";
5051    /* Offsets from q for retrieving byte pairs in the right order. */
5052#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5053    int ihi = 1, ilo = 0;
5054#else
5055    int ihi = 0, ilo = 1;
5056#endif
5057    PyObject *errorHandler = NULL;
5058    PyObject *exc = NULL;
5059
5060    /* Note: size will always be longer than the resulting Unicode
5061       character count */
5062    unicode = _PyUnicode_New(size);
5063    if (!unicode)
5064        return NULL;
5065    if (size == 0)
5066        return (PyObject *)unicode;
5067
5068    /* Unpack UTF-16 encoded data */
5069    p = PyUnicode_AS_UNICODE(unicode);
5070    q = (unsigned char *)s;
5071    e = q + size - 1;
5072
5073    if (byteorder)
5074        bo = *byteorder;
5075
5076    /* Check for BOM marks (U+FEFF) in the input and adjust current
5077       byte order setting accordingly. In native mode, the leading BOM
5078       mark is skipped, in all other modes, it is copied to the output
5079       stream as-is (giving a ZWNBSP character). */
5080    if (bo == 0) {
5081        if (size >= 2) {
5082            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
5083#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5084            if (bom == 0xFEFF) {
5085                q += 2;
5086                bo = -1;
5087            }
5088            else if (bom == 0xFFFE) {
5089                q += 2;
5090                bo = 1;
5091            }
5092#else
5093            if (bom == 0xFEFF) {
5094                q += 2;
5095                bo = 1;
5096            }
5097            else if (bom == 0xFFFE) {
5098                q += 2;
5099                bo = -1;
5100            }
5101#endif
5102        }
5103    }
5104
5105    if (bo == -1) {
5106        /* force LE */
5107        ihi = 1;
5108        ilo = 0;
5109    }
5110    else if (bo == 1) {
5111        /* force BE */
5112        ihi = 0;
5113        ilo = 1;
5114    }
5115#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5116    native_ordering = ilo < ihi;
5117#else
5118    native_ordering = ilo > ihi;
5119#endif
5120
5121    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
5122    while (q < e) {
5123        Py_UNICODE ch;
5124        /* First check for possible aligned read of a C 'long'. Unaligned
5125           reads are more expensive, better to defer to another iteration. */
5126        if (!((size_t) q & LONG_PTR_MASK)) {
5127            /* Fast path for runs of non-surrogate chars. */
5128            register const unsigned char *_q = q;
5129            Py_UNICODE *_p = p;
5130            if (native_ordering) {
5131                /* Native ordering is simple: as long as the input cannot
5132                   possibly contain a surrogate char, do an unrolled copy
5133                   of several 16-bit code points to the target object.
5134                   The non-surrogate check is done on several input bytes
5135                   at a time (as many as a C 'long' can contain). */
5136                while (_q < aligned_end) {
5137                    unsigned long data = * (unsigned long *) _q;
5138                    if (data & FAST_CHAR_MASK)
5139                        break;
5140                    _p[0] = ((unsigned short *) _q)[0];
5141                    _p[1] = ((unsigned short *) _q)[1];
5142#if (SIZEOF_LONG == 8)
5143                    _p[2] = ((unsigned short *) _q)[2];
5144                    _p[3] = ((unsigned short *) _q)[3];
5145#endif
5146                    _q += SIZEOF_LONG;
5147                    _p += SIZEOF_LONG / 2;
5148                }
5149            }
5150            else {
5151                /* Byteswapped ordering is similar, but we must decompose
5152                   the copy bytewise, and take care of zero'ing out the
5153                   upper bytes if the target object is in 32-bit units
5154                   (that is, in UCS-4 builds). */
5155                while (_q < aligned_end) {
5156                    unsigned long data = * (unsigned long *) _q;
5157                    if (data & SWAPPED_FAST_CHAR_MASK)
5158                        break;
5159                    /* Zero upper bytes in UCS-4 builds */
5160#if (Py_UNICODE_SIZE > 2)
5161                    _p[0] = 0;
5162                    _p[1] = 0;
5163#if (SIZEOF_LONG == 8)
5164                    _p[2] = 0;
5165                    _p[3] = 0;
5166#endif
5167#endif
5168                    /* Issue #4916; UCS-4 builds on big endian machines must
5169                       fill the two last bytes of each 4-byte unit. */
5170#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5171# define OFF 2
5172#else
5173# define OFF 0
5174#endif
5175                    ((unsigned char *) _p)[OFF + 1] = _q[0];
5176                    ((unsigned char *) _p)[OFF + 0] = _q[1];
5177                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5178                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5179#if (SIZEOF_LONG == 8)
5180                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5181                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5182                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5183                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5184#endif
5185#undef OFF
5186                    _q += SIZEOF_LONG;
5187                    _p += SIZEOF_LONG / 2;
5188                }
5189            }
5190            p = _p;
5191            q = _q;
5192            if (q >= e)
5193                break;
5194        }
5195        ch = (q[ihi] << 8) | q[ilo];
5196
5197        q += 2;
5198
5199        if (ch < 0xD800 || ch > 0xDFFF) {
5200            *p++ = ch;
5201            continue;
5202        }
5203
5204        /* UTF-16 code pair: */
5205        if (q > e) {
5206            errmsg = "unexpected end of data";
5207            startinpos = (((const char *)q) - 2) - starts;
5208            endinpos = ((const char *)e) + 1 - starts;
5209            goto utf16Error;
5210        }
5211        if (0xD800 <= ch && ch <= 0xDBFF) {
5212            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5213            q += 2;
5214            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5215#ifndef Py_UNICODE_WIDE
5216                *p++ = ch;
5217                *p++ = ch2;
5218#else
5219                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5220#endif
5221                continue;
5222            }
5223            else {
5224                errmsg = "illegal UTF-16 surrogate";
5225                startinpos = (((const char *)q)-4)-starts;
5226                endinpos = startinpos+2;
5227                goto utf16Error;
5228            }
5229
5230        }
5231        errmsg = "illegal encoding";
5232        startinpos = (((const char *)q)-2)-starts;
5233        endinpos = startinpos+2;
5234        /* Fall through to report the error */
5235
5236      utf16Error:
5237        outpos = p - PyUnicode_AS_UNICODE(unicode);
5238        if (unicode_decode_call_errorhandler(
5239                errors,
5240                &errorHandler,
5241                "utf16", errmsg,
5242                &starts,
5243                (const char **)&e,
5244                &startinpos,
5245                &endinpos,
5246                &exc,
5247                (const char **)&q,
5248                &unicode,
5249                &outpos,
5250                &p))
5251            goto onError;
5252    }
5253    /* remaining byte at the end? (size should be even) */
5254    if (e == q) {
5255        if (!consumed) {
5256            errmsg = "truncated data";
5257            startinpos = ((const char *)q) - starts;
5258            endinpos = ((const char *)e) + 1 - starts;
5259            outpos = p - PyUnicode_AS_UNICODE(unicode);
5260            if (unicode_decode_call_errorhandler(
5261                    errors,
5262                    &errorHandler,
5263                    "utf16", errmsg,
5264                    &starts,
5265                    (const char **)&e,
5266                    &startinpos,
5267                    &endinpos,
5268                    &exc,
5269                    (const char **)&q,
5270                    &unicode,
5271                    &outpos,
5272                    &p))
5273                goto onError;
5274            /* The remaining input chars are ignored if the callback
5275               chooses to skip the input */
5276        }
5277    }
5278
5279    if (byteorder)
5280        *byteorder = bo;
5281
5282    if (consumed)
5283        *consumed = (const char *)q-starts;
5284
5285    /* Adjust length */
5286    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
5287        goto onError;
5288
5289    Py_XDECREF(errorHandler);
5290    Py_XDECREF(exc);
5291#ifndef DONT_MAKE_RESULT_READY
5292    if (_PyUnicode_READY_REPLACE(&unicode)) {
5293        Py_DECREF(unicode);
5294        return NULL;
5295    }
5296#endif
5297    assert(_PyUnicode_CheckConsistency(unicode, 1));
5298    return (PyObject *)unicode;
5299
5300  onError:
5301    Py_DECREF(unicode);
5302    Py_XDECREF(errorHandler);
5303    Py_XDECREF(exc);
5304    return NULL;
5305}
5306
5307#undef FAST_CHAR_MASK
5308#undef SWAPPED_FAST_CHAR_MASK
5309
5310PyObject *
5311PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5312                      Py_ssize_t size,
5313                      const char *errors,
5314                      int byteorder)
5315{
5316    PyObject *v;
5317    unsigned char *p;
5318    Py_ssize_t nsize, bytesize;
5319#ifdef Py_UNICODE_WIDE
5320    Py_ssize_t i, pairs;
5321#else
5322    const int pairs = 0;
5323#endif
5324    /* Offsets from p for storing byte pairs in the right order. */
5325#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5326    int ihi = 1, ilo = 0;
5327#else
5328    int ihi = 0, ilo = 1;
5329#endif
5330
5331#define STORECHAR(CH)                           \
5332    do {                                        \
5333        p[ihi] = ((CH) >> 8) & 0xff;            \
5334        p[ilo] = (CH) & 0xff;                   \
5335        p += 2;                                 \
5336    } while(0)
5337
5338#ifdef Py_UNICODE_WIDE
5339    for (i = pairs = 0; i < size; i++)
5340        if (s[i] >= 0x10000)
5341            pairs++;
5342#endif
5343    /* 2 * (size + pairs + (byteorder == 0)) */
5344    if (size > PY_SSIZE_T_MAX ||
5345        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5346        return PyErr_NoMemory();
5347    nsize = size + pairs + (byteorder == 0);
5348    bytesize = nsize * 2;
5349    if (bytesize / 2 != nsize)
5350        return PyErr_NoMemory();
5351    v = PyBytes_FromStringAndSize(NULL, bytesize);
5352    if (v == NULL)
5353        return NULL;
5354
5355    p = (unsigned char *)PyBytes_AS_STRING(v);
5356    if (byteorder == 0)
5357        STORECHAR(0xFEFF);
5358    if (size == 0)
5359        goto done;
5360
5361    if (byteorder == -1) {
5362        /* force LE */
5363        ihi = 1;
5364        ilo = 0;
5365    }
5366    else if (byteorder == 1) {
5367        /* force BE */
5368        ihi = 0;
5369        ilo = 1;
5370    }
5371
5372    while (size-- > 0) {
5373        Py_UNICODE ch = *s++;
5374        Py_UNICODE ch2 = 0;
5375#ifdef Py_UNICODE_WIDE
5376        if (ch >= 0x10000) {
5377            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5378            ch  = 0xD800 | ((ch-0x10000) >> 10);
5379        }
5380#endif
5381        STORECHAR(ch);
5382        if (ch2)
5383            STORECHAR(ch2);
5384    }
5385
5386  done:
5387    return v;
5388#undef STORECHAR
5389}
5390
5391PyObject *
5392PyUnicode_AsUTF16String(PyObject *unicode)
5393{
5394    if (!PyUnicode_Check(unicode)) {
5395        PyErr_BadArgument();
5396        return NULL;
5397    }
5398    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
5399                                 PyUnicode_GET_SIZE(unicode),
5400                                 NULL,
5401                                 0);
5402}
5403
5404/* --- Unicode Escape Codec ----------------------------------------------- */
5405
/* Helper function for PyUnicode_DecodeUnicodeEscape.

   Scans the size bytes at s and works out how many characters the
   unescaped text would contain, provided that text is guaranteed to be
   pure ASCII.

   Returns -1 when that guarantee cannot be given: size is negative, a
   byte above 127 occurs, the input ends right after a backslash, or an
   octal, \x, \u, \U or \N escape is present.  Otherwise returns the
   length of the buffer required to hold the result.
   */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)s;
    Py_ssize_t pos = 0;
    Py_ssize_t count = 0;

    if (size < 0)
        return -1;

    while (pos < size) {
        unsigned char c = bytes[pos++];

        if (c > 127)
            return -1;              /* non-ASCII byte */

        if (c != '\\') {
            count++;                /* ordinary character */
            continue;
        }

        /* A backslash must be followed by another ASCII byte. */
        if (pos >= size)
            return -1;
        c = bytes[pos++];
        if (c > 127)
            return -1;

        switch (c) {
        case '\n':
            /* backslash + newline contributes nothing */
            break;
        case '\\': case '\'': case '\"':
        case 'b': case 'f': case 't':
        case 'n': case 'r': case 'v': case 'a':
            /* simple escape: exactly one resulting character */
            count++;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* the resulting character may lie outside ASCII */
            return -1;
        default:
            /* unknown escape: backslash and character are both kept */
            count += 2;
            break;
        }
    }
    return count;
}
5460
5461/* Similar to PyUnicode_WRITE but either write into wstr field
5462   or treat string as ASCII. */
5463#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5464    do { \
5465        if ((kind) != PyUnicode_WCHAR_KIND) \
5466            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5467        else \
5468            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5469    } while (0)
5470
5471#define WRITE_WSTR(buf, index, value) \
5472    assert(kind == PyUnicode_WCHAR_KIND), \
5473    ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5474
5475
5476static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5477
5478PyObject *
5479PyUnicode_DecodeUnicodeEscape(const char *s,
5480                              Py_ssize_t size,
5481                              const char *errors)
5482{
5483    const char *starts = s;
5484    Py_ssize_t startinpos;
5485    Py_ssize_t endinpos;
5486    int j;
5487    PyUnicodeObject *v;
5488    Py_UNICODE *p;
5489    const char *end;
5490    char* message;
5491    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5492    PyObject *errorHandler = NULL;
5493    PyObject *exc = NULL;
5494    Py_ssize_t ascii_length;
5495    Py_ssize_t i;
5496    int kind;
5497    void *data;
5498
5499    ascii_length = length_of_escaped_ascii_string(s, size);
5500
5501    /* After length_of_escaped_ascii_string() there are two alternatives,
5502       either the string is pure ASCII with named escapes like \n, etc.
5503       and we determined it's exact size (common case)
5504       or it contains \x, \u, ... escape sequences.  then we create a
5505       legacy wchar string and resize it at the end of this function. */
5506    if (ascii_length >= 0) {
5507        v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5508        if (!v)
5509            goto onError;
5510        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5511        kind = PyUnicode_1BYTE_KIND;
5512        data = PyUnicode_DATA(v);
5513    }
5514    else {
5515        /* Escaped strings will always be longer than the resulting
5516           Unicode string, so we start with size here and then reduce the
5517           length after conversion to the true value.
5518           (but if the error callback returns a long replacement string
5519           we'll have to allocate more space) */
5520        v = _PyUnicode_New(size);
5521        if (!v)
5522            goto onError;
5523        kind = PyUnicode_WCHAR_KIND;
5524        data = PyUnicode_AS_UNICODE(v);
5525    }
5526
5527    if (size == 0)
5528        return (PyObject *)v;
5529    i = 0;
5530    end = s + size;
5531
5532    while (s < end) {
5533        unsigned char c;
5534        Py_UNICODE x;
5535        int digits;
5536
5537        if (kind == PyUnicode_WCHAR_KIND) {
5538            assert(i < _PyUnicode_WSTR_LENGTH(v));
5539        }
5540        else {
5541            /* The only case in which i == ascii_length is a backslash
5542               followed by a newline. */
5543            assert(i <= ascii_length);
5544        }
5545
5546        /* Non-escape characters are interpreted as Unicode ordinals */
5547        if (*s != '\\') {
5548            WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
5549            continue;
5550        }
5551
5552        startinpos = s-starts;
5553        /* \ - Escapes */
5554        s++;
5555        c = *s++;
5556        if (s > end)
5557            c = '\0'; /* Invalid after \ */
5558
5559        if (kind == PyUnicode_WCHAR_KIND) {
5560            assert(i < _PyUnicode_WSTR_LENGTH(v));
5561        }
5562        else {
5563            /* The only case in which i == ascii_length is a backslash
5564               followed by a newline. */
5565            assert(i < ascii_length || (i == ascii_length && c == '\n'));
5566        }
5567
5568        switch (c) {
5569
5570            /* \x escapes */
5571        case '\n': break;
5572        case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5573        case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5574        case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5575        case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5576        /* FF */
5577        case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5578        case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5579        case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5580        case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5581        /* VT */
5582        case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5583        /* BEL, not classic C */
5584        case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
5585
5586            /* \OOO (octal) escapes */
5587        case '0': case '1': case '2': case '3':
5588        case '4': case '5': case '6': case '7':
5589            x = s[-1] - '0';
5590            if (s < end && '0' <= *s && *s <= '7') {
5591                x = (x<<3) + *s++ - '0';
5592                if (s < end && '0' <= *s && *s <= '7')
5593                    x = (x<<3) + *s++ - '0';
5594            }
5595            WRITE_WSTR(data, i++, x);
5596            break;
5597
5598            /* hex escapes */
5599            /* \xXX */
5600        case 'x':
5601            digits = 2;
5602            message = "truncated \\xXX escape";
5603            goto hexescape;
5604
5605            /* \uXXXX */
5606        case 'u':
5607            digits = 4;
5608            message = "truncated \\uXXXX escape";
5609            goto hexescape;
5610
5611            /* \UXXXXXXXX */
5612        case 'U':
5613            digits = 8;
5614            message = "truncated \\UXXXXXXXX escape";
5615        hexescape:
5616            chr = 0;
5617            p = PyUnicode_AS_UNICODE(v) + i;
5618            if (s+digits>end) {
5619                endinpos = size;
5620                if (unicode_decode_call_errorhandler(
5621                        errors, &errorHandler,
5622                        "unicodeescape", "end of string in escape sequence",
5623                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5624                        &v, &i, &p))
5625                    goto onError;
5626                data = PyUnicode_AS_UNICODE(v);
5627                goto nextByte;
5628            }
5629            for (j = 0; j < digits; ++j) {
5630                c = (unsigned char) s[j];
5631                if (!Py_ISXDIGIT(c)) {
5632                    endinpos = (s+j+1)-starts;
5633                    p = PyUnicode_AS_UNICODE(v) + i;
5634                    if (unicode_decode_call_errorhandler(
5635                            errors, &errorHandler,
5636                            "unicodeescape", message,
5637                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5638                            &v, &i, &p))
5639                        goto onError;
5640                    data = PyUnicode_AS_UNICODE(v);
5641                    goto nextByte;
5642                }
5643                chr = (chr<<4) & ~0xF;
5644                if (c >= '0' && c <= '9')
5645                    chr += c - '0';
5646                else if (c >= 'a' && c <= 'f')
5647                    chr += 10 + c - 'a';
5648                else
5649                    chr += 10 + c - 'A';
5650            }
5651            s += j;
5652            if (chr == 0xffffffff && PyErr_Occurred())
5653                /* _decoding_error will have already written into the
5654                   target buffer. */
5655                break;
5656        store:
5657            /* when we get here, chr is a 32-bit unicode character */
5658            if (chr <= 0xffff)
5659                /* UCS-2 character */
5660                WRITE_WSTR(data, i++, chr);
5661            else if (chr <= 0x10ffff) {
5662                /* UCS-4 character. Either store directly, or as
5663                   surrogate pair. */
5664#ifdef Py_UNICODE_WIDE
5665                WRITE_WSTR(data, i++, chr);
5666#else
5667                chr -= 0x10000L;
5668                WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5669                WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
5670#endif
5671            } else {
5672                endinpos = s-starts;
5673                p = PyUnicode_AS_UNICODE(v) + i;
5674                if (unicode_decode_call_errorhandler(
5675                        errors, &errorHandler,
5676                        "unicodeescape", "illegal Unicode character",
5677                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5678                        &v, &i, &p))
5679                    goto onError;
5680                data = PyUnicode_AS_UNICODE(v);
5681            }
5682            break;
5683
5684            /* \N{name} */
5685        case 'N':
5686            message = "malformed \\N character escape";
5687            if (ucnhash_CAPI == NULL) {
5688                /* load the unicode data module */
5689                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5690                                                PyUnicodeData_CAPSULE_NAME, 1);
5691                if (ucnhash_CAPI == NULL)
5692                    goto ucnhashError;
5693            }
5694            if (*s == '{') {
5695                const char *start = s+1;
5696                /* look for the closing brace */
5697                while (*s != '}' && s < end)
5698                    s++;
5699                if (s > start && s < end && *s == '}') {
5700                    /* found a name.  look it up in the unicode database */
5701                    message = "unknown Unicode character name";
5702                    s++;
5703                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5704                                              &chr))
5705                        goto store;
5706                }
5707            }
5708            endinpos = s-starts;
5709            p = PyUnicode_AS_UNICODE(v) + i;
5710            if (unicode_decode_call_errorhandler(
5711                    errors, &errorHandler,
5712                    "unicodeescape", message,
5713                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5714                    &v, &i, &p))
5715                goto onError;
5716            data = PyUnicode_AS_UNICODE(v);
5717            break;
5718
5719        default:
5720            if (s > end) {
5721                assert(kind == PyUnicode_WCHAR_KIND);
5722                message = "\\ at end of string";
5723                s--;
5724                endinpos = s-starts;
5725                p = PyUnicode_AS_UNICODE(v) + i;
5726                if (unicode_decode_call_errorhandler(
5727                        errors, &errorHandler,
5728                        "unicodeescape", message,
5729                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5730                        &v, &i, &p))
5731                    goto onError;
5732                data = PyUnicode_AS_UNICODE(v);
5733            }
5734            else {
5735                WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5736                WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
5737            }
5738            break;
5739        }
5740      nextByte:
5741        ;
5742    }
5743    /* Ensure the length prediction worked in case of ASCII strings */
5744    assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5745
5746    if (kind == PyUnicode_WCHAR_KIND)
5747    {
5748        if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5749            goto onError;
5750    }
5751    Py_XDECREF(errorHandler);
5752    Py_XDECREF(exc);
5753#ifndef DONT_MAKE_RESULT_READY
5754    if (_PyUnicode_READY_REPLACE(&v)) {
5755        Py_DECREF(v);
5756        return NULL;
5757    }
5758#endif
5759    assert(_PyUnicode_CheckConsistency(v, 1));
5760    return (PyObject *)v;
5761
5762  ucnhashError:
5763    PyErr_SetString(
5764        PyExc_UnicodeError,
5765        "\\N escapes not supported (can't load unicodedata module)"
5766        );
5767    Py_XDECREF(v);
5768    Py_XDECREF(errorHandler);
5769    Py_XDECREF(exc);
5770    return NULL;
5771
5772  onError:
5773    Py_XDECREF(v);
5774    Py_XDECREF(errorHandler);
5775    Py_XDECREF(exc);
5776    return NULL;
5777}
5778
5779#undef WRITE_ASCII_OR_WSTR
5780#undef WRITE_WSTR
5781
5782/* Return a Unicode-Escape string version of the Unicode object.
5783
5784   If quotes is true, the string is enclosed in u"" or u'' quotes as
5785   appropriate.
5786
5787*/
5788
5789static const char *hexdigits = "0123456789abcdef";
5790
5791PyObject *
5792PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5793                              Py_ssize_t size)
5794{
5795    PyObject *repr;
5796    char *p;
5797
5798#ifdef Py_UNICODE_WIDE
5799    const Py_ssize_t expandsize = 10;
5800#else
5801    const Py_ssize_t expandsize = 6;
5802#endif
5803
5804    /* XXX(nnorwitz): rather than over-allocating, it would be
5805       better to choose a different scheme.  Perhaps scan the
5806       first N-chars of the string and allocate based on that size.
5807    */
5808    /* Initial allocation is based on the longest-possible unichr
5809       escape.
5810
5811       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5812       unichr, so in this case it's the longest unichr escape. In
5813       narrow (UTF-16) builds this is five chars per source unichr
5814       since there are two unichrs in the surrogate pair, so in narrow
5815       (UTF-16) builds it's not the longest unichr escape.
5816
5817       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5818       so in the narrow (UTF-16) build case it's the longest unichr
5819       escape.
5820    */
5821
5822    if (size == 0)
5823        return PyBytes_FromStringAndSize(NULL, 0);
5824
5825    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5826        return PyErr_NoMemory();
5827
5828    repr = PyBytes_FromStringAndSize(NULL,
5829                                     2
5830                                     + expandsize*size
5831                                     + 1);
5832    if (repr == NULL)
5833        return NULL;
5834
5835    p = PyBytes_AS_STRING(repr);
5836
5837    while (size-- > 0) {
5838        Py_UNICODE ch = *s++;
5839
5840        /* Escape backslashes */
5841        if (ch == '\\') {
5842            *p++ = '\\';
5843            *p++ = (char) ch;
5844            continue;
5845        }
5846
5847#ifdef Py_UNICODE_WIDE
5848        /* Map 21-bit characters to '\U00xxxxxx' */
5849        else if (ch >= 0x10000) {
5850            *p++ = '\\';
5851            *p++ = 'U';
5852            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5853            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5854            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5855            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5856            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5857            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5858            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5859            *p++ = hexdigits[ch & 0x0000000F];
5860            continue;
5861        }
5862#else
5863        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5864        else if (ch >= 0xD800 && ch < 0xDC00) {
5865            Py_UNICODE ch2;
5866            Py_UCS4 ucs;
5867
5868            ch2 = *s++;
5869            size--;
5870            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5871                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5872                *p++ = '\\';
5873                *p++ = 'U';
5874                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5875                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5876                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5877                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5878                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5879                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5880                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5881                *p++ = hexdigits[ucs & 0x0000000F];
5882                continue;
5883            }
5884            /* Fall through: isolated surrogates are copied as-is */
5885            s--;
5886            size++;
5887        }
5888#endif
5889
5890        /* Map 16-bit characters to '\uxxxx' */
5891        if (ch >= 256) {
5892            *p++ = '\\';
5893            *p++ = 'u';
5894            *p++ = hexdigits[(ch >> 12) & 0x000F];
5895            *p++ = hexdigits[(ch >> 8) & 0x000F];
5896            *p++ = hexdigits[(ch >> 4) & 0x000F];
5897            *p++ = hexdigits[ch & 0x000F];
5898        }
5899
5900        /* Map special whitespace to '\t', \n', '\r' */
5901        else if (ch == '\t') {
5902            *p++ = '\\';
5903            *p++ = 't';
5904        }
5905        else if (ch == '\n') {
5906            *p++ = '\\';
5907            *p++ = 'n';
5908        }
5909        else if (ch == '\r') {
5910            *p++ = '\\';
5911            *p++ = 'r';
5912        }
5913
5914        /* Map non-printable US ASCII to '\xhh' */
5915        else if (ch < ' ' || ch >= 0x7F) {
5916            *p++ = '\\';
5917            *p++ = 'x';
5918            *p++ = hexdigits[(ch >> 4) & 0x000F];
5919            *p++ = hexdigits[ch & 0x000F];
5920        }
5921
5922        /* Copy everything else as-is */
5923        else
5924            *p++ = (char) ch;
5925    }
5926
5927    assert(p - PyBytes_AS_STRING(repr) > 0);
5928    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5929        return NULL;
5930    return repr;
5931}
5932
5933PyObject *
5934PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5935{
5936    PyObject *s;
5937    if (!PyUnicode_Check(unicode)) {
5938        PyErr_BadArgument();
5939        return NULL;
5940    }
5941    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5942                                      PyUnicode_GET_SIZE(unicode));
5943    return s;
5944}
5945
5946/* --- Raw Unicode Escape Codec ------------------------------------------- */
5947
5948PyObject *
5949PyUnicode_DecodeRawUnicodeEscape(const char *s,
5950                                 Py_ssize_t size,
5951                                 const char *errors)
5952{
5953    const char *starts = s;
5954    Py_ssize_t startinpos;
5955    Py_ssize_t endinpos;
5956    Py_ssize_t outpos;
5957    PyUnicodeObject *v;
5958    Py_UNICODE *p;
5959    const char *end;
5960    const char *bs;
5961    PyObject *errorHandler = NULL;
5962    PyObject *exc = NULL;
5963
5964    /* Escaped strings will always be longer than the resulting
5965       Unicode string, so we start with size here and then reduce the
5966       length after conversion to the true value. (But decoding error
5967       handler might have to resize the string) */
5968    v = _PyUnicode_New(size);
5969    if (v == NULL)
5970        goto onError;
5971    if (size == 0)
5972        return (PyObject *)v;
5973    p = PyUnicode_AS_UNICODE(v);
5974    end = s + size;
5975    while (s < end) {
5976        unsigned char c;
5977        Py_UCS4 x;
5978        int i;
5979        int count;
5980
5981        /* Non-escape characters are interpreted as Unicode ordinals */
5982        if (*s != '\\') {
5983            *p++ = (unsigned char)*s++;
5984            continue;
5985        }
5986        startinpos = s-starts;
5987
5988        /* \u-escapes are only interpreted iff the number of leading
5989           backslashes if odd */
5990        bs = s;
5991        for (;s < end;) {
5992            if (*s != '\\')
5993                break;
5994            *p++ = (unsigned char)*s++;
5995        }
5996        if (((s - bs) & 1) == 0 ||
5997            s >= end ||
5998            (*s != 'u' && *s != 'U')) {
5999            continue;
6000        }
6001        p--;
6002        count = *s=='u' ? 4 : 8;
6003        s++;
6004
6005        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6006        outpos = p-PyUnicode_AS_UNICODE(v);
6007        for (x = 0, i = 0; i < count; ++i, ++s) {
6008            c = (unsigned char)*s;
6009            if (!Py_ISXDIGIT(c)) {
6010                endinpos = s-starts;
6011                if (unicode_decode_call_errorhandler(
6012                        errors, &errorHandler,
6013                        "rawunicodeescape", "truncated \\uXXXX",
6014                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6015                        &v, &outpos, &p))
6016                    goto onError;
6017                goto nextByte;
6018            }
6019            x = (x<<4) & ~0xF;
6020            if (c >= '0' && c <= '9')
6021                x += c - '0';
6022            else if (c >= 'a' && c <= 'f')
6023                x += 10 + c - 'a';
6024            else
6025                x += 10 + c - 'A';
6026        }
6027        if (x <= 0xffff)
6028            /* UCS-2 character */
6029            *p++ = (Py_UNICODE) x;
6030        else if (x <= 0x10ffff) {
6031            /* UCS-4 character. Either store directly, or as
6032               surrogate pair. */
6033#ifdef Py_UNICODE_WIDE
6034            *p++ = (Py_UNICODE) x;
6035#else
6036            x -= 0x10000L;
6037            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6038            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
6039#endif
6040        } else {
6041            endinpos = s-starts;
6042            outpos = p-PyUnicode_AS_UNICODE(v);
6043            if (unicode_decode_call_errorhandler(
6044                    errors, &errorHandler,
6045                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6046                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6047                    &v, &outpos, &p))
6048                goto onError;
6049        }
6050      nextByte:
6051        ;
6052    }
6053    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6054        goto onError;
6055    Py_XDECREF(errorHandler);
6056    Py_XDECREF(exc);
6057#ifndef DONT_MAKE_RESULT_READY
6058    if (_PyUnicode_READY_REPLACE(&v)) {
6059        Py_DECREF(v);
6060        return NULL;
6061    }
6062#endif
6063    assert(_PyUnicode_CheckConsistency(v, 1));
6064    return (PyObject *)v;
6065
6066  onError:
6067    Py_XDECREF(v);
6068    Py_XDECREF(errorHandler);
6069    Py_XDECREF(exc);
6070    return NULL;
6071}
6072
6073PyObject *
6074PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6075                                 Py_ssize_t size)
6076{
6077    PyObject *repr;
6078    char *p;
6079    char *q;
6080
6081#ifdef Py_UNICODE_WIDE
6082    const Py_ssize_t expandsize = 10;
6083#else
6084    const Py_ssize_t expandsize = 6;
6085#endif
6086
6087    if (size > PY_SSIZE_T_MAX / expandsize)
6088        return PyErr_NoMemory();
6089
6090    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
6091    if (repr == NULL)
6092        return NULL;
6093    if (size == 0)
6094        return repr;
6095
6096    p = q = PyBytes_AS_STRING(repr);
6097    while (size-- > 0) {
6098        Py_UNICODE ch = *s++;
6099#ifdef Py_UNICODE_WIDE
6100        /* Map 32-bit characters to '\Uxxxxxxxx' */
6101        if (ch >= 0x10000) {
6102            *p++ = '\\';
6103            *p++ = 'U';
6104            *p++ = hexdigits[(ch >> 28) & 0xf];
6105            *p++ = hexdigits[(ch >> 24) & 0xf];
6106            *p++ = hexdigits[(ch >> 20) & 0xf];
6107            *p++ = hexdigits[(ch >> 16) & 0xf];
6108            *p++ = hexdigits[(ch >> 12) & 0xf];
6109            *p++ = hexdigits[(ch >> 8) & 0xf];
6110            *p++ = hexdigits[(ch >> 4) & 0xf];
6111            *p++ = hexdigits[ch & 15];
6112        }
6113        else
6114#else
6115            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6116            if (ch >= 0xD800 && ch < 0xDC00) {
6117                Py_UNICODE ch2;
6118                Py_UCS4 ucs;
6119
6120                ch2 = *s++;
6121                size--;
6122                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6123                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6124                    *p++ = '\\';
6125                    *p++ = 'U';
6126                    *p++ = hexdigits[(ucs >> 28) & 0xf];
6127                    *p++ = hexdigits[(ucs >> 24) & 0xf];
6128                    *p++ = hexdigits[(ucs >> 20) & 0xf];
6129                    *p++ = hexdigits[(ucs >> 16) & 0xf];
6130                    *p++ = hexdigits[(ucs >> 12) & 0xf];
6131                    *p++ = hexdigits[(ucs >> 8) & 0xf];
6132                    *p++ = hexdigits[(ucs >> 4) & 0xf];
6133                    *p++ = hexdigits[ucs & 0xf];
6134                    continue;
6135                }
6136                /* Fall through: isolated surrogates are copied as-is */
6137                s--;
6138                size++;
6139            }
6140#endif
6141        /* Map 16-bit characters to '\uxxxx' */
6142        if (ch >= 256) {
6143            *p++ = '\\';
6144            *p++ = 'u';
6145            *p++ = hexdigits[(ch >> 12) & 0xf];
6146            *p++ = hexdigits[(ch >> 8) & 0xf];
6147            *p++ = hexdigits[(ch >> 4) & 0xf];
6148            *p++ = hexdigits[ch & 15];
6149        }
6150        /* Copy everything else as-is */
6151        else
6152            *p++ = (char) ch;
6153    }
6154    size = p - q;
6155
6156    assert(size > 0);
6157    if (_PyBytes_Resize(&repr, size) < 0)
6158        return NULL;
6159    return repr;
6160}
6161
6162PyObject *
6163PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6164{
6165    PyObject *s;
6166    if (!PyUnicode_Check(unicode)) {
6167        PyErr_BadArgument();
6168        return NULL;
6169    }
6170    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6171                                         PyUnicode_GET_SIZE(unicode));
6172
6173    return s;
6174}
6175
6176/* --- Unicode Internal Codec ------------------------------------------- */
6177
6178PyObject *
6179_PyUnicode_DecodeUnicodeInternal(const char *s,
6180                                 Py_ssize_t size,
6181                                 const char *errors)
6182{
6183    const char *starts = s;
6184    Py_ssize_t startinpos;
6185    Py_ssize_t endinpos;
6186    Py_ssize_t outpos;
6187    PyUnicodeObject *v;
6188    Py_UNICODE *p;
6189    const char *end;
6190    const char *reason;
6191    PyObject *errorHandler = NULL;
6192    PyObject *exc = NULL;
6193
6194#ifdef Py_UNICODE_WIDE
6195    Py_UNICODE unimax = PyUnicode_GetMax();
6196#endif
6197
6198    /* XXX overflow detection missing */
6199    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6200    if (v == NULL)
6201        goto onError;
6202    /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6203       as string was created with the old API. */
6204    if (PyUnicode_GET_SIZE(v) == 0)
6205        return (PyObject *)v;
6206    p = PyUnicode_AS_UNICODE(v);
6207    end = s + size;
6208
6209    while (s < end) {
6210        memcpy(p, s, sizeof(Py_UNICODE));
6211        /* We have to sanity check the raw data, otherwise doom looms for
6212           some malformed UCS-4 data. */
6213        if (
6214#ifdef Py_UNICODE_WIDE
6215            *p > unimax || *p < 0 ||
6216#endif
6217            end-s < Py_UNICODE_SIZE
6218            )
6219        {
6220            startinpos = s - starts;
6221            if (end-s < Py_UNICODE_SIZE) {
6222                endinpos = end-starts;
6223                reason = "truncated input";
6224            }
6225            else {
6226                endinpos = s - starts + Py_UNICODE_SIZE;
6227                reason = "illegal code point (> 0x10FFFF)";
6228            }
6229            outpos = p - PyUnicode_AS_UNICODE(v);
6230            if (unicode_decode_call_errorhandler(
6231                    errors, &errorHandler,
6232                    "unicode_internal", reason,
6233                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6234                    &v, &outpos, &p)) {
6235                goto onError;
6236            }
6237        }
6238        else {
6239            p++;
6240            s += Py_UNICODE_SIZE;
6241        }
6242    }
6243
6244    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6245        goto onError;
6246    Py_XDECREF(errorHandler);
6247    Py_XDECREF(exc);
6248#ifndef DONT_MAKE_RESULT_READY
6249    if (_PyUnicode_READY_REPLACE(&v)) {
6250        Py_DECREF(v);
6251        return NULL;
6252    }
6253#endif
6254    assert(_PyUnicode_CheckConsistency(v, 1));
6255    return (PyObject *)v;
6256
6257  onError:
6258    Py_XDECREF(v);
6259    Py_XDECREF(errorHandler);
6260    Py_XDECREF(exc);
6261    return NULL;
6262}
6263
6264/* --- Latin-1 Codec ------------------------------------------------------ */
6265
6266PyObject *
6267PyUnicode_DecodeLatin1(const char *s,
6268                       Py_ssize_t size,
6269                       const char *errors)
6270{
6271    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6272    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6273}
6274
6275/* create or adjust a UnicodeEncodeError */
6276static void
6277make_encode_exception(PyObject **exceptionObject,
6278                      const char *encoding,
6279                      const Py_UNICODE *unicode, Py_ssize_t size,
6280                      Py_ssize_t startpos, Py_ssize_t endpos,
6281                      const char *reason)
6282{
6283    if (*exceptionObject == NULL) {
6284        *exceptionObject = PyUnicodeEncodeError_Create(
6285            encoding, unicode, size, startpos, endpos, reason);
6286    }
6287    else {
6288        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6289            goto onError;
6290        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6291            goto onError;
6292        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6293            goto onError;
6294        return;
6295      onError:
6296        Py_DECREF(*exceptionObject);
6297        *exceptionObject = NULL;
6298    }
6299}
6300
6301/* raises a UnicodeEncodeError */
6302static void
6303raise_encode_exception(PyObject **exceptionObject,
6304                       const char *encoding,
6305                       const Py_UNICODE *unicode, Py_ssize_t size,
6306                       Py_ssize_t startpos, Py_ssize_t endpos,
6307                       const char *reason)
6308{
6309    make_encode_exception(exceptionObject,
6310                          encoding, unicode, size, startpos, endpos, reason);
6311    if (*exceptionObject != NULL)
6312        PyCodec_StrictErrors(*exceptionObject);
6313}
6314
6315/* error handling callback helper:
6316   build arguments, call the callback and check the arguments,
6317   put the result into newpos and return the replacement string, which
6318   has to be freed by the caller */
6319static PyObject *
6320unicode_encode_call_errorhandler(const char *errors,
6321                                 PyObject **errorHandler,
6322                                 const char *encoding, const char *reason,
6323                                 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6324                                 Py_ssize_t startpos, Py_ssize_t endpos,
6325                                 Py_ssize_t *newpos)
6326{
6327    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6328
6329    PyObject *restuple;
6330    PyObject *resunicode;
6331
6332    if (*errorHandler == NULL) {
6333        *errorHandler = PyCodec_LookupError(errors);
6334        if (*errorHandler == NULL)
6335            return NULL;
6336    }
6337
6338    make_encode_exception(exceptionObject,
6339                          encoding, unicode, size, startpos, endpos, reason);
6340    if (*exceptionObject == NULL)
6341        return NULL;
6342
6343    restuple = PyObject_CallFunctionObjArgs(
6344        *errorHandler, *exceptionObject, NULL);
6345    if (restuple == NULL)
6346        return NULL;
6347    if (!PyTuple_Check(restuple)) {
6348        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6349        Py_DECREF(restuple);
6350        return NULL;
6351    }
6352    if (!PyArg_ParseTuple(restuple, argparse,
6353                          &resunicode, newpos)) {
6354        Py_DECREF(restuple);
6355        return NULL;
6356    }
6357    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6358        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6359        Py_DECREF(restuple);
6360        return NULL;
6361    }
6362    if (*newpos<0)
6363        *newpos = size+*newpos;
6364    if (*newpos<0 || *newpos>size) {
6365        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6366        Py_DECREF(restuple);
6367        return NULL;
6368    }
6369    Py_INCREF(resunicode);
6370    Py_DECREF(restuple);
6371    return resunicode;
6372}
6373
6374static PyObject *
6375unicode_encode_ucs1(const Py_UNICODE *p,
6376                    Py_ssize_t size,
6377                    const char *errors,
6378                    int limit)
6379{
6380    /* output object */
6381    PyObject *res;
6382    /* pointers to the beginning and end+1 of input */
6383    const Py_UNICODE *startp = p;
6384    const Py_UNICODE *endp = p + size;
6385    /* pointer to the beginning of the unencodable characters */
6386    /* const Py_UNICODE *badp = NULL; */
6387    /* pointer into the output */
6388    char *str;
6389    /* current output position */
6390    Py_ssize_t ressize;
6391    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6392    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6393    PyObject *errorHandler = NULL;
6394    PyObject *exc = NULL;
6395    /* the following variable is used for caching string comparisons
6396     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6397    int known_errorHandler = -1;
6398
6399    /* allocate enough for a simple encoding without
6400       replacements, if we need more, we'll resize */
6401    if (size == 0)
6402        return PyBytes_FromStringAndSize(NULL, 0);
6403    res = PyBytes_FromStringAndSize(NULL, size);
6404    if (res == NULL)
6405        return NULL;
6406    str = PyBytes_AS_STRING(res);
6407    ressize = size;
6408
6409    while (p<endp) {
6410        Py_UNICODE c = *p;
6411
6412        /* can we encode this? */
6413        if (c<limit) {
6414            /* no overflow check, because we know that the space is enough */
6415            *str++ = (char)c;
6416            ++p;
6417        }
6418        else {
6419            Py_ssize_t unicodepos = p-startp;
6420            Py_ssize_t requiredsize;
6421            PyObject *repunicode;
6422            Py_ssize_t repsize;
6423            Py_ssize_t newpos;
6424            Py_ssize_t respos;
6425            Py_UNICODE *uni2;
6426            /* startpos for collecting unencodable chars */
6427            const Py_UNICODE *collstart = p;
6428            const Py_UNICODE *collend = p;
6429            /* find all unecodable characters */
6430            while ((collend < endp) && ((*collend)>=limit))
6431                ++collend;
6432            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6433            if (known_errorHandler==-1) {
6434                if ((errors==NULL) || (!strcmp(errors, "strict")))
6435                    known_errorHandler = 1;
6436                else if (!strcmp(errors, "replace"))
6437                    known_errorHandler = 2;
6438                else if (!strcmp(errors, "ignore"))
6439                    known_errorHandler = 3;
6440                else if (!strcmp(errors, "xmlcharrefreplace"))
6441                    known_errorHandler = 4;
6442                else
6443                    known_errorHandler = 0;
6444            }
6445            switch (known_errorHandler) {
6446            case 1: /* strict */
6447                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6448                goto onError;
6449            case 2: /* replace */
6450                while (collstart++<collend)
6451                    *str++ = '?'; /* fall through */
6452            case 3: /* ignore */
6453                p = collend;
6454                break;
6455            case 4: /* xmlcharrefreplace */
6456                respos = str - PyBytes_AS_STRING(res);
6457                /* determine replacement size (temporarily (mis)uses p) */
6458                for (p = collstart, repsize = 0; p < collend; ++p) {
6459                    if (*p<10)
6460                        repsize += 2+1+1;
6461                    else if (*p<100)
6462                        repsize += 2+2+1;
6463                    else if (*p<1000)
6464                        repsize += 2+3+1;
6465                    else if (*p<10000)
6466                        repsize += 2+4+1;
6467#ifndef Py_UNICODE_WIDE
6468                    else
6469                        repsize += 2+5+1;
6470#else
6471                    else if (*p<100000)
6472                        repsize += 2+5+1;
6473                    else if (*p<1000000)
6474                        repsize += 2+6+1;
6475                    else
6476                        repsize += 2+7+1;
6477#endif
6478                }
6479                requiredsize = respos+repsize+(endp-collend);
6480                if (requiredsize > ressize) {
6481                    if (requiredsize<2*ressize)
6482                        requiredsize = 2*ressize;
6483                    if (_PyBytes_Resize(&res, requiredsize))
6484                        goto onError;
6485                    str = PyBytes_AS_STRING(res) + respos;
6486                    ressize = requiredsize;
6487                }
6488                /* generate replacement (temporarily (mis)uses p) */
6489                for (p = collstart; p < collend; ++p) {
6490                    str += sprintf(str, "&#%d;", (int)*p);
6491                }
6492                p = collend;
6493                break;
6494            default:
6495                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6496                                                              encoding, reason, startp, size, &exc,
6497                                                              collstart-startp, collend-startp, &newpos);
6498                if (repunicode == NULL)
6499                    goto onError;
6500                if (PyBytes_Check(repunicode)) {
6501                    /* Directly copy bytes result to output. */
6502                    repsize = PyBytes_Size(repunicode);
6503                    if (repsize > 1) {
6504                        /* Make room for all additional bytes. */
6505                        respos = str - PyBytes_AS_STRING(res);
6506                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6507                            Py_DECREF(repunicode);
6508                            goto onError;
6509                        }
6510                        str = PyBytes_AS_STRING(res) + respos;
6511                        ressize += repsize-1;
6512                    }
6513                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6514                    str += repsize;
6515                    p = startp + newpos;
6516                    Py_DECREF(repunicode);
6517                    break;
6518                }
6519                /* need more space? (at least enough for what we
6520                   have+the replacement+the rest of the string, so
6521                   we won't have to check space for encodable characters) */
6522                respos = str - PyBytes_AS_STRING(res);
6523                repsize = PyUnicode_GET_SIZE(repunicode);
6524                requiredsize = respos+repsize+(endp-collend);
6525                if (requiredsize > ressize) {
6526                    if (requiredsize<2*ressize)
6527                        requiredsize = 2*ressize;
6528                    if (_PyBytes_Resize(&res, requiredsize)) {
6529                        Py_DECREF(repunicode);
6530                        goto onError;
6531                    }
6532                    str = PyBytes_AS_STRING(res) + respos;
6533                    ressize = requiredsize;
6534                }
6535                /* check if there is anything unencodable in the replacement
6536                   and copy it to the output */
6537                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6538                    c = *uni2;
6539                    if (c >= limit) {
6540                        raise_encode_exception(&exc, encoding, startp, size,
6541                                               unicodepos, unicodepos+1, reason);
6542                        Py_DECREF(repunicode);
6543                        goto onError;
6544                    }
6545                    *str = (char)c;
6546                }
6547                p = startp + newpos;
6548                Py_DECREF(repunicode);
6549            }
6550        }
6551    }
6552    /* Resize if we allocated to much */
6553    size = str - PyBytes_AS_STRING(res);
6554    if (size < ressize) { /* If this falls res will be NULL */
6555        assert(size >= 0);
6556        if (_PyBytes_Resize(&res, size) < 0)
6557            goto onError;
6558    }
6559
6560    Py_XDECREF(errorHandler);
6561    Py_XDECREF(exc);
6562    return res;
6563
6564  onError:
6565    Py_XDECREF(res);
6566    Py_XDECREF(errorHandler);
6567    Py_XDECREF(exc);
6568    return NULL;
6569}
6570
6571PyObject *
6572PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6573                       Py_ssize_t size,
6574                       const char *errors)
6575{
6576    return unicode_encode_ucs1(p, size, errors, 256);
6577}
6578
6579PyObject *
6580_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6581{
6582    if (!PyUnicode_Check(unicode)) {
6583        PyErr_BadArgument();
6584        return NULL;
6585    }
6586    if (PyUnicode_READY(unicode) == -1)
6587        return NULL;
6588    /* Fast path: if it is a one-byte string, construct
6589       bytes object directly. */
6590    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6591        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6592                                         PyUnicode_GET_LENGTH(unicode));
6593    /* Non-Latin-1 characters present. Defer to above function to
6594       raise the exception. */
6595    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
6596                                  PyUnicode_GET_SIZE(unicode),
6597                                  errors);
6598}
6599
6600PyObject*
6601PyUnicode_AsLatin1String(PyObject *unicode)
6602{
6603    return _PyUnicode_AsLatin1String(unicode, NULL);
6604}
6605
6606/* --- 7-bit ASCII Codec -------------------------------------------------- */
6607
/* Decode `size` bytes starting at `s` as 7-bit ASCII.

   The input is first scanned for bytes with the high bit set.  If there
   are none, the whole input is handed to unicode_fromascii().  Otherwise
   the input is decoded byte by byte and every byte >= 128 is reported to
   the error handler named by `errors` through
   unicode_decode_call_errorhandler().

   Returns the decoded object, or NULL on failure. */
6608PyObject *
6609PyUnicode_DecodeASCII(const char *s,
6610                      Py_ssize_t size,
6611                      const char *errors)
6612{
6613    const char *starts = s;
6614    PyUnicodeObject *v;
6615    Py_UNICODE *u;
6616    Py_ssize_t startinpos;
6617    Py_ssize_t endinpos;
6618    Py_ssize_t outpos;
6619    const char *e;
6620    int has_error;
6621    const unsigned char *p = (const unsigned char *)s;
6622    const unsigned char *end = p + size;
    /* `end` with the LONG_PTR_MASK bits cleared: upper bound for the
       long-at-a-time reads in the scan below */
6623    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
6624    PyObject *errorHandler = NULL;
6625    PyObject *exc = NULL;
6626
6627    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    /* a single ASCII byte is served by get_latin1_char() */
6628    if (size == 1 && (unsigned char)s[0] < 128)
6629        return get_latin1_char((unsigned char)s[0]);
6630
    /* Scan pass: set has_error as soon as a non-ASCII byte is seen. */
6631    has_error = 0;
6632    while (p < end && !has_error) {
6633        /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6634           an explanation. */
        /* only taken when p has none of the LONG_PTR_MASK bits set */
6635        if (!((size_t) p & LONG_PTR_MASK)) {
6636            /* Help register allocation */
6637            register const unsigned char *_p = p;
6638            while (_p < aligned_end) {
6639                unsigned long value = *(unsigned long *) _p;
                /* NOTE(review): ASCII_CHAR_MASK is defined elsewhere;
                   presumably it selects the high bit of every byte in
                   the long - confirm */
6640                if (value & ASCII_CHAR_MASK) {
6641                    has_error = 1;
6642                    break;
6643                }
6644                _p += SIZEOF_LONG;
6645            }
6646            if (_p == end)
6647                break;
6648            if (has_error)
6649                break;
6650            p = _p;
6651        }
        /* byte-at-a-time check for positions the fast path did not cover */
6652        if (*p & 0x80) {
6653            has_error = 1;
6654            break;
6655        }
6656        else {
6657            ++p;
6658        }
6659    }
    /* no byte had the high bit set: hand the whole input to unicode_fromascii() */
6660    if (!has_error)
6661        return unicode_fromascii((const unsigned char *)s, size);
6662
    /* Slow path: start with one output code unit per input byte. */
6663    v = _PyUnicode_New(size);
6664    if (v == NULL)
6665        goto onError;
6666    if (size == 0)
6667        return (PyObject *)v;
6668    u = PyUnicode_AS_UNICODE(v);
6669    e = s + size;
6670    while (s < e) {
6671        register unsigned char c = (unsigned char)*s;
6672        if (c < 128) {
6673            *u++ = c;
6674            ++s;
6675        }
6676        else {
            /* report the single offending byte; the helper receives
               pointers to the input and output cursors (s, e, v, u),
               so it is presumably the one advancing them - see its
               definition */
6677            startinpos = s-starts;
6678            endinpos = startinpos + 1;
6679            outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6680            if (unicode_decode_call_errorhandler(
6681                    errors, &errorHandler,
6682                    "ascii", "ordinal not in range(128)",
6683                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6684                    &v, &outpos, &u))
6685                goto onError;
6686        }
6687    }
    /* shrink the result if fewer code units were written than allocated */
6688    if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6689        if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
6690            goto onError;
6691    Py_XDECREF(errorHandler);
6692    Py_XDECREF(exc);
6693#ifndef DONT_MAKE_RESULT_READY
6694    if (_PyUnicode_READY_REPLACE(&v)) {
6695        Py_DECREF(v);
6696        return NULL;
6697    }
6698#endif
6699    assert(_PyUnicode_CheckConsistency(v, 1));
6700    return (PyObject *)v;
6701
6702  onError:
6703    Py_XDECREF(v);
6704    Py_XDECREF(errorHandler);
6705    Py_XDECREF(exc);
6706    return NULL;
6707}
6708
6709PyObject *
6710PyUnicode_EncodeASCII(const Py_UNICODE *p,
6711                      Py_ssize_t size,
6712                      const char *errors)
6713{
6714    return unicode_encode_ucs1(p, size, errors, 128);
6715}
6716
6717PyObject *
6718_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6719{
6720    if (!PyUnicode_Check(unicode)) {
6721        PyErr_BadArgument();
6722        return NULL;
6723    }
6724    if (PyUnicode_READY(unicode) == -1)
6725        return NULL;
6726    /* Fast path: if it is an ASCII-only string, construct bytes object
6727       directly. Else defer to above function to raise the exception. */
6728    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6729        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6730                                         PyUnicode_GET_LENGTH(unicode));
6731    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
6732                                 PyUnicode_GET_SIZE(unicode),
6733                                 errors);
6734}
6735
6736PyObject *
6737PyUnicode_AsASCIIString(PyObject *unicode)
6738{
6739    return _PyUnicode_AsASCIIString(unicode, NULL);
6740}
6741
6742#ifdef HAVE_MBCS
6743
6744/* --- MBCS codecs for Windows -------------------------------------------- */
6745
6746#if SIZEOF_INT < SIZEOF_SIZE_T
6747#define NEED_RETRY
6748#endif
6749
6750/* XXX This code is limited to "true" double-byte encodings, as
6751   a) it assumes an incomplete character consists of a single byte, and
6752   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
6753   encodings, see IsDBCSLeadByteEx documentation. */
6754
/* Return non-zero if the byte at s[offset] acts as a DBCS lead byte.

   IsDBCSLeadByte() alone is not trusted: once it accepts the byte, the
   previous character (as located by CharPrev()) is examined as well.
   The byte counts as a lead byte when there is no previous character,
   when the previous character does not start with a lead byte, or when
   the previous character is exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6766
6767/*
6768 * Decode MBCS string into unicode object. If 'final' is set, converts
6769 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6770 */
6771static int
6772decode_mbcs(PyUnicodeObject **v,
6773            const char *s, /* MBCS string */
6774            int size, /* sizeof MBCS string */
6775            int final,
6776            const char *errors)
6777{
6778    Py_UNICODE *p;
6779    Py_ssize_t n;
6780    DWORD usize;
6781    DWORD flags;
6782
6783    assert(size >= 0);
6784
6785    /* check and handle 'errors' arg */
6786    if (errors==NULL || strcmp(errors, "strict")==0)
6787        flags = MB_ERR_INVALID_CHARS;
6788    else if (strcmp(errors, "ignore")==0)
6789        flags = 0;
6790    else {
6791        PyErr_Format(PyExc_ValueError,
6792                     "mbcs encoding does not support errors='%s'",
6793                     errors);
6794        return -1;
6795    }
6796
6797    /* Skip trailing lead-byte unless 'final' is set */
6798    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
6799        --size;
6800
6801    /* First get the size of the result */
6802    if (size > 0) {
6803        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6804        if (usize==0)
6805            goto mbcs_decode_error;
6806    } else
6807        usize = 0;
6808
6809    if (*v == NULL) {
6810        /* Create unicode object */
6811        *v = _PyUnicode_New(usize);
6812        if (*v == NULL)
6813            return -1;
6814        n = 0;
6815    }
6816    else {
6817        /* Extend unicode object */
6818        n = PyUnicode_GET_SIZE(*v);
6819        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
6820            return -1;
6821    }
6822
6823    /* Do the conversion */
6824    if (usize > 0) {
6825        p = PyUnicode_AS_UNICODE(*v) + n;
6826        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6827            goto mbcs_decode_error;
6828        }
6829    }
6830    return size;
6831
6832mbcs_decode_error:
6833    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6834       we raise a UnicodeDecodeError - else it is a 'generic'
6835       windows error
6836     */
6837    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6838        /* Ideally, we should get reason from FormatMessage - this
6839           is the Windows 2000 English version of the message
6840        */
6841        PyObject *exc = NULL;
6842        const char *reason = "No mapping for the Unicode character exists "
6843                             "in the target multi-byte code page.";
6844        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6845        if (exc != NULL) {
6846            PyCodec_StrictErrors(exc);
6847            Py_DECREF(exc);
6848        }
6849    } else {
6850        PyErr_SetFromWindowsErrWithFilename(0, NULL);
6851    }
6852    return -1;
6853}
6854
6855PyObject *
6856PyUnicode_DecodeMBCSStateful(const char *s,
6857                             Py_ssize_t size,
6858                             const char *errors,
6859                             Py_ssize_t *consumed)
6860{
6861    PyUnicodeObject *v = NULL;
6862    int done;
6863
6864    if (consumed)
6865        *consumed = 0;
6866
6867#ifdef NEED_RETRY
6868  retry:
6869    if (size > INT_MAX)
6870        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
6871    else
6872#endif
6873        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
6874
6875    if (done < 0) {
6876        Py_XDECREF(v);
6877        return NULL;
6878    }
6879
6880    if (consumed)
6881        *consumed += done;
6882
6883#ifdef NEED_RETRY
6884    if (size > INT_MAX) {
6885        s += done;
6886        size -= done;
6887        goto retry;
6888    }
6889#endif
6890#ifndef DONT_MAKE_RESULT_READY
6891    if (_PyUnicode_READY_REPLACE(&v)) {
6892        Py_DECREF(v);
6893        return NULL;
6894    }
6895#endif
6896    assert(_PyUnicode_CheckConsistency(v, 1));
6897    return (PyObject *)v;
6898}
6899
6900PyObject *
6901PyUnicode_DecodeMBCS(const char *s,
6902                     Py_ssize_t size,
6903                     const char *errors)
6904{
6905    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6906}
6907
6908/*
6909 * Convert unicode into string object (MBCS).
6910 * Returns 0 if succeed, -1 otherwise.
6911 */
6912static int
6913encode_mbcs(PyObject **repr,
6914            const Py_UNICODE *p, /* unicode */
6915            int size, /* size of unicode */
6916            const char* errors)
6917{
6918    BOOL usedDefaultChar = FALSE;
6919    BOOL *pusedDefaultChar;
6920    int mbcssize;
6921    Py_ssize_t n;
6922    PyObject *exc = NULL;
6923    DWORD flags;
6924
6925    assert(size >= 0);
6926
6927    /* check and handle 'errors' arg */
6928    if (errors==NULL || strcmp(errors, "strict")==0) {
6929        flags = WC_NO_BEST_FIT_CHARS;
6930        pusedDefaultChar = &usedDefaultChar;
6931    } else if (strcmp(errors, "replace")==0) {
6932        flags = 0;
6933        pusedDefaultChar = NULL;
6934    } else {
6935         PyErr_Format(PyExc_ValueError,
6936                      "mbcs encoding does not support errors='%s'",
6937                      errors);
6938         return -1;
6939    }
6940
6941    /* First get the size of the result */
6942    if (size > 0) {
6943        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6944                                       NULL, pusedDefaultChar);
6945        if (mbcssize == 0) {
6946            PyErr_SetFromWindowsErrWithFilename(0, NULL);
6947            return -1;
6948        }
6949        /* If we used a default char, then we failed! */
6950        if (pusedDefaultChar && *pusedDefaultChar)
6951            goto mbcs_encode_error;
6952    } else {
6953        mbcssize = 0;
6954    }
6955
6956    if (*repr == NULL) {
6957        /* Create string object */
6958        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6959        if (*repr == NULL)
6960            return -1;
6961        n = 0;
6962    }
6963    else {
6964        /* Extend string object */
6965        n = PyBytes_Size(*repr);
6966        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6967            return -1;
6968    }
6969
6970    /* Do the conversion */
6971    if (size > 0) {
6972        char *s = PyBytes_AS_STRING(*repr) + n;
6973        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6974                                     NULL, pusedDefaultChar)) {
6975            PyErr_SetFromWindowsErrWithFilename(0, NULL);
6976            return -1;
6977        }
6978        if (pusedDefaultChar && *pusedDefaultChar)
6979            goto mbcs_encode_error;
6980    }
6981    return 0;
6982
6983mbcs_encode_error:
6984    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6985    Py_XDECREF(exc);
6986    return -1;
6987}
6988
6989PyObject *
6990PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6991                     Py_ssize_t size,
6992                     const char *errors)
6993{
6994    PyObject *repr = NULL;
6995    int ret;
6996
6997#ifdef NEED_RETRY
6998  retry:
6999    if (size > INT_MAX)
7000        ret = encode_mbcs(&repr, p, INT_MAX, errors);
7001    else
7002#endif
7003        ret = encode_mbcs(&repr, p, (int)size, errors);
7004
7005    if (ret < 0) {
7006        Py_XDECREF(repr);
7007        return NULL;
7008    }
7009
7010#ifdef NEED_RETRY
7011    if (size > INT_MAX) {
7012        p += INT_MAX;
7013        size -= INT_MAX;
7014        goto retry;
7015    }
7016#endif
7017
7018    return repr;
7019}
7020
7021PyObject *
7022PyUnicode_AsMBCSString(PyObject *unicode)
7023{
7024    if (!PyUnicode_Check(unicode)) {
7025        PyErr_BadArgument();
7026        return NULL;
7027    }
7028    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
7029                                PyUnicode_GET_SIZE(unicode),
7030                                NULL);
7031}
7032
7033#undef NEED_RETRY
7034
7035#endif /* HAVE_MBCS */
7036
7037/* --- Character Mapping Codec -------------------------------------------- */
7038
/* Decode `size` bytes starting at `s` through the character mapping
   `mapping`.

   mapping == NULL   the input is decoded as Latin-1.
   exact str         the byte value indexes into the string; an index
                     past the end of the string or the value 0xfffe
                     means "undefined".
   anything else     looked up with mapping[byte] and may yield
                       - an int in range(65536): that code point
                       - None, or a LookupError from the lookup:
                         "undefined"
                       - a str: zero, one or several output characters
                     any other result type raises TypeError.

   Undefined mappings are reported to the error handler named by
   `errors` through unicode_decode_call_errorhandler().

   Returns the decoded object, or NULL on failure. */
7039PyObject *
7040PyUnicode_DecodeCharmap(const char *s,
7041                        Py_ssize_t size,
7042                        PyObject *mapping,
7043                        const char *errors)
7044{
7045    const char *starts = s;
7046    Py_ssize_t startinpos;
7047    Py_ssize_t endinpos;
7048    Py_ssize_t outpos;
7049    const char *e;
7050    PyUnicodeObject *v;
7051    Py_UNICODE *p;
    /* spare output slots left over from the last resize done for a
       1-n mapping */
7052    Py_ssize_t extrachars = 0;
7053    PyObject *errorHandler = NULL;
7054    PyObject *exc = NULL;
7055    Py_UNICODE *mapstring = NULL;
7056    Py_ssize_t maplen = 0;
7057
7058    /* Default to Latin-1 */
7059    if (mapping == NULL)
7060        return PyUnicode_DecodeLatin1(s, size, errors);
7061
    /* start with one output code unit per input byte */
7062    v = _PyUnicode_New(size);
7063    if (v == NULL)
7064        goto onError;
7065    if (size == 0)
7066        return (PyObject *)v;
7067    p = PyUnicode_AS_UNICODE(v);
7068    e = s + size;
    /* Case 1: the mapping is an exact str used as a lookup table */
7069    if (PyUnicode_CheckExact(mapping)) {
7070        mapstring = PyUnicode_AS_UNICODE(mapping);
7071        maplen = PyUnicode_GET_SIZE(mapping);
7072        while (s < e) {
7073            unsigned char ch = *s;
7074            Py_UNICODE x = 0xfffe; /* illegal value */
7075
            /* bytes beyond the end of the table keep the illegal value */
7076            if (ch < maplen)
7077                x = mapstring[ch];
7078
7079            if (x == 0xfffe) {
7080                /* undefined mapping */
7081                outpos = p-PyUnicode_AS_UNICODE(v);
7082                startinpos = s-starts;
7083                endinpos = startinpos+1;
                /* the helper receives pointers to the input and output
                   cursors (s, e, v, p); this branch does not advance s
                   itself, so the helper is presumably what moves it -
                   see its definition */
7084                if (unicode_decode_call_errorhandler(
7085                        errors, &errorHandler,
7086                        "charmap", "character maps to <undefined>",
7087                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7088                        &v, &outpos, &p)) {
7089                    goto onError;
7090                }
7091                continue;
7092            }
7093            *p++ = x;
7094            ++s;
7095        }
7096    }
    /* Case 2: arbitrary mapping object, queried through PyObject_GetItem() */
7097    else {
7098        while (s < e) {
7099            unsigned char ch = *s;
7100            PyObject *w, *x;
7101
7102            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7103            w = PyLong_FromLong((long)ch);
7104            if (w == NULL)
7105                goto onError;
7106            x = PyObject_GetItem(mapping, w);
7107            Py_DECREF(w);
7108            if (x == NULL) {
7109                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7110                    /* No mapping found means: mapping is undefined. */
7111                    PyErr_Clear();
7112                    x = Py_None;
7113                    Py_INCREF(x);
7114                } else
7115                    goto onError;
7116            }
7117
            /* from here on x is an owned reference that every branch
               below has to release */
7118            /* Apply mapping */
7119            if (PyLong_Check(x)) {
7120                long value = PyLong_AS_LONG(x);
7121                if (value < 0 || value > 65535) {
7122                    PyErr_SetString(PyExc_TypeError,
7123                                    "character mapping must be in range(65536)");
7124                    Py_DECREF(x);
7125                    goto onError;
7126                }
7127                *p++ = (Py_UNICODE)value;
7128            }
7129            else if (x == Py_None) {
7130                /* undefined mapping */
7131                outpos = p-PyUnicode_AS_UNICODE(v);
7132                startinpos = s-starts;
7133                endinpos = startinpos+1;
7134                if (unicode_decode_call_errorhandler(
7135                        errors, &errorHandler,
7136                        "charmap", "character maps to <undefined>",
7137                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7138                        &v, &outpos, &p)) {
7139                    Py_DECREF(x);
7140                    goto onError;
7141                }
7142                Py_DECREF(x);
7143                continue;
7144            }
7145            else if (PyUnicode_Check(x)) {
7146                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
7147
7148                if (targetsize == 1)
7149                    /* 1-1 mapping */
7150                    *p++ = *PyUnicode_AS_UNICODE(x);
7151
7152                else if (targetsize > 1) {
7153                    /* 1-n mapping */
7154                    if (targetsize > extrachars) {
7155                        /* resize first */
                        /* remember the write offset: p is recomputed
                           from it after the resize */
7156                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* grow by the shortfall plus four times the
                           replacement length */
7157                        Py_ssize_t needed = (targetsize - extrachars) + \
7158                            (targetsize << 2);
7159                        extrachars += needed;
7160                        /* XXX overflow detection missing */
7161                        if (PyUnicode_Resize((PyObject**)&v,
7162                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
7163                            Py_DECREF(x);
7164                            goto onError;
7165                        }
7166                        p = PyUnicode_AS_UNICODE(v) + oldpos;
7167                    }
7168                    Py_UNICODE_COPY(p,
7169                                    PyUnicode_AS_UNICODE(x),
7170                                    targetsize);
7171                    p += targetsize;
7172                    extrachars -= targetsize;
7173                }
7174                /* 1-0 mapping: skip the character */
7175            }
7176            else {
7177                /* wrong return value */
7178                PyErr_SetString(PyExc_TypeError,
7179                                "character mapping must return integer, None or str");
7180                Py_DECREF(x);
7181                goto onError;
7182            }
7183            Py_DECREF(x);
7184            ++s;
7185        }
7186    }
    /* shrink the result if fewer code units were written than allocated */
7187    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
7188        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
7189            goto onError;
7190    Py_XDECREF(errorHandler);
7191    Py_XDECREF(exc);
7192#ifndef DONT_MAKE_RESULT_READY
7193    if (_PyUnicode_READY_REPLACE(&v)) {
7194        Py_DECREF(v);
7195        return NULL;
7196    }
7197#endif
7198    assert(_PyUnicode_CheckConsistency(v, 1));
7199    return (PyObject *)v;
7200
7201  onError:
7202    Py_XDECREF(errorHandler);
7203    Py_XDECREF(exc);
7204    Py_XDECREF(v);
7205    return NULL;
7206}
7207
7208/* Charmap encoding: the lookup table */
7209
7210struct encoding_map {
7211    PyObject_HEAD
7212    unsigned char level1[32];
7213    int count2, count3;
7214    unsigned char level23[1];
7215};
7216
7217static PyObject*
7218encoding_map_size(PyObject *obj, PyObject* args)
7219{
7220    struct encoding_map *map = (struct encoding_map*)obj;
7221    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7222                           128*map->count3);
7223}
7224
7225static PyMethodDef encoding_map_methods[] = {
7226    {"size", encoding_map_size, METH_NOARGS,
7227     PyDoc_STR("Return the size (in bytes) of this object") },
7228    { 0 }
7229};
7230
7231static void
7232encoding_map_dealloc(PyObject* o)
7233{
7234    PyObject_FREE(o);
7235}
7236
7237static PyTypeObject EncodingMapType = {
7238    PyVarObject_HEAD_INIT(NULL, 0)
7239    "EncodingMap",          /*tp_name*/
7240    sizeof(struct encoding_map),   /*tp_basicsize*/
7241    0,                      /*tp_itemsize*/
7242    /* methods */
7243    encoding_map_dealloc,   /*tp_dealloc*/
7244    0,                      /*tp_print*/
7245    0,                      /*tp_getattr*/
7246    0,                      /*tp_setattr*/
7247    0,                      /*tp_reserved*/
7248    0,                      /*tp_repr*/
7249    0,                      /*tp_as_number*/
7250    0,                      /*tp_as_sequence*/
7251    0,                      /*tp_as_mapping*/
7252    0,                      /*tp_hash*/
7253    0,                      /*tp_call*/
7254    0,                      /*tp_str*/
7255    0,                      /*tp_getattro*/
7256    0,                      /*tp_setattro*/
7257    0,                      /*tp_as_buffer*/
7258    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7259    0,                      /*tp_doc*/
7260    0,                      /*tp_traverse*/
7261    0,                      /*tp_clear*/
7262    0,                      /*tp_richcompare*/
7263    0,                      /*tp_weaklistoffset*/
7264    0,                      /*tp_iter*/
7265    0,                      /*tp_iternext*/
7266    encoding_map_methods,   /*tp_methods*/
7267    0,                      /*tp_members*/
7268    0,                      /*tp_getset*/
7269    0,                      /*tp_base*/
7270    0,                      /*tp_dict*/
7271    0,                      /*tp_descr_get*/
7272    0,                      /*tp_descr_set*/
7273    0,                      /*tp_dictoffset*/
7274    0,                      /*tp_init*/
7275    0,                      /*tp_alloc*/
7276    0,                      /*tp_new*/
7277    0,                      /*tp_free*/
7278    0,                      /*tp_is_gc*/
7279};
7280
7281PyObject*
7282PyUnicode_BuildEncodingMap(PyObject* string)
7283{
7284    PyObject *result;
7285    struct encoding_map *mresult;
7286    int i;
7287    int need_dict = 0;
7288    unsigned char level1[32];
7289    unsigned char level2[512];
7290    unsigned char *mlevel1, *mlevel2, *mlevel3;
7291    int count2 = 0, count3 = 0;
7292    int kind;
7293    void *data;
7294    Py_UCS4 ch;
7295
7296    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7297        PyErr_BadArgument();
7298        return NULL;
7299    }
7300    kind = PyUnicode_KIND(string);
7301    data = PyUnicode_DATA(string);
7302    memset(level1, 0xFF, sizeof level1);
7303    memset(level2, 0xFF, sizeof level2);
7304
7305    /* If there isn't a one-to-one mapping of NULL to \0,
7306       or if there are non-BMP characters, we need to use
7307       a mapping dictionary. */
7308    if (PyUnicode_READ(kind, data, 0) != 0)
7309        need_dict = 1;
7310    for (i = 1; i < 256; i++) {
7311        int l1, l2;
7312        ch = PyUnicode_READ(kind, data, i);
7313        if (ch == 0 || ch > 0xFFFF) {
7314            need_dict = 1;
7315            break;
7316        }
7317        if (ch == 0xFFFE)
7318            /* unmapped character */
7319            continue;
7320        l1 = ch >> 11;
7321        l2 = ch >> 7;
7322        if (level1[l1] == 0xFF)
7323            level1[l1] = count2++;
7324        if (level2[l2] == 0xFF)
7325            level2[l2] = count3++;
7326    }
7327
7328    if (count2 >= 0xFF || count3 >= 0xFF)
7329        need_dict = 1;
7330
7331    if (need_dict) {
7332        PyObject *result = PyDict_New();
7333        PyObject *key, *value;
7334        if (!result)
7335            return NULL;
7336        for (i = 0; i < 256; i++) {
7337            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7338            value = PyLong_FromLong(i);
7339            if (!key || !value)
7340                goto failed1;
7341            if (PyDict_SetItem(result, key, value) == -1)
7342                goto failed1;
7343            Py_DECREF(key);
7344            Py_DECREF(value);
7345        }
7346        return result;
7347      failed1:
7348        Py_XDECREF(key);
7349        Py_XDECREF(value);
7350        Py_DECREF(result);
7351        return NULL;
7352    }
7353
7354    /* Create a three-level trie */
7355    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7356                             16*count2 + 128*count3 - 1);
7357    if (!result)
7358        return PyErr_NoMemory();
7359    PyObject_Init(result, &EncodingMapType);
7360    mresult = (struct encoding_map*)result;
7361    mresult->count2 = count2;
7362    mresult->count3 = count3;
7363    mlevel1 = mresult->level1;
7364    mlevel2 = mresult->level23;
7365    mlevel3 = mresult->level23 + 16*count2;
7366    memcpy(mlevel1, level1, 32);
7367    memset(mlevel2, 0xFF, 16*count2);
7368    memset(mlevel3, 0, 128*count3);
7369    count3 = 0;
7370    for (i = 1; i < 256; i++) {
7371        int o1, o2, o3, i2, i3;
7372        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7373            /* unmapped character */
7374            continue;
7375        o1 = PyUnicode_READ(kind, data, i)>>11;
7376        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7377        i2 = 16*mlevel1[o1] + o2;
7378        if (mlevel2[i2] == 0xFF)
7379            mlevel2[i2] = count3++;
7380        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7381        i3 = 128*mlevel2[i2] + o3;
7382        mlevel3[i3] = i;
7383    }
7384    return result;
7385}
7386
7387static int
7388encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7389{
7390    struct encoding_map *map = (struct encoding_map*)mapping;
7391    int l1 = c>>11;
7392    int l2 = (c>>7) & 0xF;
7393    int l3 = c & 0x7F;
7394    int i;
7395
7396#ifdef Py_UNICODE_WIDE
7397    if (c > 0xFFFF) {
7398        return -1;
7399    }
7400#endif
7401    if (c == 0)
7402        return 0;
7403    /* level 1*/
7404    i = map->level1[l1];
7405    if (i == 0xFF) {
7406        return -1;
7407    }
7408    /* level 2*/
7409    i = map->level23[16*i+l2];
7410    if (i == 0xFF) {
7411        return -1;
7412    }
7413    /* level 3 */
7414    i = map->level23[16*map->count2 + 128*i + l3];
7415    if (i == 0) {
7416        return -1;
7417    }
7418    return i;
7419}
7420
7421/* Lookup the character ch in the mapping. If the character
7422   can't be found, Py_None is returned (or NULL, if another
7423   error occurred). */
7424static PyObject *
7425charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
7426{
7427    PyObject *w = PyLong_FromLong((long)c);
7428    PyObject *x;
7429
7430    if (w == NULL)
7431        return NULL;
7432    x = PyObject_GetItem(mapping, w);
7433    Py_DECREF(w);
7434    if (x == NULL) {
7435        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7436            /* No mapping found means: mapping is undefined. */
7437            PyErr_Clear();
7438            x = Py_None;
7439            Py_INCREF(x);
7440            return x;
7441        } else
7442            return NULL;
7443    }
7444    else if (x == Py_None)
7445        return x;
7446    else if (PyLong_Check(x)) {
7447        long value = PyLong_AS_LONG(x);
7448        if (value < 0 || value > 255) {
7449            PyErr_SetString(PyExc_TypeError,
7450                            "character mapping must be in range(256)");
7451            Py_DECREF(x);
7452            return NULL;
7453        }
7454        return x;
7455    }
7456    else if (PyBytes_Check(x))
7457        return x;
7458    else {
7459        /* wrong return value */
7460        PyErr_Format(PyExc_TypeError,
7461                     "character mapping must return integer, bytes or None, not %.400s",
7462                     x->ob_type->tp_name);
7463        Py_DECREF(x);
7464        return NULL;
7465    }
7466}
7467
7468static int
7469charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7470{
7471    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7472    /* exponentially overallocate to minimize reallocations */
7473    if (requiredsize < 2*outsize)
7474        requiredsize = 2*outsize;
7475    if (_PyBytes_Resize(outobj, requiredsize))
7476        return -1;
7477    return 0;
7478}
7479
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
7483/* lookup the character, put the result in the output string and adjust
7484   various state variables. Resize the output bytes object if not enough
7485   space is available. Return a new reference to the object that
7486   was put in the output buffer, or Py_None, if the mapping was undefined
7487   (in which case no character was written) or NULL, if a
7488   reallocation error occurred. The caller must decref the result */
7489static charmapencode_result
7490charmapencode_output(Py_UNICODE c, PyObject *mapping,
7491                     PyObject **outobj, Py_ssize_t *outpos)
7492{
7493    PyObject *rep;
7494    char *outstart;
7495    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7496
7497    if (Py_TYPE(mapping) == &EncodingMapType) {
7498        int res = encoding_map_lookup(c, mapping);
7499        Py_ssize_t requiredsize = *outpos+1;
7500        if (res == -1)
7501            return enc_FAILED;
7502        if (outsize<requiredsize)
7503            if (charmapencode_resize(outobj, outpos, requiredsize))
7504                return enc_EXCEPTION;
7505        outstart = PyBytes_AS_STRING(*outobj);
7506        outstart[(*outpos)++] = (char)res;
7507        return enc_SUCCESS;
7508    }
7509
7510    rep = charmapencode_lookup(c, mapping);
7511    if (rep==NULL)
7512        return enc_EXCEPTION;
7513    else if (rep==Py_None) {
7514        Py_DECREF(rep);
7515        return enc_FAILED;
7516    } else {
7517        if (PyLong_Check(rep)) {
7518            Py_ssize_t requiredsize = *outpos+1;
7519            if (outsize<requiredsize)
7520                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7521                    Py_DECREF(rep);
7522                    return enc_EXCEPTION;
7523                }
7524            outstart = PyBytes_AS_STRING(*outobj);
7525            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7526        }
7527        else {
7528            const char *repchars = PyBytes_AS_STRING(rep);
7529            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7530            Py_ssize_t requiredsize = *outpos+repsize;
7531            if (outsize<requiredsize)
7532                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7533                    Py_DECREF(rep);
7534                    return enc_EXCEPTION;
7535                }
7536            outstart = PyBytes_AS_STRING(*outobj);
7537            memcpy(outstart + *outpos, repchars, repsize);
7538            *outpos += repsize;
7539        }
7540    }
7541    Py_DECREF(rep);
7542    return enc_SUCCESS;
7543}
7544
7545/* handle an error in PyUnicode_EncodeCharmap
7546   Return 0 on success, -1 on error */
7547static int
7548charmap_encoding_error(
7549    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
7550    PyObject **exceptionObject,
7551    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7552    PyObject **res, Py_ssize_t *respos)
7553{
7554    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7555    Py_ssize_t repsize;
7556    Py_ssize_t newpos;
7557    Py_UNICODE *uni2;
7558    /* startpos for collecting unencodable chars */
7559    Py_ssize_t collstartpos = *inpos;
7560    Py_ssize_t collendpos = *inpos+1;
7561    Py_ssize_t collpos;
7562    char *encoding = "charmap";
7563    char *reason = "character maps to <undefined>";
7564    charmapencode_result x;
7565
7566    /* find all unencodable characters */
7567    while (collendpos < size) {
7568        PyObject *rep;
7569        if (Py_TYPE(mapping) == &EncodingMapType) {
7570            int res = encoding_map_lookup(p[collendpos], mapping);
7571            if (res != -1)
7572                break;
7573            ++collendpos;
7574            continue;
7575        }
7576
7577        rep = charmapencode_lookup(p[collendpos], mapping);
7578        if (rep==NULL)
7579            return -1;
7580        else if (rep!=Py_None) {
7581            Py_DECREF(rep);
7582            break;
7583        }
7584        Py_DECREF(rep);
7585        ++collendpos;
7586    }
7587    /* cache callback name lookup
7588     * (if not done yet, i.e. it's the first error) */
7589    if (*known_errorHandler==-1) {
7590        if ((errors==NULL) || (!strcmp(errors, "strict")))
7591            *known_errorHandler = 1;
7592        else if (!strcmp(errors, "replace"))
7593            *known_errorHandler = 2;
7594        else if (!strcmp(errors, "ignore"))
7595            *known_errorHandler = 3;
7596        else if (!strcmp(errors, "xmlcharrefreplace"))
7597            *known_errorHandler = 4;
7598        else
7599            *known_errorHandler = 0;
7600    }
7601    switch (*known_errorHandler) {
7602    case 1: /* strict */
7603        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7604        return -1;
7605    case 2: /* replace */
7606        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
7607            x = charmapencode_output('?', mapping, res, respos);
7608            if (x==enc_EXCEPTION) {
7609                return -1;
7610            }
7611            else if (x==enc_FAILED) {
7612                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7613                return -1;
7614            }
7615        }
7616        /* fall through */
7617    case 3: /* ignore */
7618        *inpos = collendpos;
7619        break;
7620    case 4: /* xmlcharrefreplace */
7621        /* generate replacement (temporarily (mis)uses p) */
7622        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
7623            char buffer[2+29+1+1];
7624            char *cp;
7625            sprintf(buffer, "&#%d;", (int)p[collpos]);
7626            for (cp = buffer; *cp; ++cp) {
7627                x = charmapencode_output(*cp, mapping, res, respos);
7628                if (x==enc_EXCEPTION)
7629                    return -1;
7630                else if (x==enc_FAILED) {
7631                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7632                    return -1;
7633                }
7634            }
7635        }
7636        *inpos = collendpos;
7637        break;
7638    default:
7639        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
7640                                                      encoding, reason, p, size, exceptionObject,
7641                                                      collstartpos, collendpos, &newpos);
7642        if (repunicode == NULL)
7643            return -1;
7644        if (PyBytes_Check(repunicode)) {
7645            /* Directly copy bytes result to output. */
7646            Py_ssize_t outsize = PyBytes_Size(*res);
7647            Py_ssize_t requiredsize;
7648            repsize = PyBytes_Size(repunicode);
7649            requiredsize = *respos + repsize;
7650            if (requiredsize > outsize)
7651                /* Make room for all additional bytes. */
7652                if (charmapencode_resize(res, respos, requiredsize)) {
7653                    Py_DECREF(repunicode);
7654                    return -1;
7655                }
7656            memcpy(PyBytes_AsString(*res) + *respos,
7657                   PyBytes_AsString(repunicode),  repsize);
7658            *respos += repsize;
7659            *inpos = newpos;
7660            Py_DECREF(repunicode);
7661            break;
7662        }
7663        /* generate replacement  */
7664        repsize = PyUnicode_GET_SIZE(repunicode);
7665        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7666            x = charmapencode_output(*uni2, mapping, res, respos);
7667            if (x==enc_EXCEPTION) {
7668                return -1;
7669            }
7670            else if (x==enc_FAILED) {
7671                Py_DECREF(repunicode);
7672                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7673                return -1;
7674            }
7675        }
7676        *inpos = newpos;
7677        Py_DECREF(repunicode);
7678    }
7679    return 0;
7680}
7681
7682PyObject *
7683PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7684                        Py_ssize_t size,
7685                        PyObject *mapping,
7686                        const char *errors)
7687{
7688    /* output object */
7689    PyObject *res = NULL;
7690    /* current input position */
7691    Py_ssize_t inpos = 0;
7692    /* current output position */
7693    Py_ssize_t respos = 0;
7694    PyObject *errorHandler = NULL;
7695    PyObject *exc = NULL;
7696    /* the following variable is used for caching string comparisons
7697     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7698     * 3=ignore, 4=xmlcharrefreplace */
7699    int known_errorHandler = -1;
7700
7701    /* Default to Latin-1 */
7702    if (mapping == NULL)
7703        return PyUnicode_EncodeLatin1(p, size, errors);
7704
7705    /* allocate enough for a simple encoding without
7706       replacements, if we need more, we'll resize */
7707    res = PyBytes_FromStringAndSize(NULL, size);
7708    if (res == NULL)
7709        goto onError;
7710    if (size == 0)
7711        return res;
7712
7713    while (inpos<size) {
7714        /* try to encode it */
7715        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7716        if (x==enc_EXCEPTION) /* error */
7717            goto onError;
7718        if (x==enc_FAILED) { /* unencodable character */
7719            if (charmap_encoding_error(p, size, &inpos, mapping,
7720                                       &exc,
7721                                       &known_errorHandler, &errorHandler, errors,
7722                                       &res, &respos)) {
7723                goto onError;
7724            }
7725        }
7726        else
7727            /* done with this character => adjust input position */
7728            ++inpos;
7729    }
7730
7731    /* Resize if we allocated to much */
7732    if (respos<PyBytes_GET_SIZE(res))
7733        if (_PyBytes_Resize(&res, respos) < 0)
7734            goto onError;
7735
7736    Py_XDECREF(exc);
7737    Py_XDECREF(errorHandler);
7738    return res;
7739
7740  onError:
7741    Py_XDECREF(res);
7742    Py_XDECREF(exc);
7743    Py_XDECREF(errorHandler);
7744    return NULL;
7745}
7746
7747PyObject *
7748PyUnicode_AsCharmapString(PyObject *unicode,
7749                          PyObject *mapping)
7750{
7751    if (!PyUnicode_Check(unicode) || mapping == NULL) {
7752        PyErr_BadArgument();
7753        return NULL;
7754    }
7755    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
7756                                   PyUnicode_GET_SIZE(unicode),
7757                                   mapping,
7758                                   NULL);
7759}
7760
7761/* create or adjust a UnicodeTranslateError */
7762static void
7763make_translate_exception(PyObject **exceptionObject,
7764                         PyObject *unicode,
7765                         Py_ssize_t startpos, Py_ssize_t endpos,
7766                         const char *reason)
7767{
7768    if (*exceptionObject == NULL) {
7769        *exceptionObject = _PyUnicodeTranslateError_Create(
7770            unicode, startpos, endpos, reason);
7771    }
7772    else {
7773        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7774            goto onError;
7775        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7776            goto onError;
7777        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7778            goto onError;
7779        return;
7780      onError:
7781        Py_DECREF(*exceptionObject);
7782        *exceptionObject = NULL;
7783    }
7784}
7785
7786/* raises a UnicodeTranslateError */
7787static void
7788raise_translate_exception(PyObject **exceptionObject,
7789                          PyObject *unicode,
7790                          Py_ssize_t startpos, Py_ssize_t endpos,
7791                          const char *reason)
7792{
7793    make_translate_exception(exceptionObject,
7794                             unicode, startpos, endpos, reason);
7795    if (*exceptionObject != NULL)
7796        PyCodec_StrictErrors(*exceptionObject);
7797}
7798
7799/* error handling callback helper:
7800   build arguments, call the callback and check the arguments,
7801   put the result into newpos and return the replacement string, which
7802   has to be freed by the caller */
7803static PyObject *
7804unicode_translate_call_errorhandler(const char *errors,
7805                                    PyObject **errorHandler,
7806                                    const char *reason,
7807                                    PyObject *unicode, PyObject **exceptionObject,
7808                                    Py_ssize_t startpos, Py_ssize_t endpos,
7809                                    Py_ssize_t *newpos)
7810{
7811    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
7812
7813    Py_ssize_t i_newpos;
7814    PyObject *restuple;
7815    PyObject *resunicode;
7816
7817    if (*errorHandler == NULL) {
7818        *errorHandler = PyCodec_LookupError(errors);
7819        if (*errorHandler == NULL)
7820            return NULL;
7821    }
7822
7823    make_translate_exception(exceptionObject,
7824                             unicode, startpos, endpos, reason);
7825    if (*exceptionObject == NULL)
7826        return NULL;
7827
7828    restuple = PyObject_CallFunctionObjArgs(
7829        *errorHandler, *exceptionObject, NULL);
7830    if (restuple == NULL)
7831        return NULL;
7832    if (!PyTuple_Check(restuple)) {
7833        PyErr_SetString(PyExc_TypeError, &argparse[4]);
7834        Py_DECREF(restuple);
7835        return NULL;
7836    }
7837    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
7838                          &resunicode, &i_newpos)) {
7839        Py_DECREF(restuple);
7840        return NULL;
7841    }
7842    if (i_newpos<0)
7843        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
7844    else
7845        *newpos = i_newpos;
7846    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
7847        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7848        Py_DECREF(restuple);
7849        return NULL;
7850    }
7851    Py_INCREF(resunicode);
7852    Py_DECREF(restuple);
7853    return resunicode;
7854}
7855
7856/* Lookup the character ch in the mapping and put the result in result,
7857   which must be decrefed by the caller.
7858   Return 0 on success, -1 on error */
7859static int
7860charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
7861{
7862    PyObject *w = PyLong_FromLong((long)c);
7863    PyObject *x;
7864
7865    if (w == NULL)
7866        return -1;
7867    x = PyObject_GetItem(mapping, w);
7868    Py_DECREF(w);
7869    if (x == NULL) {
7870        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7871            /* No mapping found means: use 1:1 mapping. */
7872            PyErr_Clear();
7873            *result = NULL;
7874            return 0;
7875        } else
7876            return -1;
7877    }
7878    else if (x == Py_None) {
7879        *result = x;
7880        return 0;
7881    }
7882    else if (PyLong_Check(x)) {
7883        long value = PyLong_AS_LONG(x);
7884        long max = PyUnicode_GetMax();
7885        if (value < 0 || value > max) {
7886            PyErr_Format(PyExc_TypeError,
7887                         "character mapping must be in range(0x%x)", max+1);
7888            Py_DECREF(x);
7889            return -1;
7890        }
7891        *result = x;
7892        return 0;
7893    }
7894    else if (PyUnicode_Check(x)) {
7895        *result = x;
7896        return 0;
7897    }
7898    else {
7899        /* wrong return value */
7900        PyErr_SetString(PyExc_TypeError,
7901                        "character mapping must return integer, None or str");
7902        Py_DECREF(x);
7903        return -1;
7904    }
7905}
7906/* ensure that *outobj is at least requiredsize characters long,
7907   if not reallocate and adjust various state variables.
7908   Return 0 on success, -1 on error */
7909static int
7910charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
7911                               Py_ssize_t requiredsize)
7912{
7913    Py_ssize_t oldsize = *psize;
7914    if (requiredsize > oldsize) {
7915        /* exponentially overallocate to minimize reallocations */
7916        if (requiredsize < 2 * oldsize)
7917            requiredsize = 2 * oldsize;
7918        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7919        if (*outobj == 0)
7920            return -1;
7921        *psize = requiredsize;
7922    }
7923    return 0;
7924}
7925/* lookup the character, put the result in the output string and adjust
7926   various state variables. Return a new reference to the object that
7927   was put in the output buffer in *result, or Py_None, if the mapping was
7928   undefined (in which case no character was written).
7929   The called must decref result.
7930   Return 0 on success, -1 on error. */
7931static int
7932charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7933                        PyObject *mapping, Py_UCS4 **output,
7934                        Py_ssize_t *osize, Py_ssize_t *opos,
7935                        PyObject **res)
7936{
7937    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7938    if (charmaptranslate_lookup(curinp, mapping, res))
7939        return -1;
7940    if (*res==NULL) {
7941        /* not found => default to 1:1 mapping */
7942        (*output)[(*opos)++] = curinp;
7943    }
7944    else if (*res==Py_None)
7945        ;
7946    else if (PyLong_Check(*res)) {
7947        /* no overflow check, because we know that the space is enough */
7948        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
7949    }
7950    else if (PyUnicode_Check(*res)) {
7951        Py_ssize_t repsize;
7952        if (PyUnicode_READY(*res) == -1)
7953            return -1;
7954        repsize = PyUnicode_GET_LENGTH(*res);
7955        if (repsize==1) {
7956            /* no overflow check, because we know that the space is enough */
7957            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
7958        }
7959        else if (repsize!=0) {
7960            /* more than one character */
7961            Py_ssize_t requiredsize = *opos +
7962                (PyUnicode_GET_LENGTH(input) - ipos) +
7963                repsize - 1;
7964            Py_ssize_t i;
7965            if (charmaptranslate_makespace(output, osize, requiredsize))
7966                return -1;
7967            for(i = 0; i < repsize; i++)
7968                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
7969        }
7970    }
7971    else
7972        return -1;
7973    return 0;
7974}
7975
7976PyObject *
7977_PyUnicode_TranslateCharmap(PyObject *input,
7978                            PyObject *mapping,
7979                            const char *errors)
7980{
7981    /* input object */
7982    char *idata;
7983    Py_ssize_t size, i;
7984    int kind;
7985    /* output buffer */
7986    Py_UCS4 *output = NULL;
7987    Py_ssize_t osize;
7988    PyObject *res;
7989    /* current output position */
7990    Py_ssize_t opos;
7991    char *reason = "character maps to <undefined>";
7992    PyObject *errorHandler = NULL;
7993    PyObject *exc = NULL;
7994    /* the following variable is used for caching string comparisons
7995     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7996     * 3=ignore, 4=xmlcharrefreplace */
7997    int known_errorHandler = -1;
7998
7999    if (mapping == NULL) {
8000        PyErr_BadArgument();
8001        return NULL;
8002    }
8003
8004    if (PyUnicode_READY(input) == -1)
8005        return NULL;
8006    idata = (char*)PyUnicode_DATA(input);
8007    kind = PyUnicode_KIND(input);
8008    size = PyUnicode_GET_LENGTH(input);
8009    i = 0;
8010
8011    if (size == 0) {
8012        Py_INCREF(input);
8013        return input;
8014    }
8015
8016    /* allocate enough for a simple 1:1 translation without
8017       replacements, if we need more, we'll resize */
8018    osize = size;
8019    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8020    opos = 0;
8021    if (output == NULL) {
8022        PyErr_NoMemory();
8023        goto onError;
8024    }
8025
8026    while (i<size) {
8027        /* try to encode it */
8028        PyObject *x = NULL;
8029        if (charmaptranslate_output(input, i, mapping,
8030                                    &output, &osize, &opos, &x)) {
8031            Py_XDECREF(x);
8032            goto onError;
8033        }
8034        Py_XDECREF(x);
8035        if (x!=Py_None) /* it worked => adjust input pointer */
8036            ++i;
8037        else { /* untranslatable character */
8038            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8039            Py_ssize_t repsize;
8040            Py_ssize_t newpos;
8041            Py_ssize_t uni2;
8042            /* startpos for collecting untranslatable chars */
8043            Py_ssize_t collstart = i;
8044            Py_ssize_t collend = i+1;
8045            Py_ssize_t coll;
8046
8047            /* find all untranslatable characters */
8048            while (collend < size) {
8049                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8050                    goto onError;
8051                Py_XDECREF(x);
8052                if (x!=Py_None)
8053                    break;
8054                ++collend;
8055            }
8056            /* cache callback name lookup
8057             * (if not done yet, i.e. it's the first error) */
8058            if (known_errorHandler==-1) {
8059                if ((errors==NULL) || (!strcmp(errors, "strict")))
8060                    known_errorHandler = 1;
8061                else if (!strcmp(errors, "replace"))
8062                    known_errorHandler = 2;
8063                else if (!strcmp(errors, "ignore"))
8064                    known_errorHandler = 3;
8065                else if (!strcmp(errors, "xmlcharrefreplace"))
8066                    known_errorHandler = 4;
8067                else
8068                    known_errorHandler = 0;
8069            }
8070            switch (known_errorHandler) {
8071            case 1: /* strict */
8072                raise_translate_exception(&exc, input, collstart,
8073                                          collend, reason);
8074                goto onError;
8075            case 2: /* replace */
8076                /* No need to check for space, this is a 1:1 replacement */
8077                for (coll = collstart; coll<collend; coll++)
8078                    output[opos++] = '?';
8079                /* fall through */
8080            case 3: /* ignore */
8081                i = collend;
8082                break;
8083            case 4: /* xmlcharrefreplace */
8084                /* generate replacement (temporarily (mis)uses i) */
8085                for (i = collstart; i < collend; ++i) {
8086                    char buffer[2+29+1+1];
8087                    char *cp;
8088                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8089                    if (charmaptranslate_makespace(&output, &osize,
8090                                                   opos+strlen(buffer)+(size-collend)))
8091                        goto onError;
8092                    for (cp = buffer; *cp; ++cp)
8093                        output[opos++] = *cp;
8094                }
8095                i = collend;
8096                break;
8097            default:
8098                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8099                                                                 reason, input, &exc,
8100                                                                 collstart, collend, &newpos);
8101                if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
8102                    goto onError;
8103                /* generate replacement  */
8104                repsize = PyUnicode_GET_LENGTH(repunicode);
8105                if (charmaptranslate_makespace(&output, &osize,
8106                                               opos+repsize+(size-collend))) {
8107                    Py_DECREF(repunicode);
8108                    goto onError;
8109                }
8110                for (uni2 = 0; repsize-->0; ++uni2)
8111                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8112                i = newpos;
8113                Py_DECREF(repunicode);
8114            }
8115        }
8116    }
8117    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8118    if (!res)
8119        goto onError;
8120    PyMem_Free(output);
8121    Py_XDECREF(exc);
8122    Py_XDECREF(errorHandler);
8123    return res;
8124
8125  onError:
8126    PyMem_Free(output);
8127    Py_XDECREF(exc);
8128    Py_XDECREF(errorHandler);
8129    return NULL;
8130}
8131
8132/* Deprecated. Use PyUnicode_Translate instead. */
8133PyObject *
8134PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8135                           Py_ssize_t size,
8136                           PyObject *mapping,
8137                           const char *errors)
8138{
8139    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8140    if (!unicode)
8141        return NULL;
8142    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8143}
8144
8145PyObject *
8146PyUnicode_Translate(PyObject *str,
8147                    PyObject *mapping,
8148                    const char *errors)
8149{
8150    PyObject *result;
8151
8152    str = PyUnicode_FromObject(str);
8153    if (str == NULL)
8154        goto onError;
8155    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8156    Py_DECREF(str);
8157    return result;
8158
8159  onError:
8160    Py_XDECREF(str);
8161    return NULL;
8162}
8163
8164static Py_UCS4
8165fix_decimal_and_space_to_ascii(PyObject *self)
8166{
8167    /* No need to call PyUnicode_READY(self) because this function is only
8168       called as a callback from fixup() which does it already. */
8169    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8170    const int kind = PyUnicode_KIND(self);
8171    void *data = PyUnicode_DATA(self);
8172    Py_UCS4 maxchar = 0, ch, fixed;
8173    Py_ssize_t i;
8174
8175    for (i = 0; i < len; ++i) {
8176        ch = PyUnicode_READ(kind, data, i);
8177        fixed = 0;
8178        if (ch > 127) {
8179            if (Py_UNICODE_ISSPACE(ch))
8180                fixed = ' ';
8181            else {
8182                const int decimal = Py_UNICODE_TODECIMAL(ch);
8183                if (decimal >= 0)
8184                    fixed = '0' + decimal;
8185            }
8186            if (fixed != 0) {
8187                if (fixed > maxchar)
8188                    maxchar = fixed;
8189                PyUnicode_WRITE(kind, data, i, fixed);
8190            }
8191            else if (ch > maxchar)
8192                maxchar = ch;
8193        }
8194        else if (ch > maxchar)
8195            maxchar = ch;
8196    }
8197
8198    return maxchar;
8199}
8200
8201PyObject *
8202_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8203{
8204    if (!PyUnicode_Check(unicode)) {
8205        PyErr_BadInternalCall();
8206        return NULL;
8207    }
8208    if (PyUnicode_READY(unicode) == -1)
8209        return NULL;
8210    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8211        /* If the string is already ASCII, just return the same string */
8212        Py_INCREF(unicode);
8213        return unicode;
8214    }
8215    return fixup(unicode, fix_decimal_and_space_to_ascii);
8216}
8217
8218PyObject *
8219PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8220                                  Py_ssize_t length)
8221{
8222    PyObject *result;
8223    Py_UNICODE *p; /* write pointer into result */
8224    Py_ssize_t i;
8225    /* Copy to a new string */
8226    result = (PyObject *)_PyUnicode_New(length);
8227    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8228    if (result == NULL)
8229        return result;
8230    p = PyUnicode_AS_UNICODE(result);
8231    /* Iterate over code points */
8232    for (i = 0; i < length; i++) {
8233        Py_UNICODE ch =s[i];
8234        if (ch > 127) {
8235            int decimal = Py_UNICODE_TODECIMAL(ch);
8236            if (decimal >= 0)
8237                p[i] = '0' + decimal;
8238        }
8239    }
8240#ifndef DONT_MAKE_RESULT_READY
8241    if (_PyUnicode_READY_REPLACE(&result)) {
8242        Py_DECREF(result);
8243        return NULL;
8244    }
8245#endif
8246    assert(_PyUnicode_CheckConsistency(result, 1));
8247    return result;
8248}
8249/* --- Decimal Encoder ---------------------------------------------------- */
8250
8251int
8252PyUnicode_EncodeDecimal(Py_UNICODE *s,
8253                        Py_ssize_t length,
8254                        char *output,
8255                        const char *errors)
8256{
8257    Py_UNICODE *p, *end;
8258    PyObject *errorHandler = NULL;
8259    PyObject *exc = NULL;
8260    const char *encoding = "decimal";
8261    const char *reason = "invalid decimal Unicode string";
8262    /* the following variable is used for caching string comparisons
8263     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8264    int known_errorHandler = -1;
8265
8266    if (output == NULL) {
8267        PyErr_BadArgument();
8268        return -1;
8269    }
8270
8271    p = s;
8272    end = s + length;
8273    while (p < end) {
8274        register Py_UNICODE ch = *p;
8275        int decimal;
8276        PyObject *repunicode;
8277        Py_ssize_t repsize;
8278        Py_ssize_t newpos;
8279        Py_UNICODE *uni2;
8280        Py_UNICODE *collstart;
8281        Py_UNICODE *collend;
8282
8283        if (Py_UNICODE_ISSPACE(ch)) {
8284            *output++ = ' ';
8285            ++p;
8286            continue;
8287        }
8288        decimal = Py_UNICODE_TODECIMAL(ch);
8289        if (decimal >= 0) {
8290            *output++ = '0' + decimal;
8291            ++p;
8292            continue;
8293        }
8294        if (0 < ch && ch < 256) {
8295            *output++ = (char)ch;
8296            ++p;
8297            continue;
8298        }
8299        /* All other characters are considered unencodable */
8300        collstart = p;
8301        collend = p+1;
8302        while (collend < end) {
8303            if ((0 < *collend && *collend < 256) ||
8304                !Py_UNICODE_ISSPACE(*collend) ||
8305                Py_UNICODE_TODECIMAL(*collend))
8306                break;
8307        }
8308        /* cache callback name lookup
8309         * (if not done yet, i.e. it's the first error) */
8310        if (known_errorHandler==-1) {
8311            if ((errors==NULL) || (!strcmp(errors, "strict")))
8312                known_errorHandler = 1;
8313            else if (!strcmp(errors, "replace"))
8314                known_errorHandler = 2;
8315            else if (!strcmp(errors, "ignore"))
8316                known_errorHandler = 3;
8317            else if (!strcmp(errors, "xmlcharrefreplace"))
8318                known_errorHandler = 4;
8319            else
8320                known_errorHandler = 0;
8321        }
8322        switch (known_errorHandler) {
8323        case 1: /* strict */
8324            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8325            goto onError;
8326        case 2: /* replace */
8327            for (p = collstart; p < collend; ++p)
8328                *output++ = '?';
8329            /* fall through */
8330        case 3: /* ignore */
8331            p = collend;
8332            break;
8333        case 4: /* xmlcharrefreplace */
8334            /* generate replacement (temporarily (mis)uses p) */
8335            for (p = collstart; p < collend; ++p)
8336                output += sprintf(output, "&#%d;", (int)*p);
8337            p = collend;
8338            break;
8339        default:
8340            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8341                                                          encoding, reason, s, length, &exc,
8342                                                          collstart-s, collend-s, &newpos);
8343            if (repunicode == NULL)
8344                goto onError;
8345            if (!PyUnicode_Check(repunicode)) {
8346                /* Byte results not supported, since they have no decimal property. */
8347                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8348                Py_DECREF(repunicode);
8349                goto onError;
8350            }
8351            /* generate replacement  */
8352            repsize = PyUnicode_GET_SIZE(repunicode);
8353            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8354                Py_UNICODE ch = *uni2;
8355                if (Py_UNICODE_ISSPACE(ch))
8356                    *output++ = ' ';
8357                else {
8358                    decimal = Py_UNICODE_TODECIMAL(ch);
8359                    if (decimal >= 0)
8360                        *output++ = '0' + decimal;
8361                    else if (0 < ch && ch < 256)
8362                        *output++ = (char)ch;
8363                    else {
8364                        Py_DECREF(repunicode);
8365                        raise_encode_exception(&exc, encoding,
8366                                               s, length, collstart-s, collend-s, reason);
8367                        goto onError;
8368                    }
8369                }
8370            }
8371            p = s + newpos;
8372            Py_DECREF(repunicode);
8373        }
8374    }
8375    /* 0-terminate the output string */
8376    *output++ = '\0';
8377    Py_XDECREF(exc);
8378    Py_XDECREF(errorHandler);
8379    return 0;
8380
8381  onError:
8382    Py_XDECREF(exc);
8383    Py_XDECREF(errorHandler);
8384    return -1;
8385}
8386
8387/* --- Helpers ------------------------------------------------------------ */
8388
8389#include "stringlib/asciilib.h"
8390#include "stringlib/fastsearch.h"
8391#include "stringlib/partition.h"
8392#include "stringlib/split.h"
8393#include "stringlib/count.h"
8394#include "stringlib/find.h"
8395#include "stringlib/localeutil.h"
8396#include "stringlib/undef.h"
8397
8398#include "stringlib/ucs1lib.h"
8399#include "stringlib/fastsearch.h"
8400#include "stringlib/partition.h"
8401#include "stringlib/split.h"
8402#include "stringlib/count.h"
8403#include "stringlib/find.h"
8404#include "stringlib/localeutil.h"
8405#include "stringlib/undef.h"
8406
8407#include "stringlib/ucs2lib.h"
8408#include "stringlib/fastsearch.h"
8409#include "stringlib/partition.h"
8410#include "stringlib/split.h"
8411#include "stringlib/count.h"
8412#include "stringlib/find.h"
8413#include "stringlib/localeutil.h"
8414#include "stringlib/undef.h"
8415
8416#include "stringlib/ucs4lib.h"
8417#include "stringlib/fastsearch.h"
8418#include "stringlib/partition.h"
8419#include "stringlib/split.h"
8420#include "stringlib/count.h"
8421#include "stringlib/find.h"
8422#include "stringlib/localeutil.h"
8423#include "stringlib/undef.h"
8424
8425static Py_ssize_t
8426any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8427                                  const Py_UCS1*, Py_ssize_t,
8428                                  Py_ssize_t, Py_ssize_t),
8429               Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8430                                  const Py_UCS1*, Py_ssize_t,
8431                                  Py_ssize_t, Py_ssize_t),
8432               Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8433                                  const Py_UCS2*, Py_ssize_t,
8434                                  Py_ssize_t, Py_ssize_t),
8435               Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8436                                  const Py_UCS4*, Py_ssize_t,
8437                                  Py_ssize_t, Py_ssize_t),
8438               PyObject* s1, PyObject* s2,
8439               Py_ssize_t start,
8440               Py_ssize_t end)
8441{
8442    int kind1, kind2, kind;
8443    void *buf1, *buf2;
8444    Py_ssize_t len1, len2, result;
8445
8446    kind1 = PyUnicode_KIND(s1);
8447    kind2 = PyUnicode_KIND(s2);
8448    kind = kind1 > kind2 ? kind1 : kind2;
8449    buf1 = PyUnicode_DATA(s1);
8450    buf2 = PyUnicode_DATA(s2);
8451    if (kind1 != kind)
8452        buf1 = _PyUnicode_AsKind(s1, kind);
8453    if (!buf1)
8454        return -2;
8455    if (kind2 != kind)
8456        buf2 = _PyUnicode_AsKind(s2, kind);
8457    if (!buf2) {
8458        if (kind1 != kind) PyMem_Free(buf1);
8459        return -2;
8460    }
8461    len1 = PyUnicode_GET_LENGTH(s1);
8462    len2 = PyUnicode_GET_LENGTH(s2);
8463
8464    switch(kind) {
8465    case PyUnicode_1BYTE_KIND:
8466        if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8467            result = ascii(buf1, len1, buf2, len2, start, end);
8468        else
8469            result = ucs1(buf1, len1, buf2, len2, start, end);
8470        break;
8471    case PyUnicode_2BYTE_KIND:
8472        result = ucs2(buf1, len1, buf2, len2, start, end);
8473        break;
8474    case PyUnicode_4BYTE_KIND:
8475        result = ucs4(buf1, len1, buf2, len2, start, end);
8476        break;
8477    default:
8478        assert(0); result = -2;
8479    }
8480
8481    if (kind1 != kind)
8482        PyMem_Free(buf1);
8483    if (kind2 != kind)
8484        PyMem_Free(buf2);
8485
8486    return result;
8487}
8488
8489Py_ssize_t
8490_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
8491                                   Py_ssize_t n_buffer,
8492                                   void *digits, Py_ssize_t n_digits,
8493                                   Py_ssize_t min_width,
8494                                   const char *grouping,
8495                                   const char *thousands_sep)
8496{
8497    switch(kind) {
8498    case PyUnicode_1BYTE_KIND:
8499        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8500            return _PyUnicode_ascii_InsertThousandsGrouping(
8501                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8502                min_width, grouping, thousands_sep);
8503        else
8504            return _PyUnicode_ucs1_InsertThousandsGrouping(
8505                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8506                min_width, grouping, thousands_sep);
8507    case PyUnicode_2BYTE_KIND:
8508        return _PyUnicode_ucs2_InsertThousandsGrouping(
8509            (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8510            min_width, grouping, thousands_sep);
8511    case PyUnicode_4BYTE_KIND:
8512        return _PyUnicode_ucs4_InsertThousandsGrouping(
8513            (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8514            min_width, grouping, thousands_sep);
8515    }
8516    assert(0);
8517    return -1;
8518}
8519
8520
8521#include "stringlib/unicodedefs.h"
8522#include "stringlib/fastsearch.h"
8523
8524#include "stringlib/count.h"
8525#include "stringlib/find.h"
8526
/* helper macro to fixup start/end slice values

   Modifies start and end in place so that they describe a slice of a
   sequence of length len:
     - end greater than len is clamped to len;
     - a negative end has len added to it and is clamped to 0 if still
       negative;
     - a negative start has len added to it and is clamped to 0 if still
       negative.
   start is NOT clamped against len or end, so callers must cope with
   start > end themselves.

   The expansion is a pair of bare if statements (not wrapped in
   do { } while (0)) and the arguments are evaluated several times, so
   pass plain lvalues for start/end and use the macro only as a
   stand-alone statement. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
8541
8542Py_ssize_t
8543PyUnicode_Count(PyObject *str,
8544                PyObject *substr,
8545                Py_ssize_t start,
8546                Py_ssize_t end)
8547{
8548    Py_ssize_t result;
8549    PyUnicodeObject* str_obj;
8550    PyUnicodeObject* sub_obj;
8551    int kind1, kind2, kind;
8552    void *buf1 = NULL, *buf2 = NULL;
8553    Py_ssize_t len1, len2;
8554
8555    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
8556    if (!str_obj || PyUnicode_READY(str_obj) == -1)
8557        return -1;
8558    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
8559    if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
8560        Py_DECREF(str_obj);
8561        return -1;
8562    }
8563
8564    kind1 = PyUnicode_KIND(str_obj);
8565    kind2 = PyUnicode_KIND(sub_obj);
8566    kind = kind1 > kind2 ? kind1 : kind2;
8567    buf1 = PyUnicode_DATA(str_obj);
8568    if (kind1 != kind)
8569        buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8570    if (!buf1)
8571        goto onError;
8572    buf2 = PyUnicode_DATA(sub_obj);
8573    if (kind2 != kind)
8574        buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8575    if (!buf2)
8576        goto onError;
8577    len1 = PyUnicode_GET_LENGTH(str_obj);
8578    len2 = PyUnicode_GET_LENGTH(sub_obj);
8579
8580    ADJUST_INDICES(start, end, len1);
8581    switch(kind) {
8582    case PyUnicode_1BYTE_KIND:
8583        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8584            result = asciilib_count(
8585                ((Py_UCS1*)buf1) + start, end - start,
8586                buf2, len2, PY_SSIZE_T_MAX
8587                );
8588        else
8589            result = ucs1lib_count(
8590                ((Py_UCS1*)buf1) + start, end - start,
8591                buf2, len2, PY_SSIZE_T_MAX
8592                );
8593        break;
8594    case PyUnicode_2BYTE_KIND:
8595        result = ucs2lib_count(
8596            ((Py_UCS2*)buf1) + start, end - start,
8597            buf2, len2, PY_SSIZE_T_MAX
8598            );
8599        break;
8600    case PyUnicode_4BYTE_KIND:
8601        result = ucs4lib_count(
8602            ((Py_UCS4*)buf1) + start, end - start,
8603            buf2, len2, PY_SSIZE_T_MAX
8604            );
8605        break;
8606    default:
8607        assert(0); result = 0;
8608    }
8609
8610    Py_DECREF(sub_obj);
8611    Py_DECREF(str_obj);
8612
8613    if (kind1 != kind)
8614        PyMem_Free(buf1);
8615    if (kind2 != kind)
8616        PyMem_Free(buf2);
8617
8618    return result;
8619  onError:
8620    Py_DECREF(sub_obj);
8621    Py_DECREF(str_obj);
8622    if (kind1 != kind && buf1)
8623        PyMem_Free(buf1);
8624    if (kind2 != kind && buf2)
8625        PyMem_Free(buf2);
8626    return -1;
8627}
8628
8629Py_ssize_t
8630PyUnicode_Find(PyObject *str,
8631               PyObject *sub,
8632               Py_ssize_t start,
8633               Py_ssize_t end,
8634               int direction)
8635{
8636    Py_ssize_t result;
8637
8638    str = PyUnicode_FromObject(str);
8639    if (!str || PyUnicode_READY(str) == -1)
8640        return -2;
8641    sub = PyUnicode_FromObject(sub);
8642    if (!sub || PyUnicode_READY(sub) == -1) {
8643        Py_DECREF(str);
8644        return -2;
8645    }
8646
8647    if (direction > 0)
8648        result = any_find_slice(
8649            asciilib_find_slice, ucs1lib_find_slice,
8650            ucs2lib_find_slice, ucs4lib_find_slice,
8651            str, sub, start, end
8652            );
8653    else
8654        result = any_find_slice(
8655            asciilib_find_slice, ucs1lib_rfind_slice,
8656            ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8657            str, sub, start, end
8658            );
8659
8660    Py_DECREF(str);
8661    Py_DECREF(sub);
8662
8663    return result;
8664}
8665
8666Py_ssize_t
8667PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8668                   Py_ssize_t start, Py_ssize_t end,
8669                   int direction)
8670{
8671    char *result;
8672    int kind;
8673    if (PyUnicode_READY(str) == -1)
8674        return -2;
8675    if (start < 0 || end < 0) {
8676        PyErr_SetString(PyExc_IndexError, "string index out of range");
8677        return -2;
8678    }
8679    if (end > PyUnicode_GET_LENGTH(str))
8680        end = PyUnicode_GET_LENGTH(str);
8681    kind = PyUnicode_KIND(str);
8682    result = findchar(PyUnicode_1BYTE_DATA(str)
8683                      + PyUnicode_KIND_SIZE(kind, start),
8684                      kind,
8685                      end-start, ch, direction);
8686    if (!result)
8687        return -1;
8688    return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8689}
8690
8691static int
8692tailmatch(PyUnicodeObject *self,
8693          PyUnicodeObject *substring,
8694          Py_ssize_t start,
8695          Py_ssize_t end,
8696          int direction)
8697{
8698    int kind_self;
8699    int kind_sub;
8700    void *data_self;
8701    void *data_sub;
8702    Py_ssize_t offset;
8703    Py_ssize_t i;
8704    Py_ssize_t end_sub;
8705
8706    if (PyUnicode_READY(self) == -1 ||
8707        PyUnicode_READY(substring) == -1)
8708        return 0;
8709
8710    if (PyUnicode_GET_LENGTH(substring) == 0)
8711        return 1;
8712
8713    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8714    end -= PyUnicode_GET_LENGTH(substring);
8715    if (end < start)
8716        return 0;
8717
8718    kind_self = PyUnicode_KIND(self);
8719    data_self = PyUnicode_DATA(self);
8720    kind_sub = PyUnicode_KIND(substring);
8721    data_sub = PyUnicode_DATA(substring);
8722    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8723
8724    if (direction > 0)
8725        offset = end;
8726    else
8727        offset = start;
8728
8729    if (PyUnicode_READ(kind_self, data_self, offset) ==
8730        PyUnicode_READ(kind_sub, data_sub, 0) &&
8731        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8732        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8733        /* If both are of the same kind, memcmp is sufficient */
8734        if (kind_self == kind_sub) {
8735            return ! memcmp((char *)data_self +
8736                                (offset * PyUnicode_CHARACTER_SIZE(substring)),
8737                            data_sub,
8738                            PyUnicode_GET_LENGTH(substring) *
8739                                PyUnicode_CHARACTER_SIZE(substring));
8740        }
8741        /* otherwise we have to compare each character by first accesing it */
8742        else {
8743            /* We do not need to compare 0 and len(substring)-1 because
8744               the if statement above ensured already that they are equal
8745               when we end up here. */
8746            // TODO: honor direction and do a forward or backwards search
8747            for (i = 1; i < end_sub; ++i) {
8748                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8749                    PyUnicode_READ(kind_sub, data_sub, i))
8750                    return 0;
8751            }
8752            return 1;
8753        }
8754    }
8755
8756    return 0;
8757}
8758
8759Py_ssize_t
8760PyUnicode_Tailmatch(PyObject *str,
8761                    PyObject *substr,
8762                    Py_ssize_t start,
8763                    Py_ssize_t end,
8764                    int direction)
8765{
8766    Py_ssize_t result;
8767
8768    str = PyUnicode_FromObject(str);
8769    if (str == NULL)
8770        return -1;
8771    substr = PyUnicode_FromObject(substr);
8772    if (substr == NULL) {
8773        Py_DECREF(str);
8774        return -1;
8775    }
8776
8777    result = tailmatch((PyUnicodeObject *)str,
8778                       (PyUnicodeObject *)substr,
8779                       start, end, direction);
8780    Py_DECREF(str);
8781    Py_DECREF(substr);
8782    return result;
8783}
8784
8785/* Apply fixfct filter to the Unicode object self and return a
8786   reference to the modified object */
8787
8788static PyObject *
8789fixup(PyObject *self,
8790      Py_UCS4 (*fixfct)(PyObject *s))
8791{
8792    PyObject *u;
8793    Py_UCS4 maxchar_old, maxchar_new = 0;
8794
8795    if (PyUnicode_READY(self) == -1)
8796        return NULL;
8797    maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8798    u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8799                      maxchar_old);
8800    if (u == NULL)
8801        return NULL;
8802
8803    Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8804              PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
8805
8806    /* fix functions return the new maximum character in a string,
8807       if the kind of the resulting unicode object does not change,
8808       everything is fine.  Otherwise we need to change the string kind
8809       and re-run the fix function. */
8810    maxchar_new = fixfct(u);
8811    if (maxchar_new == 0)
8812        /* do nothing, keep maxchar_new at 0 which means no changes. */;
8813    else if (maxchar_new <= 127)
8814        maxchar_new = 127;
8815    else if (maxchar_new <= 255)
8816        maxchar_new = 255;
8817    else if (maxchar_new <= 65535)
8818        maxchar_new = 65535;
8819    else
8820        maxchar_new = 1114111; /* 0x10ffff */
8821
8822    if (!maxchar_new && PyUnicode_CheckExact(self)) {
8823        /* fixfct should return TRUE if it modified the buffer. If
8824           FALSE, return a reference to the original buffer instead
8825           (to save space, not time) */
8826        Py_INCREF(self);
8827        Py_DECREF(u);
8828        return (PyObject*) self;
8829    }
8830    else if (maxchar_new == maxchar_old) {
8831        return u;
8832    }
8833    else {
8834        /* In case the maximum character changed, we need to
8835           convert the string to the new category. */
8836        PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
8837        if (v == NULL) {
8838            Py_DECREF(u);
8839            return NULL;
8840        }
8841        if (maxchar_new > maxchar_old) {
8842            /* If the maxchar increased so that the kind changed, not all
8843               characters are representable anymore and we need to fix the
8844               string again. This only happens in very few cases. */
8845            copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
8846            maxchar_old = fixfct(v);
8847            assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8848        }
8849        else {
8850            copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
8851        }
8852
8853        Py_DECREF(u);
8854        assert(_PyUnicode_CheckConsistency(v, 1));
8855        return v;
8856    }
8857}
8858
8859static Py_UCS4
8860fixupper(PyObject *self)
8861{
8862    /* No need to call PyUnicode_READY(self) because this function is only
8863       called as a callback from fixup() which does it already. */
8864    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8865    const int kind = PyUnicode_KIND(self);
8866    void *data = PyUnicode_DATA(self);
8867    int touched = 0;
8868    Py_UCS4 maxchar = 0;
8869    Py_ssize_t i;
8870
8871    for (i = 0; i < len; ++i) {
8872        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8873        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8874        if (up != ch) {
8875            if (up > maxchar)
8876                maxchar = up;
8877            PyUnicode_WRITE(kind, data, i, up);
8878            touched = 1;
8879        }
8880        else if (ch > maxchar)
8881            maxchar = ch;
8882    }
8883
8884    if (touched)
8885        return maxchar;
8886    else
8887        return 0;
8888}
8889
8890static Py_UCS4
8891fixlower(PyObject *self)
8892{
8893    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8894    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8895    const int kind = PyUnicode_KIND(self);
8896    void *data = PyUnicode_DATA(self);
8897    int touched = 0;
8898    Py_UCS4 maxchar = 0;
8899    Py_ssize_t i;
8900
8901    for(i = 0; i < len; ++i) {
8902        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8903        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8904        if (lo != ch) {
8905            if (lo > maxchar)
8906                maxchar = lo;
8907            PyUnicode_WRITE(kind, data, i, lo);
8908            touched = 1;
8909        }
8910        else if (ch > maxchar)
8911            maxchar = ch;
8912    }
8913
8914    if (touched)
8915        return maxchar;
8916    else
8917        return 0;
8918}
8919
8920static Py_UCS4
8921fixswapcase(PyObject *self)
8922{
8923    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8924    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8925    const int kind = PyUnicode_KIND(self);
8926    void *data = PyUnicode_DATA(self);
8927    int touched = 0;
8928    Py_UCS4 maxchar = 0;
8929    Py_ssize_t i;
8930
8931    for(i = 0; i < len; ++i) {
8932        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8933        Py_UCS4 nu = 0;
8934
8935        if (Py_UNICODE_ISUPPER(ch))
8936            nu = Py_UNICODE_TOLOWER(ch);
8937        else if (Py_UNICODE_ISLOWER(ch))
8938            nu = Py_UNICODE_TOUPPER(ch);
8939
8940        if (nu != 0) {
8941            if (nu > maxchar)
8942                maxchar = nu;
8943            PyUnicode_WRITE(kind, data, i, nu);
8944            touched = 1;
8945        }
8946        else if (ch > maxchar)
8947            maxchar = ch;
8948    }
8949
8950    if (touched)
8951        return maxchar;
8952    else
8953        return 0;
8954}
8955
8956static Py_UCS4
8957fixcapitalize(PyObject *self)
8958{
8959    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8960    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8961    const int kind = PyUnicode_KIND(self);
8962    void *data = PyUnicode_DATA(self);
8963    int touched = 0;
8964    Py_UCS4 maxchar = 0;
8965    Py_ssize_t i = 0;
8966    Py_UCS4 ch;
8967
8968    if (len == 0)
8969        return 0;
8970
8971    ch = PyUnicode_READ(kind, data, i);
8972    if (!Py_UNICODE_ISUPPER(ch)) {
8973        maxchar = Py_UNICODE_TOUPPER(ch);
8974        PyUnicode_WRITE(kind, data, i, maxchar);
8975        touched = 1;
8976    }
8977    ++i;
8978    for(; i < len; ++i) {
8979        ch = PyUnicode_READ(kind, data, i);
8980        if (!Py_UNICODE_ISLOWER(ch)) {
8981            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8982            if (lo > maxchar)
8983                maxchar = lo;
8984            PyUnicode_WRITE(kind, data, i, lo);
8985            touched = 1;
8986        }
8987        else if (ch > maxchar)
8988            maxchar = ch;
8989    }
8990
8991    if (touched)
8992        return maxchar;
8993    else
8994        return 0;
8995}
8996
8997static Py_UCS4
8998fixtitle(PyObject *self)
8999{
9000    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9001    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9002    const int kind = PyUnicode_KIND(self);
9003    void *data = PyUnicode_DATA(self);
9004    Py_UCS4 maxchar = 0;
9005    Py_ssize_t i = 0;
9006    int previous_is_cased;
9007
9008    /* Shortcut for single character strings */
9009    if (len == 1) {
9010        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9011        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9012        if (ti != ch) {
9013            PyUnicode_WRITE(kind, data, i, ti);
9014            return ti;
9015        }
9016        else
9017            return 0;
9018    }
9019    previous_is_cased = 0;
9020    for(; i < len; ++i) {
9021        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9022        Py_UCS4 nu;
9023
9024        if (previous_is_cased)
9025            nu = Py_UNICODE_TOLOWER(ch);
9026        else
9027            nu = Py_UNICODE_TOTITLE(ch);
9028
9029        if (nu > maxchar)
9030            maxchar = nu;
9031        PyUnicode_WRITE(kind, data, i, nu);
9032
9033        if (Py_UNICODE_ISLOWER(ch) ||
9034            Py_UNICODE_ISUPPER(ch) ||
9035            Py_UNICODE_ISTITLE(ch))
9036            previous_is_cased = 1;
9037        else
9038            previous_is_cased = 0;
9039    }
9040    return maxchar;
9041}
9042
9043PyObject *
9044PyUnicode_Join(PyObject *separator, PyObject *seq)
9045{
9046    PyObject *sep = NULL;
9047    Py_ssize_t seplen = 1;
9048    PyObject *res = NULL; /* the result */
9049    PyObject *fseq;          /* PySequence_Fast(seq) */
9050    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9051    PyObject **items;
9052    PyObject *item;
9053    Py_ssize_t sz, i, res_offset;
9054    Py_UCS4 maxchar;
9055    Py_UCS4 item_maxchar;
9056
9057    fseq = PySequence_Fast(seq, "");
9058    if (fseq == NULL) {
9059        return NULL;
9060    }
9061
9062    /* NOTE: the following code can't call back into Python code,
9063     * so we are sure that fseq won't be mutated.
9064     */
9065
9066    seqlen = PySequence_Fast_GET_SIZE(fseq);
9067    /* If empty sequence, return u"". */
9068    if (seqlen == 0) {
9069        Py_DECREF(fseq);
9070        Py_INCREF(unicode_empty);
9071        res = unicode_empty;
9072        return res;
9073    }
9074
9075    /* If singleton sequence with an exact Unicode, return that. */
9076    items = PySequence_Fast_ITEMS(fseq);
9077    if (seqlen == 1 && PyUnicode_CheckExact(items[0])) {
9078        res = items[0];
9079        Py_INCREF(res);
9080        Py_DECREF(fseq);
9081        return res;
9082    }
9083
9084    /* Set up sep and seplen */
9085    if (separator == NULL) {
9086        /* fall back to a blank space separator */
9087        sep = PyUnicode_FromOrdinal(' ');
9088        if (!sep)
9089            goto onError;
9090        maxchar = 32;
9091    }
9092    else {
9093        if (!PyUnicode_Check(separator)) {
9094            PyErr_Format(PyExc_TypeError,
9095                         "separator: expected str instance,"
9096                         " %.80s found",
9097                         Py_TYPE(separator)->tp_name);
9098            goto onError;
9099        }
9100        if (PyUnicode_READY(separator))
9101            goto onError;
9102        sep = separator;
9103        seplen = PyUnicode_GET_LENGTH(separator);
9104        maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9105        /* inc refcount to keep this code path symmetric with the
9106           above case of a blank separator */
9107        Py_INCREF(sep);
9108    }
9109
9110    /* There are at least two things to join, or else we have a subclass
9111     * of str in the sequence.
9112     * Do a pre-pass to figure out the total amount of space we'll
9113     * need (sz), and see whether all argument are strings.
9114     */
9115    sz = 0;
9116    for (i = 0; i < seqlen; i++) {
9117        const Py_ssize_t old_sz = sz;
9118        item = items[i];
9119        if (!PyUnicode_Check(item)) {
9120            PyErr_Format(PyExc_TypeError,
9121                         "sequence item %zd: expected str instance,"
9122                         " %.80s found",
9123                         i, Py_TYPE(item)->tp_name);
9124            goto onError;
9125        }
9126        if (PyUnicode_READY(item) == -1)
9127            goto onError;
9128        sz += PyUnicode_GET_LENGTH(item);
9129        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9130        if (item_maxchar > maxchar)
9131            maxchar = item_maxchar;
9132        if (i != 0)
9133            sz += seplen;
9134        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9135            PyErr_SetString(PyExc_OverflowError,
9136                            "join() result is too long for a Python string");
9137            goto onError;
9138        }
9139    }
9140
9141    res = PyUnicode_New(sz, maxchar);
9142    if (res == NULL)
9143        goto onError;
9144
9145    /* Catenate everything. */
9146    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9147        Py_ssize_t itemlen;
9148        item = items[i];
9149        /* Copy item, and maybe the separator. */
9150        if (i && seplen != 0) {
9151            copy_characters(res, res_offset, sep, 0, seplen);
9152            res_offset += seplen;
9153        }
9154        itemlen = PyUnicode_GET_LENGTH(item);
9155        if (itemlen != 0) {
9156            copy_characters(res, res_offset, item, 0, itemlen);
9157            res_offset += itemlen;
9158        }
9159    }
9160    assert(res_offset == PyUnicode_GET_LENGTH(res));
9161
9162    Py_DECREF(fseq);
9163    Py_XDECREF(sep);
9164    assert(_PyUnicode_CheckConsistency(res, 1));
9165    return res;
9166
9167  onError:
9168    Py_DECREF(fseq);
9169    Py_XDECREF(sep);
9170    Py_XDECREF(res);
9171    return NULL;
9172}
9173
9174#define FILL(kind, data, value, start, length) \
9175    do { \
9176        Py_ssize_t i_ = 0; \
9177        assert(kind != PyUnicode_WCHAR_KIND); \
9178        switch ((kind)) { \
9179        case PyUnicode_1BYTE_KIND: { \
9180            unsigned char * to_ = (unsigned char *)((data)) + (start); \
9181            memset(to_, (unsigned char)value, length); \
9182            break; \
9183        } \
9184        case PyUnicode_2BYTE_KIND: { \
9185            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9186            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9187            break; \
9188        } \
9189        default: { \
9190            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9191            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9192            break; \
9193        } \
9194        } \
9195    } while (0)
9196
9197static PyObject *
9198pad(PyObject *self,
9199    Py_ssize_t left,
9200    Py_ssize_t right,
9201    Py_UCS4 fill)
9202{
9203    PyObject *u;
9204    Py_UCS4 maxchar;
9205    int kind;
9206    void *data;
9207
9208    if (left < 0)
9209        left = 0;
9210    if (right < 0)
9211        right = 0;
9212
9213    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
9214        Py_INCREF(self);
9215        return self;
9216    }
9217
9218    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9219        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9220        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9221        return NULL;
9222    }
9223    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9224    if (fill > maxchar)
9225        maxchar = fill;
9226    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9227    if (!u)
9228        return NULL;
9229
9230    kind = PyUnicode_KIND(u);
9231    data = PyUnicode_DATA(u);
9232    if (left)
9233        FILL(kind, data, fill, 0, left);
9234    if (right)
9235        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9236    copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
9237    assert(_PyUnicode_CheckConsistency(u, 1));
9238    return u;
9239}
9240#undef FILL
9241
9242PyObject *
9243PyUnicode_Splitlines(PyObject *string, int keepends)
9244{
9245    PyObject *list;
9246
9247    string = PyUnicode_FromObject(string);
9248    if (string == NULL || PyUnicode_READY(string) == -1)
9249        return NULL;
9250
9251    switch(PyUnicode_KIND(string)) {
9252    case PyUnicode_1BYTE_KIND:
9253        if (PyUnicode_IS_ASCII(string))
9254            list = asciilib_splitlines(
9255                (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9256                PyUnicode_GET_LENGTH(string), keepends);
9257        else
9258            list = ucs1lib_splitlines(
9259                (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9260                PyUnicode_GET_LENGTH(string), keepends);
9261        break;
9262    case PyUnicode_2BYTE_KIND:
9263        list = ucs2lib_splitlines(
9264            (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9265            PyUnicode_GET_LENGTH(string), keepends);
9266        break;
9267    case PyUnicode_4BYTE_KIND:
9268        list = ucs4lib_splitlines(
9269            (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9270            PyUnicode_GET_LENGTH(string), keepends);
9271        break;
9272    default:
9273        assert(0);
9274        list = 0;
9275    }
9276    Py_DECREF(string);
9277    return list;
9278}
9279
9280static PyObject *
9281split(PyObject *self,
9282      PyObject *substring,
9283      Py_ssize_t maxcount)
9284{
9285    int kind1, kind2, kind;
9286    void *buf1, *buf2;
9287    Py_ssize_t len1, len2;
9288    PyObject* out;
9289
9290    if (maxcount < 0)
9291        maxcount = PY_SSIZE_T_MAX;
9292
9293    if (PyUnicode_READY(self) == -1)
9294        return NULL;
9295
9296    if (substring == NULL)
9297        switch(PyUnicode_KIND(self)) {
9298        case PyUnicode_1BYTE_KIND:
9299            if (PyUnicode_IS_ASCII(self))
9300                return asciilib_split_whitespace(
9301                    (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9302                    PyUnicode_GET_LENGTH(self), maxcount
9303                    );
9304            else
9305                return ucs1lib_split_whitespace(
9306                    (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9307                    PyUnicode_GET_LENGTH(self), maxcount
9308                    );
9309        case PyUnicode_2BYTE_KIND:
9310            return ucs2lib_split_whitespace(
9311                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9312                PyUnicode_GET_LENGTH(self), maxcount
9313                );
9314        case PyUnicode_4BYTE_KIND:
9315            return ucs4lib_split_whitespace(
9316                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9317                PyUnicode_GET_LENGTH(self), maxcount
9318                );
9319        default:
9320            assert(0);
9321            return NULL;
9322        }
9323
9324    if (PyUnicode_READY(substring) == -1)
9325        return NULL;
9326
9327    kind1 = PyUnicode_KIND(self);
9328    kind2 = PyUnicode_KIND(substring);
9329    kind = kind1 > kind2 ? kind1 : kind2;
9330    buf1 = PyUnicode_DATA(self);
9331    buf2 = PyUnicode_DATA(substring);
9332    if (kind1 != kind)
9333        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9334    if (!buf1)
9335        return NULL;
9336    if (kind2 != kind)
9337        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9338    if (!buf2) {
9339        if (kind1 != kind) PyMem_Free(buf1);
9340        return NULL;
9341    }
9342    len1 = PyUnicode_GET_LENGTH(self);
9343    len2 = PyUnicode_GET_LENGTH(substring);
9344
9345    switch(kind) {
9346    case PyUnicode_1BYTE_KIND:
9347        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9348            out = asciilib_split(
9349                (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9350        else
9351            out = ucs1lib_split(
9352                (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9353        break;
9354    case PyUnicode_2BYTE_KIND:
9355        out = ucs2lib_split(
9356            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9357        break;
9358    case PyUnicode_4BYTE_KIND:
9359        out = ucs4lib_split(
9360            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9361        break;
9362    default:
9363        out = NULL;
9364    }
9365    if (kind1 != kind)
9366        PyMem_Free(buf1);
9367    if (kind2 != kind)
9368        PyMem_Free(buf2);
9369    return out;
9370}
9371
9372static PyObject *
9373rsplit(PyObject *self,
9374       PyObject *substring,
9375       Py_ssize_t maxcount)
9376{
9377    int kind1, kind2, kind;
9378    void *buf1, *buf2;
9379    Py_ssize_t len1, len2;
9380    PyObject* out;
9381
9382    if (maxcount < 0)
9383        maxcount = PY_SSIZE_T_MAX;
9384
9385    if (PyUnicode_READY(self) == -1)
9386        return NULL;
9387
9388    if (substring == NULL)
9389        switch(PyUnicode_KIND(self)) {
9390        case PyUnicode_1BYTE_KIND:
9391            if (PyUnicode_IS_ASCII(self))
9392                return asciilib_rsplit_whitespace(
9393                    (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9394                    PyUnicode_GET_LENGTH(self), maxcount
9395                    );
9396            else
9397                return ucs1lib_rsplit_whitespace(
9398                    (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9399                    PyUnicode_GET_LENGTH(self), maxcount
9400                    );
9401        case PyUnicode_2BYTE_KIND:
9402            return ucs2lib_rsplit_whitespace(
9403                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9404                PyUnicode_GET_LENGTH(self), maxcount
9405                );
9406        case PyUnicode_4BYTE_KIND:
9407            return ucs4lib_rsplit_whitespace(
9408                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9409                PyUnicode_GET_LENGTH(self), maxcount
9410                );
9411        default:
9412            assert(0);
9413            return NULL;
9414        }
9415
9416    if (PyUnicode_READY(substring) == -1)
9417        return NULL;
9418
9419    kind1 = PyUnicode_KIND(self);
9420    kind2 = PyUnicode_KIND(substring);
9421    kind = kind1 > kind2 ? kind1 : kind2;
9422    buf1 = PyUnicode_DATA(self);
9423    buf2 = PyUnicode_DATA(substring);
9424    if (kind1 != kind)
9425        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9426    if (!buf1)
9427        return NULL;
9428    if (kind2 != kind)
9429        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9430    if (!buf2) {
9431        if (kind1 != kind) PyMem_Free(buf1);
9432        return NULL;
9433    }
9434    len1 = PyUnicode_GET_LENGTH(self);
9435    len2 = PyUnicode_GET_LENGTH(substring);
9436
9437    switch(kind) {
9438    case PyUnicode_1BYTE_KIND:
9439        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9440            out = asciilib_rsplit(
9441                (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9442        else
9443            out = ucs1lib_rsplit(
9444                (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9445        break;
9446    case PyUnicode_2BYTE_KIND:
9447        out = ucs2lib_rsplit(
9448            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9449        break;
9450    case PyUnicode_4BYTE_KIND:
9451        out = ucs4lib_rsplit(
9452            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9453        break;
9454    default:
9455        out = NULL;
9456    }
9457    if (kind1 != kind)
9458        PyMem_Free(buf1);
9459    if (kind2 != kind)
9460        PyMem_Free(buf2);
9461    return out;
9462}
9463
9464static Py_ssize_t
9465anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9466            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9467{
9468    switch(kind) {
9469    case PyUnicode_1BYTE_KIND:
9470        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9471            return asciilib_find(buf1, len1, buf2, len2, offset);
9472        else
9473            return ucs1lib_find(buf1, len1, buf2, len2, offset);
9474    case PyUnicode_2BYTE_KIND:
9475        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9476    case PyUnicode_4BYTE_KIND:
9477        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9478    }
9479    assert(0);
9480    return -1;
9481}
9482
9483static Py_ssize_t
9484anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9485             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9486{
9487        switch(kind) {
9488        case PyUnicode_1BYTE_KIND:
9489            if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9490                return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9491            else
9492                return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9493        case PyUnicode_2BYTE_KIND:
9494            return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9495        case PyUnicode_4BYTE_KIND:
9496            return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9497        }
9498        assert(0);
9499        return 0;
9500}
9501
9502static PyObject *
9503replace(PyObject *self, PyObject *str1,
9504        PyObject *str2, Py_ssize_t maxcount)
9505{
9506    PyObject *u;
9507    char *sbuf = PyUnicode_DATA(self);
9508    char *buf1 = PyUnicode_DATA(str1);
9509    char *buf2 = PyUnicode_DATA(str2);
9510    int srelease = 0, release1 = 0, release2 = 0;
9511    int skind = PyUnicode_KIND(self);
9512    int kind1 = PyUnicode_KIND(str1);
9513    int kind2 = PyUnicode_KIND(str2);
9514    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9515    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9516    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9517
9518    if (maxcount < 0)
9519        maxcount = PY_SSIZE_T_MAX;
9520    else if (maxcount == 0 || slen == 0)
9521        goto nothing;
9522
9523    if (skind < kind1)
9524        /* substring too wide to be present */
9525        goto nothing;
9526
9527    if (len1 == len2) {
9528        Py_ssize_t i;
9529        /* same length */
9530        if (len1 == 0)
9531            goto nothing;
9532        if (len1 == 1) {
9533            /* replace characters */
9534            Py_UCS4 u1, u2, maxchar;
9535            int mayshrink, rkind;
9536            u1 = PyUnicode_READ_CHAR(str1, 0);
9537            if (!findchar(sbuf, PyUnicode_KIND(self),
9538                          slen, u1, 1))
9539                goto nothing;
9540            u2 = PyUnicode_READ_CHAR(str2, 0);
9541            maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9542            /* Replacing u1 with u2 may cause a maxchar reduction in the
9543               result string. */
9544            if (u2 > maxchar) {
9545                maxchar = u2;
9546                mayshrink = 0;
9547            }
9548            else
9549                mayshrink = maxchar > 127;
9550            u = PyUnicode_New(slen, maxchar);
9551            if (!u)
9552                goto error;
9553            copy_characters(u, 0, self, 0, slen);
9554            rkind = PyUnicode_KIND(u);
9555            for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9556                if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
9557                    if (--maxcount < 0)
9558                        break;
9559                    PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
9560                }
9561            if (mayshrink) {
9562                PyObject *tmp = u;
9563                u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9564                                              PyUnicode_GET_LENGTH(tmp));
9565                Py_DECREF(tmp);
9566            }
9567        } else {
9568            int rkind = skind;
9569            char *res;
9570            if (kind1 < rkind) {
9571                /* widen substring */
9572                buf1 = _PyUnicode_AsKind(str1, rkind);
9573                if (!buf1) goto error;
9574                release1 = 1;
9575            }
9576            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
9577            if (i < 0)
9578                goto nothing;
9579            if (rkind > kind2) {
9580                /* widen replacement */
9581                buf2 = _PyUnicode_AsKind(str2, rkind);
9582                if (!buf2) goto error;
9583                release2 = 1;
9584            }
9585            else if (rkind < kind2) {
9586                /* widen self and buf1 */
9587                rkind = kind2;
9588                if (release1) PyMem_Free(buf1);
9589                sbuf = _PyUnicode_AsKind(self, rkind);
9590                if (!sbuf) goto error;
9591                srelease = 1;
9592                buf1 = _PyUnicode_AsKind(str1, rkind);
9593                if (!buf1) goto error;
9594                release1 = 1;
9595            }
9596            res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9597            if (!res) {
9598                PyErr_NoMemory();
9599                goto error;
9600            }
9601            memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
9602            /* change everything in-place, starting with this one */
9603            memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9604                   buf2,
9605                   PyUnicode_KIND_SIZE(rkind, len2));
9606            i += len1;
9607
9608            while ( --maxcount > 0) {
9609                i = anylib_find(rkind, self,
9610                                sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9611                                str1, buf1, len1, i);
9612                if (i == -1)
9613                    break;
9614                memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9615                       buf2,
9616                       PyUnicode_KIND_SIZE(rkind, len2));
9617                i += len1;
9618            }
9619
9620            u = PyUnicode_FromKindAndData(rkind, res, slen);
9621            PyMem_Free(res);
9622            if (!u) goto error;
9623        }
9624    } else {
9625
9626        Py_ssize_t n, i, j, ires;
9627        Py_ssize_t product, new_size;
9628        int rkind = skind;
9629        char *res;
9630
9631        if (kind1 < rkind) {
9632            buf1 = _PyUnicode_AsKind(str1, rkind);
9633            if (!buf1) goto error;
9634            release1 = 1;
9635        }
9636        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
9637        if (n == 0)
9638            goto nothing;
9639        if (kind2 < rkind) {
9640            buf2 = _PyUnicode_AsKind(str2, rkind);
9641            if (!buf2) goto error;
9642            release2 = 1;
9643        }
9644        else if (kind2 > rkind) {
9645            rkind = kind2;
9646            sbuf = _PyUnicode_AsKind(self, rkind);
9647            if (!sbuf) goto error;
9648            srelease = 1;
9649            if (release1) PyMem_Free(buf1);
9650            buf1 = _PyUnicode_AsKind(str1, rkind);
9651            if (!buf1) goto error;
9652            release1 = 1;
9653        }
9654        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9655           PyUnicode_GET_LENGTH(str1))); */
9656        product = n * (len2-len1);
9657        if ((product / (len2-len1)) != n) {
9658                PyErr_SetString(PyExc_OverflowError,
9659                                "replace string is too long");
9660                goto error;
9661        }
9662        new_size = slen + product;
9663        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9664            PyErr_SetString(PyExc_OverflowError,
9665                            "replace string is too long");
9666            goto error;
9667        }
9668        res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9669        if (!res)
9670            goto error;
9671        ires = i = 0;
9672        if (len1 > 0) {
9673            while (n-- > 0) {
9674                /* look for next match */
9675                j = anylib_find(rkind, self,
9676                                sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9677                                str1, buf1, len1, i);
9678                if (j == -1)
9679                    break;
9680                else if (j > i) {
9681                    /* copy unchanged part [i:j] */
9682                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9683                           sbuf + PyUnicode_KIND_SIZE(rkind, i),
9684                           PyUnicode_KIND_SIZE(rkind, j-i));
9685                    ires += j - i;
9686                }
9687                /* copy substitution string */
9688                if (len2 > 0) {
9689                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9690                           buf2,
9691                           PyUnicode_KIND_SIZE(rkind, len2));
9692                    ires += len2;
9693                }
9694                i = j + len1;
9695            }
9696            if (i < slen)
9697                /* copy tail [i:] */
9698                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9699                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9700                       PyUnicode_KIND_SIZE(rkind, slen-i));
9701        } else {
9702            /* interleave */
9703            while (n > 0) {
9704                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9705                       buf2,
9706                       PyUnicode_KIND_SIZE(rkind, len2));
9707                ires += len2;
9708                if (--n <= 0)
9709                    break;
9710                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9711                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9712                       PyUnicode_KIND_SIZE(rkind, 1));
9713                ires++;
9714                i++;
9715            }
9716            memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9717                   sbuf + PyUnicode_KIND_SIZE(rkind, i),
9718                   PyUnicode_KIND_SIZE(rkind, slen-i));
9719        }
9720        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(str2))
9721            u = unicode_fromascii((unsigned char*)res, new_size);
9722        else
9723            u = PyUnicode_FromKindAndData(rkind, res, new_size);
9724        PyMem_Free(res);
9725    }
9726    if (srelease)
9727        PyMem_FREE(sbuf);
9728    if (release1)
9729        PyMem_FREE(buf1);
9730    if (release2)
9731        PyMem_FREE(buf2);
9732    assert(_PyUnicode_CheckConsistency(u, 1));
9733    return u;
9734
9735  nothing:
9736    /* nothing to replace; return original string (when possible) */
9737    if (srelease)
9738        PyMem_FREE(sbuf);
9739    if (release1)
9740        PyMem_FREE(buf1);
9741    if (release2)
9742        PyMem_FREE(buf2);
9743    if (PyUnicode_CheckExact(self)) {
9744        Py_INCREF(self);
9745        return (PyObject *) self;
9746    }
9747    return PyUnicode_Copy(self);
9748  error:
9749    if (srelease && sbuf)
9750        PyMem_FREE(sbuf);
9751    if (release1 && buf1)
9752        PyMem_FREE(buf1);
9753    if (release2 && buf2)
9754        PyMem_FREE(buf2);
9755    return NULL;
9756}
9757
9758/* --- Unicode Object Methods --------------------------------------------- */
9759
9760PyDoc_STRVAR(title__doc__,
9761             "S.title() -> str\n\
9762\n\
9763Return a titlecased version of S, i.e. words start with title case\n\
9764characters, all remaining cased characters have lower case.");
9765
9766static PyObject*
9767unicode_title(PyObject *self)
9768{
9769    return fixup(self, fixtitle);
9770}
9771
9772PyDoc_STRVAR(capitalize__doc__,
9773             "S.capitalize() -> str\n\
9774\n\
9775Return a capitalized version of S, i.e. make the first character\n\
9776have upper case and the rest lower case.");
9777
9778static PyObject*
9779unicode_capitalize(PyObject *self)
9780{
9781    return fixup(self, fixcapitalize);
9782}
9783
#if 0
/* NOTE: everything up to the matching #endif is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, capitalize each word with fixcapitalize()
   and join the words again with the default blank separator.
   Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        /* item is NULL here, so the shared exit below returns NULL */
        if (item == NULL)
            goto onError;
        /* drop the old word before storing its replacement in the slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* shared exit: also reached on success */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
9821
9822/* Argument converter.  Coerces to a single unicode character */
9823
9824static int
9825convert_uc(PyObject *obj, void *addr)
9826{
9827    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
9828    PyObject *uniobj;
9829
9830    uniobj = PyUnicode_FromObject(obj);
9831    if (uniobj == NULL) {
9832        PyErr_SetString(PyExc_TypeError,
9833                        "The fill character cannot be converted to Unicode");
9834        return 0;
9835    }
9836    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
9837        PyErr_SetString(PyExc_TypeError,
9838                        "The fill character must be exactly one character long");
9839        Py_DECREF(uniobj);
9840        return 0;
9841    }
9842    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
9843    Py_DECREF(uniobj);
9844    return 1;
9845}
9846
9847PyDoc_STRVAR(center__doc__,
9848             "S.center(width[, fillchar]) -> str\n\
9849\n\
9850Return S centered in a string of length width. Padding is\n\
9851done using the specified fill character (default is a space)");
9852
9853static PyObject *
9854unicode_center(PyObject *self, PyObject *args)
9855{
9856    Py_ssize_t marg, left;
9857    Py_ssize_t width;
9858    Py_UCS4 fillchar = ' ';
9859
9860    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
9861        return NULL;
9862
9863    if (PyUnicode_READY(self) == -1)
9864        return NULL;
9865
9866    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
9867        Py_INCREF(self);
9868        return (PyObject*) self;
9869    }
9870
9871    marg = width - _PyUnicode_LENGTH(self);
9872    left = marg / 2 + (marg & width & 1);
9873
9874    return pad(self, left, marg - left, fillchar);
9875}
9876
#if 0

/* NOTE: this branch is compiled out; the live unicode_compare() is in the
   #else branch below. */

/* This code should go into some future Unicode collation support
   module. The basic comparison should compare ordinals on a naive
   basis (this is what Java does and thus Jython too). */

/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* Offset added to a code unit, indexed by its top five bits (c >> 11). */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare two strings code unit by code unit after applying utf16Fixup.
   Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->_base._base.length;
    len2 = str2->_base._base.length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix is equal: the string with characters left is greater */
    return (len1 < len2) ? -1 : (len1 != len2);
}

#else
9928
9929/* This function assumes that str1 and str2 are readied by the caller. */
9930
9931static int
9932unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9933{
9934    int kind1, kind2;
9935    void *data1, *data2;
9936    Py_ssize_t len1, len2, i;
9937
9938    kind1 = PyUnicode_KIND(str1);
9939    kind2 = PyUnicode_KIND(str2);
9940    data1 = PyUnicode_DATA(str1);
9941    data2 = PyUnicode_DATA(str2);
9942    len1 = PyUnicode_GET_LENGTH(str1);
9943    len2 = PyUnicode_GET_LENGTH(str2);
9944
9945    for (i = 0; i < len1 && i < len2; ++i) {
9946        Py_UCS4 c1, c2;
9947        c1 = PyUnicode_READ(kind1, data1, i);
9948        c2 = PyUnicode_READ(kind2, data2, i);
9949
9950        if (c1 != c2)
9951            return (c1 < c2) ? -1 : 1;
9952    }
9953
9954    return (len1 < len2) ? -1 : (len1 != len2);
9955}
9956
9957#endif
9958
9959int
9960PyUnicode_Compare(PyObject *left, PyObject *right)
9961{
9962    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9963        if (PyUnicode_READY(left) == -1 ||
9964            PyUnicode_READY(right) == -1)
9965            return -1;
9966        return unicode_compare((PyUnicodeObject *)left,
9967                               (PyUnicodeObject *)right);
9968    }
9969    PyErr_Format(PyExc_TypeError,
9970                 "Can't compare %.100s and %.100s",
9971                 left->ob_type->tp_name,
9972                 right->ob_type->tp_name);
9973    return -1;
9974}
9975
9976int
9977PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9978{
9979    Py_ssize_t i;
9980    int kind;
9981    void *data;
9982    Py_UCS4 chr;
9983
9984    assert(_PyUnicode_CHECK(uni));
9985    if (PyUnicode_READY(uni) == -1)
9986        return -1;
9987    kind = PyUnicode_KIND(uni);
9988    data = PyUnicode_DATA(uni);
9989    /* Compare Unicode string and source character set string */
9990    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9991        if (chr != str[i])
9992            return (chr < (unsigned char)(str[i])) ? -1 : 1;
9993    /* This check keeps Python strings that end in '\0' from comparing equal
9994     to C strings identical up to that point. */
9995    if (PyUnicode_GET_LENGTH(uni) != i || chr)
9996        return 1; /* uni is longer */
9997    if (str[i])
9998        return -1; /* str is longer */
9999    return 0;
10000}
10001
10002
10003#define TEST_COND(cond)                         \
10004    ((cond) ? Py_True : Py_False)
10005
10006PyObject *
10007PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10008{
10009    int result;
10010
10011    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10012        PyObject *v;
10013        if (PyUnicode_READY(left) == -1 ||
10014            PyUnicode_READY(right) == -1)
10015            return NULL;
10016        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10017            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
10018            if (op == Py_EQ) {
10019                Py_INCREF(Py_False);
10020                return Py_False;
10021            }
10022            if (op == Py_NE) {
10023                Py_INCREF(Py_True);
10024                return Py_True;
10025            }
10026        }
10027        if (left == right)
10028            result = 0;
10029        else
10030            result = unicode_compare((PyUnicodeObject *)left,
10031                                     (PyUnicodeObject *)right);
10032
10033        /* Convert the return value to a Boolean */
10034        switch (op) {
10035        case Py_EQ:
10036            v = TEST_COND(result == 0);
10037            break;
10038        case Py_NE:
10039            v = TEST_COND(result != 0);
10040            break;
10041        case Py_LE:
10042            v = TEST_COND(result <= 0);
10043            break;
10044        case Py_GE:
10045            v = TEST_COND(result >= 0);
10046            break;
10047        case Py_LT:
10048            v = TEST_COND(result == -1);
10049            break;
10050        case Py_GT:
10051            v = TEST_COND(result == 1);
10052            break;
10053        default:
10054            PyErr_BadArgument();
10055            return NULL;
10056        }
10057        Py_INCREF(v);
10058        return v;
10059    }
10060
10061    Py_RETURN_NOTIMPLEMENTED;
10062}
10063
10064int
10065PyUnicode_Contains(PyObject *container, PyObject *element)
10066{
10067    PyObject *str, *sub;
10068    int kind1, kind2, kind;
10069    void *buf1, *buf2;
10070    Py_ssize_t len1, len2;
10071    int result;
10072
10073    /* Coerce the two arguments */
10074    sub = PyUnicode_FromObject(element);
10075    if (!sub) {
10076        PyErr_Format(PyExc_TypeError,
10077                     "'in <string>' requires string as left operand, not %s",
10078                     element->ob_type->tp_name);
10079        return -1;
10080    }
10081    if (PyUnicode_READY(sub) == -1)
10082        return -1;
10083
10084    str = PyUnicode_FromObject(container);
10085    if (!str || PyUnicode_READY(str) == -1) {
10086        Py_DECREF(sub);
10087        return -1;
10088    }
10089
10090    kind1 = PyUnicode_KIND(str);
10091    kind2 = PyUnicode_KIND(sub);
10092    kind = kind1 > kind2 ? kind1 : kind2;
10093    buf1 = PyUnicode_DATA(str);
10094    buf2 = PyUnicode_DATA(sub);
10095    if (kind1 != kind)
10096        buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10097    if (!buf1) {
10098        Py_DECREF(sub);
10099        return -1;
10100    }
10101    if (kind2 != kind)
10102        buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10103    if (!buf2) {
10104        Py_DECREF(sub);
10105        if (kind1 != kind) PyMem_Free(buf1);
10106        return -1;
10107    }
10108    len1 = PyUnicode_GET_LENGTH(str);
10109    len2 = PyUnicode_GET_LENGTH(sub);
10110
10111    switch(kind) {
10112    case PyUnicode_1BYTE_KIND:
10113        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10114        break;
10115    case PyUnicode_2BYTE_KIND:
10116        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10117        break;
10118    case PyUnicode_4BYTE_KIND:
10119        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10120        break;
10121    default:
10122        result = -1;
10123        assert(0);
10124    }
10125
10126    Py_DECREF(str);
10127    Py_DECREF(sub);
10128
10129    if (kind1 != kind)
10130        PyMem_Free(buf1);
10131    if (kind2 != kind)
10132        PyMem_Free(buf2);
10133
10134    return result;
10135}
10136
10137/* Concat to string or Unicode object giving a new Unicode object. */
10138
10139PyObject *
10140PyUnicode_Concat(PyObject *left, PyObject *right)
10141{
10142    PyObject *u = NULL, *v = NULL, *w;
10143    Py_UCS4 maxchar;
10144
10145    /* Coerce the two arguments */
10146    u = PyUnicode_FromObject(left);
10147    if (u == NULL)
10148        goto onError;
10149    v = PyUnicode_FromObject(right);
10150    if (v == NULL)
10151        goto onError;
10152
10153    /* Shortcuts */
10154    if (v == unicode_empty) {
10155        Py_DECREF(v);
10156        return u;
10157    }
10158    if (u == unicode_empty) {
10159        Py_DECREF(u);
10160        return v;
10161    }
10162
10163    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10164    maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
10165
10166    /* Concat the two Unicode strings */
10167    w = PyUnicode_New(
10168        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10169        maxchar);
10170    if (w == NULL)
10171        goto onError;
10172    copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10173    copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
10174    Py_DECREF(u);
10175    Py_DECREF(v);
10176    assert(_PyUnicode_CheckConsistency(w, 1));
10177    return w;
10178
10179  onError:
10180    Py_XDECREF(u);
10181    Py_XDECREF(v);
10182    return NULL;
10183}
10184
10185static void
10186unicode_append_inplace(PyObject **p_left, PyObject *right)
10187{
10188    Py_ssize_t left_len, right_len, new_len;
10189
10190    assert(PyUnicode_IS_READY(*p_left));
10191    assert(PyUnicode_IS_READY(right));
10192
10193    left_len = PyUnicode_GET_LENGTH(*p_left);
10194    right_len = PyUnicode_GET_LENGTH(right);
10195    if (left_len > PY_SSIZE_T_MAX - right_len) {
10196        PyErr_SetString(PyExc_OverflowError,
10197                        "strings are too large to concat");
10198        goto error;
10199    }
10200    new_len = left_len + right_len;
10201
10202    /* Now we own the last reference to 'left', so we can resize it
10203     * in-place.
10204     */
10205    if (unicode_resize(p_left, new_len) != 0) {
10206        /* XXX if _PyUnicode_Resize() fails, 'left' has been
10207         * deallocated so it cannot be put back into
10208         * 'variable'.  The MemoryError is raised when there
10209         * is no value in 'variable', which might (very
10210         * remotely) be a cause of incompatibilities.
10211         */
10212        goto error;
10213    }
10214    /* copy 'right' into the newly allocated area of 'left' */
10215    copy_characters(*p_left, left_len, right, 0, right_len);
10216    _PyUnicode_DIRTY(*p_left);
10217    return;
10218
10219error:
10220    Py_DECREF(*p_left);
10221    *p_left = NULL;
10222}
10223
10224void
10225PyUnicode_Append(PyObject **p_left, PyObject *right)
10226{
10227    PyObject *left, *res;
10228
10229    if (p_left == NULL) {
10230        if (!PyErr_Occurred())
10231            PyErr_BadInternalCall();
10232        return;
10233    }
10234    left = *p_left;
10235    if (right == NULL || !PyUnicode_Check(left)) {
10236        if (!PyErr_Occurred())
10237            PyErr_BadInternalCall();
10238        goto error;
10239    }
10240
10241    if (PyUnicode_READY(left))
10242        goto error;
10243    if (PyUnicode_READY(right))
10244        goto error;
10245
10246    if (PyUnicode_CheckExact(left) && left != unicode_empty
10247        && PyUnicode_CheckExact(right) && right != unicode_empty
10248        && unicode_resizable(left)
10249        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10250            || _PyUnicode_WSTR(left) != NULL))
10251    {
10252        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10253           to change the structure size, but characters are stored just after
10254           the structure, and so it requires to move all characters which is
10255           not so different than duplicating the string. */
10256        if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10257        {
10258            unicode_append_inplace(p_left, right);
10259            if (p_left != NULL)
10260                assert(_PyUnicode_CheckConsistency(*p_left, 1));
10261            return;
10262        }
10263    }
10264
10265    res = PyUnicode_Concat(left, right);
10266    if (res == NULL)
10267        goto error;
10268    Py_DECREF(left);
10269    *p_left = res;
10270    return;
10271
10272error:
10273    Py_DECREF(*p_left);
10274    *p_left = NULL;
10275}
10276
/* Same as PyUnicode_Append(pleft, right), but additionally releases one
   reference to 'right' afterwards.  Py_XDECREF is used, so a NULL 'right'
   is tolerated by the release step. */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);
    Py_XDECREF(right);
}
10283
10284PyDoc_STRVAR(count__doc__,
10285             "S.count(sub[, start[, end]]) -> int\n\
10286\n\
10287Return the number of non-overlapping occurrences of substring sub in\n\
10288string S[start:end].  Optional arguments start and end are\n\
10289interpreted as in slice notation.");
10290
10291static PyObject *
10292unicode_count(PyUnicodeObject *self, PyObject *args)
10293{
10294    PyUnicodeObject *substring;
10295    Py_ssize_t start = 0;
10296    Py_ssize_t end = PY_SSIZE_T_MAX;
10297    PyObject *result;
10298    int kind1, kind2, kind;
10299    void *buf1, *buf2;
10300    Py_ssize_t len1, len2, iresult;
10301
10302    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10303                                            &start, &end))
10304        return NULL;
10305
10306    kind1 = PyUnicode_KIND(self);
10307    kind2 = PyUnicode_KIND(substring);
10308    kind = kind1 > kind2 ? kind1 : kind2;
10309    buf1 = PyUnicode_DATA(self);
10310    buf2 = PyUnicode_DATA(substring);
10311    if (kind1 != kind)
10312        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10313    if (!buf1) {
10314        Py_DECREF(substring);
10315        return NULL;
10316    }
10317    if (kind2 != kind)
10318        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10319    if (!buf2) {
10320        Py_DECREF(substring);
10321        if (kind1 != kind) PyMem_Free(buf1);
10322        return NULL;
10323    }
10324    len1 = PyUnicode_GET_LENGTH(self);
10325    len2 = PyUnicode_GET_LENGTH(substring);
10326
10327    ADJUST_INDICES(start, end, len1);
10328    switch(kind) {
10329    case PyUnicode_1BYTE_KIND:
10330        iresult = ucs1lib_count(
10331            ((Py_UCS1*)buf1) + start, end - start,
10332            buf2, len2, PY_SSIZE_T_MAX
10333            );
10334        break;
10335    case PyUnicode_2BYTE_KIND:
10336        iresult = ucs2lib_count(
10337            ((Py_UCS2*)buf1) + start, end - start,
10338            buf2, len2, PY_SSIZE_T_MAX
10339            );
10340        break;
10341    case PyUnicode_4BYTE_KIND:
10342        iresult = ucs4lib_count(
10343            ((Py_UCS4*)buf1) + start, end - start,
10344            buf2, len2, PY_SSIZE_T_MAX
10345            );
10346        break;
10347    default:
10348        assert(0); iresult = 0;
10349    }
10350
10351    result = PyLong_FromSsize_t(iresult);
10352
10353    if (kind1 != kind)
10354        PyMem_Free(buf1);
10355    if (kind2 != kind)
10356        PyMem_Free(buf2);
10357
10358    Py_DECREF(substring);
10359
10360    return result;
10361}
10362
10363PyDoc_STRVAR(encode__doc__,
10364             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10365\n\
10366Encode S using the codec registered for encoding. Default encoding\n\
10367is 'utf-8'. errors may be given to set a different error\n\
10368handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10369a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10370'xmlcharrefreplace' as well as any other name registered with\n\
10371codecs.register_error that can handle UnicodeEncodeErrors.");
10372
10373static PyObject *
10374unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
10375{
10376    static char *kwlist[] = {"encoding", "errors", 0};
10377    char *encoding = NULL;
10378    char *errors = NULL;
10379
10380    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10381                                     kwlist, &encoding, &errors))
10382        return NULL;
10383    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
10384}
10385
10386PyDoc_STRVAR(expandtabs__doc__,
10387             "S.expandtabs([tabsize]) -> str\n\
10388\n\
10389Return a copy of S where all tab characters are expanded using spaces.\n\
10390If tabsize is not given, a tab size of 8 characters is assumed.");
10391
10392static PyObject*
10393unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10394{
10395    Py_ssize_t i, j, line_pos, src_len, incr;
10396    Py_UCS4 ch;
10397    PyObject *u;
10398    void *src_data, *dest_data;
10399    int tabsize = 8;
10400    int kind;
10401    int found;
10402
10403    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10404        return NULL;
10405
10406    if (PyUnicode_READY(self) == -1)
10407        return NULL;
10408
10409    /* First pass: determine size of output string */
10410    src_len = PyUnicode_GET_LENGTH(self);
10411    i = j = line_pos = 0;
10412    kind = PyUnicode_KIND(self);
10413    src_data = PyUnicode_DATA(self);
10414    found = 0;
10415    for (; i < src_len; i++) {
10416        ch = PyUnicode_READ(kind, src_data, i);
10417        if (ch == '\t') {
10418            found = 1;
10419            if (tabsize > 0) {
10420                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10421                if (j > PY_SSIZE_T_MAX - incr)
10422                    goto overflow;
10423                line_pos += incr;
10424                j += incr;
10425            }
10426        }
10427        else {
10428            if (j > PY_SSIZE_T_MAX - 1)
10429                goto overflow;
10430            line_pos++;
10431            j++;
10432            if (ch == '\n' || ch == '\r')
10433                line_pos = 0;
10434        }
10435    }
10436    if (!found && PyUnicode_CheckExact(self)) {
10437        Py_INCREF((PyObject *) self);
10438        return (PyObject *) self;
10439    }
10440
10441    /* Second pass: create output string and fill it */
10442    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10443    if (!u)
10444        return NULL;
10445    dest_data = PyUnicode_DATA(u);
10446
10447    i = j = line_pos = 0;
10448
10449    for (; i < src_len; i++) {
10450        ch = PyUnicode_READ(kind, src_data, i);
10451        if (ch == '\t') {
10452            if (tabsize > 0) {
10453                incr = tabsize - (line_pos % tabsize);
10454                line_pos += incr;
10455                while (incr--) {
10456                    PyUnicode_WRITE(kind, dest_data, j, ' ');
10457                    j++;
10458                }
10459            }
10460        }
10461        else {
10462            line_pos++;
10463            PyUnicode_WRITE(kind, dest_data, j, ch);
10464            j++;
10465            if (ch == '\n' || ch == '\r')
10466                line_pos = 0;
10467        }
10468    }
10469    assert (j == PyUnicode_GET_LENGTH(u));
10470#ifndef DONT_MAKE_RESULT_READY
10471    if (_PyUnicode_READY_REPLACE(&u)) {
10472        Py_DECREF(u);
10473        return NULL;
10474    }
10475#endif
10476    assert(_PyUnicode_CheckConsistency(u, 1));
10477    return (PyObject*) u;
10478
10479  overflow:
10480    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10481    return NULL;
10482}
10483
10484PyDoc_STRVAR(find__doc__,
10485             "S.find(sub[, start[, end]]) -> int\n\
10486\n\
10487Return the lowest index in S where substring sub is found,\n\
10488such that sub is contained within S[start:end].  Optional\n\
10489arguments start and end are interpreted as in slice notation.\n\
10490\n\
10491Return -1 on failure.");
10492
10493static PyObject *
10494unicode_find(PyObject *self, PyObject *args)
10495{
10496    PyUnicodeObject *substring;
10497    Py_ssize_t start;
10498    Py_ssize_t end;
10499    Py_ssize_t result;
10500
10501    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10502                                            &start, &end))
10503        return NULL;
10504
10505    if (PyUnicode_READY(self) == -1)
10506        return NULL;
10507    if (PyUnicode_READY(substring) == -1)
10508        return NULL;
10509
10510    result = any_find_slice(
10511        asciilib_find_slice, ucs1lib_find_slice,
10512        ucs2lib_find_slice, ucs4lib_find_slice,
10513        self, (PyObject*)substring, start, end
10514        );
10515
10516    Py_DECREF(substring);
10517
10518    if (result == -2)
10519        return NULL;
10520
10521    return PyLong_FromSsize_t(result);
10522}
10523
10524static PyObject *
10525unicode_getitem(PyObject *self, Py_ssize_t index)
10526{
10527    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10528    if (ch == (Py_UCS4)-1)
10529        return NULL;
10530    return PyUnicode_FromOrdinal(ch);
10531}
10532
/* Believe it or not, this produces the same value for ASCII strings
   as bytes_hash().

   Compute the hash of a str object and store it in the object.  A stored
   value other than -1 is returned directly without recomputation.
   Returns -1 if readying the string fails; a computed value of -1 is
   replaced by -2 before it is stored and returned. */
static Py_hash_t
unicode_hash(PyUnicodeObject *self)
{
    Py_ssize_t len;
    Py_uhash_t x;

    /* Fast path: a hash was already stored on the object. */
    if (_PyUnicode_HASH(self) != -1)
        return _PyUnicode_HASH(self);
    if (PyUnicode_READY(self) == -1)
        return -1;
    len = PyUnicode_GET_LENGTH(self);

    /* The hash function as a macro, gets expanded three times below. */
    /* Note: the macro consumes 'len' and advances the pointer P; the seed
       is the first code unit shifted left by 7, and each step multiplies
       by 1000003 and XORs in the next code unit. */
#define HASH(P) \
    x = (Py_uhash_t)*P << 7; \
    while (--len >= 0) \
        x = (1000003*x) ^ (Py_uhash_t)*P++;

    /* One expansion per storage kind so that the pointer has the right
       element width. */
    switch (PyUnicode_KIND(self)) {
    case PyUnicode_1BYTE_KIND: {
        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
        HASH(c);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
        HASH(s);
        break;
    }
    default: {
        Py_UCS4 *l;
        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
               "Impossible switch case in unicode_hash");
        l = PyUnicode_4BYTE_DATA(self);
        HASH(l);
        break;
    }
    }
    /* 'len' was counted down by HASH, so the length is fetched again. */
    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);

    /* -1 marks "no hash stored" (see the check at the top), so it must
       not be used as a real hash value. */
    if (x == -1)
        x = -2;
    _PyUnicode_HASH(self) = x;
    return x;
}
#undef HASH
10581
10582PyDoc_STRVAR(index__doc__,
10583             "S.index(sub[, start[, end]]) -> int\n\
10584\n\
10585Like S.find() but raise ValueError when the substring is not found.");
10586
10587static PyObject *
10588unicode_index(PyObject *self, PyObject *args)
10589{
10590    Py_ssize_t result;
10591    PyUnicodeObject *substring;
10592    Py_ssize_t start;
10593    Py_ssize_t end;
10594
10595    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10596                                            &start, &end))
10597        return NULL;
10598
10599    if (PyUnicode_READY(self) == -1)
10600        return NULL;
10601    if (PyUnicode_READY(substring) == -1)
10602        return NULL;
10603
10604    result = any_find_slice(
10605        asciilib_find_slice, ucs1lib_find_slice,
10606        ucs2lib_find_slice, ucs4lib_find_slice,
10607        self, (PyObject*)substring, start, end
10608        );
10609
10610    Py_DECREF(substring);
10611
10612    if (result == -2)
10613        return NULL;
10614
10615    if (result < 0) {
10616        PyErr_SetString(PyExc_ValueError, "substring not found");
10617        return NULL;
10618    }
10619
10620    return PyLong_FromSsize_t(result);
10621}
10622
10623PyDoc_STRVAR(islower__doc__,
10624             "S.islower() -> bool\n\
10625\n\
10626Return True if all cased characters in S are lowercase and there is\n\
10627at least one cased character in S, False otherwise.");
10628
10629static PyObject*
10630unicode_islower(PyUnicodeObject *self)
10631{
10632    Py_ssize_t i, length;
10633    int kind;
10634    void *data;
10635    int cased;
10636
10637    if (PyUnicode_READY(self) == -1)
10638        return NULL;
10639    length = PyUnicode_GET_LENGTH(self);
10640    kind = PyUnicode_KIND(self);
10641    data = PyUnicode_DATA(self);
10642
10643    /* Shortcut for single character strings */
10644    if (length == 1)
10645        return PyBool_FromLong(
10646            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
10647
10648    /* Special case for empty strings */
10649    if (length == 0)
10650        return PyBool_FromLong(0);
10651
10652    cased = 0;
10653    for (i = 0; i < length; i++) {
10654        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10655
10656        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10657            return PyBool_FromLong(0);
10658        else if (!cased && Py_UNICODE_ISLOWER(ch))
10659            cased = 1;
10660    }
10661    return PyBool_FromLong(cased);
10662}
10663
10664PyDoc_STRVAR(isupper__doc__,
10665             "S.isupper() -> bool\n\
10666\n\
10667Return True if all cased characters in S are uppercase and there is\n\
10668at least one cased character in S, False otherwise.");
10669
10670static PyObject*
10671unicode_isupper(PyUnicodeObject *self)
10672{
10673    Py_ssize_t i, length;
10674    int kind;
10675    void *data;
10676    int cased;
10677
10678    if (PyUnicode_READY(self) == -1)
10679        return NULL;
10680    length = PyUnicode_GET_LENGTH(self);
10681    kind = PyUnicode_KIND(self);
10682    data = PyUnicode_DATA(self);
10683
10684    /* Shortcut for single character strings */
10685    if (length == 1)
10686        return PyBool_FromLong(
10687            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
10688
10689    /* Special case for empty strings */
10690    if (length == 0)
10691        return PyBool_FromLong(0);
10692
10693    cased = 0;
10694    for (i = 0; i < length; i++) {
10695        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10696
10697        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10698            return PyBool_FromLong(0);
10699        else if (!cased && Py_UNICODE_ISUPPER(ch))
10700            cased = 1;
10701    }
10702    return PyBool_FromLong(cased);
10703}
10704
10705PyDoc_STRVAR(istitle__doc__,
10706             "S.istitle() -> bool\n\
10707\n\
10708Return True if S is a titlecased string and there is at least one\n\
10709character in S, i.e. upper- and titlecase characters may only\n\
10710follow uncased characters and lowercase characters only cased ones.\n\
10711Return False otherwise.");
10712
10713static PyObject*
10714unicode_istitle(PyUnicodeObject *self)
10715{
10716    Py_ssize_t i, length;
10717    int kind;
10718    void *data;
10719    int cased, previous_is_cased;
10720
10721    if (PyUnicode_READY(self) == -1)
10722        return NULL;
10723    length = PyUnicode_GET_LENGTH(self);
10724    kind = PyUnicode_KIND(self);
10725    data = PyUnicode_DATA(self);
10726
10727    /* Shortcut for single character strings */
10728    if (length == 1) {
10729        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10730        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10731                               (Py_UNICODE_ISUPPER(ch) != 0));
10732    }
10733
10734    /* Special case for empty strings */
10735    if (length == 0)
10736        return PyBool_FromLong(0);
10737
10738    cased = 0;
10739    previous_is_cased = 0;
10740    for (i = 0; i < length; i++) {
10741        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10742
10743        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10744            if (previous_is_cased)
10745                return PyBool_FromLong(0);
10746            previous_is_cased = 1;
10747            cased = 1;
10748        }
10749        else if (Py_UNICODE_ISLOWER(ch)) {
10750            if (!previous_is_cased)
10751                return PyBool_FromLong(0);
10752            previous_is_cased = 1;
10753            cased = 1;
10754        }
10755        else
10756            previous_is_cased = 0;
10757    }
10758    return PyBool_FromLong(cased);
10759}
10760
10761PyDoc_STRVAR(isspace__doc__,
10762             "S.isspace() -> bool\n\
10763\n\
10764Return True if all characters in S are whitespace\n\
10765and there is at least one character in S, False otherwise.");
10766
10767static PyObject*
10768unicode_isspace(PyUnicodeObject *self)
10769{
10770    Py_ssize_t i, length;
10771    int kind;
10772    void *data;
10773
10774    if (PyUnicode_READY(self) == -1)
10775        return NULL;
10776    length = PyUnicode_GET_LENGTH(self);
10777    kind = PyUnicode_KIND(self);
10778    data = PyUnicode_DATA(self);
10779
10780    /* Shortcut for single character strings */
10781    if (length == 1)
10782        return PyBool_FromLong(
10783            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
10784
10785    /* Special case for empty strings */
10786    if (length == 0)
10787        return PyBool_FromLong(0);
10788
10789    for (i = 0; i < length; i++) {
10790        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10791        if (!Py_UNICODE_ISSPACE(ch))
10792            return PyBool_FromLong(0);
10793    }
10794    return PyBool_FromLong(1);
10795}
10796
10797PyDoc_STRVAR(isalpha__doc__,
10798             "S.isalpha() -> bool\n\
10799\n\
10800Return True if all characters in S are alphabetic\n\
10801and there is at least one character in S, False otherwise.");
10802
10803static PyObject*
10804unicode_isalpha(PyUnicodeObject *self)
10805{
10806    Py_ssize_t i, length;
10807    int kind;
10808    void *data;
10809
10810    if (PyUnicode_READY(self) == -1)
10811        return NULL;
10812    length = PyUnicode_GET_LENGTH(self);
10813    kind = PyUnicode_KIND(self);
10814    data = PyUnicode_DATA(self);
10815
10816    /* Shortcut for single character strings */
10817    if (length == 1)
10818        return PyBool_FromLong(
10819            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
10820
10821    /* Special case for empty strings */
10822    if (length == 0)
10823        return PyBool_FromLong(0);
10824
10825    for (i = 0; i < length; i++) {
10826        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
10827            return PyBool_FromLong(0);
10828    }
10829    return PyBool_FromLong(1);
10830}
10831
10832PyDoc_STRVAR(isalnum__doc__,
10833             "S.isalnum() -> bool\n\
10834\n\
10835Return True if all characters in S are alphanumeric\n\
10836and there is at least one character in S, False otherwise.");
10837
10838static PyObject*
10839unicode_isalnum(PyUnicodeObject *self)
10840{
10841    int kind;
10842    void *data;
10843    Py_ssize_t len, i;
10844
10845    if (PyUnicode_READY(self) == -1)
10846        return NULL;
10847
10848    kind = PyUnicode_KIND(self);
10849    data = PyUnicode_DATA(self);
10850    len = PyUnicode_GET_LENGTH(self);
10851
10852    /* Shortcut for single character strings */
10853    if (len == 1) {
10854        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10855        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10856    }
10857
10858    /* Special case for empty strings */
10859    if (len == 0)
10860        return PyBool_FromLong(0);
10861
10862    for (i = 0; i < len; i++) {
10863        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10864        if (!Py_UNICODE_ISALNUM(ch))
10865            return PyBool_FromLong(0);
10866    }
10867    return PyBool_FromLong(1);
10868}
10869
10870PyDoc_STRVAR(isdecimal__doc__,
10871             "S.isdecimal() -> bool\n\
10872\n\
10873Return True if there are only decimal characters in S,\n\
10874False otherwise.");
10875
10876static PyObject*
10877unicode_isdecimal(PyUnicodeObject *self)
10878{
10879    Py_ssize_t i, length;
10880    int kind;
10881    void *data;
10882
10883    if (PyUnicode_READY(self) == -1)
10884        return NULL;
10885    length = PyUnicode_GET_LENGTH(self);
10886    kind = PyUnicode_KIND(self);
10887    data = PyUnicode_DATA(self);
10888
10889    /* Shortcut for single character strings */
10890    if (length == 1)
10891        return PyBool_FromLong(
10892            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
10893
10894    /* Special case for empty strings */
10895    if (length == 0)
10896        return PyBool_FromLong(0);
10897
10898    for (i = 0; i < length; i++) {
10899        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
10900            return PyBool_FromLong(0);
10901    }
10902    return PyBool_FromLong(1);
10903}
10904
10905PyDoc_STRVAR(isdigit__doc__,
10906             "S.isdigit() -> bool\n\
10907\n\
10908Return True if all characters in S are digits\n\
10909and there is at least one character in S, False otherwise.");
10910
10911static PyObject*
10912unicode_isdigit(PyUnicodeObject *self)
10913{
10914    Py_ssize_t i, length;
10915    int kind;
10916    void *data;
10917
10918    if (PyUnicode_READY(self) == -1)
10919        return NULL;
10920    length = PyUnicode_GET_LENGTH(self);
10921    kind = PyUnicode_KIND(self);
10922    data = PyUnicode_DATA(self);
10923
10924    /* Shortcut for single character strings */
10925    if (length == 1) {
10926        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10927        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10928    }
10929
10930    /* Special case for empty strings */
10931    if (length == 0)
10932        return PyBool_FromLong(0);
10933
10934    for (i = 0; i < length; i++) {
10935        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
10936            return PyBool_FromLong(0);
10937    }
10938    return PyBool_FromLong(1);
10939}
10940
10941PyDoc_STRVAR(isnumeric__doc__,
10942             "S.isnumeric() -> bool\n\
10943\n\
10944Return True if there are only numeric characters in S,\n\
10945False otherwise.");
10946
10947static PyObject*
10948unicode_isnumeric(PyUnicodeObject *self)
10949{
10950    Py_ssize_t i, length;
10951    int kind;
10952    void *data;
10953
10954    if (PyUnicode_READY(self) == -1)
10955        return NULL;
10956    length = PyUnicode_GET_LENGTH(self);
10957    kind = PyUnicode_KIND(self);
10958    data = PyUnicode_DATA(self);
10959
10960    /* Shortcut for single character strings */
10961    if (length == 1)
10962        return PyBool_FromLong(
10963            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
10964
10965    /* Special case for empty strings */
10966    if (length == 0)
10967        return PyBool_FromLong(0);
10968
10969    for (i = 0; i < length; i++) {
10970        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
10971            return PyBool_FromLong(0);
10972    }
10973    return PyBool_FromLong(1);
10974}
10975
10976int
10977PyUnicode_IsIdentifier(PyObject *self)
10978{
10979    int kind;
10980    void *data;
10981    Py_ssize_t i;
10982    Py_UCS4 first;
10983
10984    if (PyUnicode_READY(self) == -1) {
10985        Py_FatalError("identifier not ready");
10986        return 0;
10987    }
10988
10989    /* Special case for empty strings */
10990    if (PyUnicode_GET_LENGTH(self) == 0)
10991        return 0;
10992    kind = PyUnicode_KIND(self);
10993    data = PyUnicode_DATA(self);
10994
10995    /* PEP 3131 says that the first character must be in
10996       XID_Start and subsequent characters in XID_Continue,
10997       and for the ASCII range, the 2.x rules apply (i.e
10998       start with letters and underscore, continue with
10999       letters, digits, underscore). However, given the current
11000       definition of XID_Start and XID_Continue, it is sufficient
11001       to check just for these, except that _ must be allowed
11002       as starting an identifier.  */
11003    first = PyUnicode_READ(kind, data, 0);
11004    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11005        return 0;
11006
11007    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11008        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11009            return 0;
11010    return 1;
11011}
11012
11013PyDoc_STRVAR(isidentifier__doc__,
11014             "S.isidentifier() -> bool\n\
11015\n\
11016Return True if S is a valid identifier according\n\
11017to the language definition.");
11018
11019static PyObject*
11020unicode_isidentifier(PyObject *self)
11021{
11022    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11023}
11024
11025PyDoc_STRVAR(isprintable__doc__,
11026             "S.isprintable() -> bool\n\
11027\n\
11028Return True if all characters in S are considered\n\
11029printable in repr() or S is empty, False otherwise.");
11030
11031static PyObject*
11032unicode_isprintable(PyObject *self)
11033{
11034    Py_ssize_t i, length;
11035    int kind;
11036    void *data;
11037
11038    if (PyUnicode_READY(self) == -1)
11039        return NULL;
11040    length = PyUnicode_GET_LENGTH(self);
11041    kind = PyUnicode_KIND(self);
11042    data = PyUnicode_DATA(self);
11043
11044    /* Shortcut for single character strings */
11045    if (length == 1)
11046        return PyBool_FromLong(
11047            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11048
11049    for (i = 0; i < length; i++) {
11050        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11051            Py_RETURN_FALSE;
11052        }
11053    }
11054    Py_RETURN_TRUE;
11055}
11056
11057PyDoc_STRVAR(join__doc__,
11058             "S.join(iterable) -> str\n\
11059\n\
11060Return a string which is the concatenation of the strings in the\n\
11061iterable.  The separator between elements is S.");
11062
11063static PyObject*
11064unicode_join(PyObject *self, PyObject *data)
11065{
11066    return PyUnicode_Join(self, data);
11067}
11068
11069static Py_ssize_t
11070unicode_length(PyUnicodeObject *self)
11071{
11072    if (PyUnicode_READY(self) == -1)
11073        return -1;
11074    return PyUnicode_GET_LENGTH(self);
11075}
11076
11077PyDoc_STRVAR(ljust__doc__,
11078             "S.ljust(width[, fillchar]) -> str\n\
11079\n\
11080Return S left-justified in a Unicode string of length width. Padding is\n\
11081done using the specified fill character (default is a space).");
11082
11083static PyObject *
11084unicode_ljust(PyObject *self, PyObject *args)
11085{
11086    Py_ssize_t width;
11087    Py_UCS4 fillchar = ' ';
11088
11089    if (PyUnicode_READY(self) == -1)
11090        return NULL;
11091
11092    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11093        return NULL;
11094
11095    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11096        Py_INCREF(self);
11097        return (PyObject*) self;
11098    }
11099
11100    return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
11101}
11102
11103PyDoc_STRVAR(lower__doc__,
11104             "S.lower() -> str\n\
11105\n\
11106Return a copy of the string S converted to lowercase.");
11107
11108static PyObject*
11109unicode_lower(PyObject *self)
11110{
11111    return fixup(self, fixlower);
11112}
11113
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11122
11123/* externally visible for str.strip(unicode) */
11124PyObject *
11125_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11126{
11127    void *data;
11128    int kind;
11129    Py_ssize_t i, j, len;
11130    BLOOM_MASK sepmask;
11131
11132    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11133        return NULL;
11134
11135    kind = PyUnicode_KIND(self);
11136    data = PyUnicode_DATA(self);
11137    len = PyUnicode_GET_LENGTH(self);
11138    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11139                              PyUnicode_DATA(sepobj),
11140                              PyUnicode_GET_LENGTH(sepobj));
11141
11142    i = 0;
11143    if (striptype != RIGHTSTRIP) {
11144        while (i < len &&
11145               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11146            i++;
11147        }
11148    }
11149
11150    j = len;
11151    if (striptype != LEFTSTRIP) {
11152        do {
11153            j--;
11154        } while (j >= i &&
11155                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11156        j++;
11157    }
11158
11159    return PyUnicode_Substring((PyObject*)self, i, j);
11160}
11161
11162PyObject*
11163PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11164{
11165    unsigned char *data;
11166    int kind;
11167    Py_ssize_t length;
11168
11169    if (PyUnicode_READY(self) == -1)
11170        return NULL;
11171
11172    end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11173
11174    if (start == 0 && end == PyUnicode_GET_LENGTH(self))
11175    {
11176        if (PyUnicode_CheckExact(self)) {
11177            Py_INCREF(self);
11178            return self;
11179        }
11180        else
11181            return PyUnicode_Copy(self);
11182    }
11183
11184    length = end - start;
11185    if (length == 1)
11186        return unicode_getitem(self, start);
11187
11188    if (start < 0 || end < 0) {
11189        PyErr_SetString(PyExc_IndexError, "string index out of range");
11190        return NULL;
11191    }
11192
11193    if (PyUnicode_IS_ASCII(self)) {
11194        kind = PyUnicode_KIND(self);
11195        data = PyUnicode_1BYTE_DATA(self);
11196        return unicode_fromascii(data + start, length);
11197    }
11198    else {
11199        kind = PyUnicode_KIND(self);
11200        data = PyUnicode_1BYTE_DATA(self);
11201        return PyUnicode_FromKindAndData(kind,
11202                                         data + PyUnicode_KIND_SIZE(kind, start),
11203                                         length);
11204    }
11205}
11206
11207static PyObject *
11208do_strip(PyUnicodeObject *self, int striptype)
11209{
11210    int kind;
11211    void *data;
11212    Py_ssize_t len, i, j;
11213
11214    if (PyUnicode_READY(self) == -1)
11215        return NULL;
11216
11217    kind = PyUnicode_KIND(self);
11218    data = PyUnicode_DATA(self);
11219    len = PyUnicode_GET_LENGTH(self);
11220
11221    i = 0;
11222    if (striptype != RIGHTSTRIP) {
11223        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11224            i++;
11225        }
11226    }
11227
11228    j = len;
11229    if (striptype != LEFTSTRIP) {
11230        do {
11231            j--;
11232        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11233        j++;
11234    }
11235
11236    return PyUnicode_Substring((PyObject*)self, i, j);
11237}
11238
11239
11240static PyObject *
11241do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11242{
11243    PyObject *sep = NULL;
11244
11245    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11246        return NULL;
11247
11248    if (sep != NULL && sep != Py_None) {
11249        if (PyUnicode_Check(sep))
11250            return _PyUnicode_XStrip(self, striptype, sep);
11251        else {
11252            PyErr_Format(PyExc_TypeError,
11253                         "%s arg must be None or str",
11254                         STRIPNAME(striptype));
11255            return NULL;
11256        }
11257    }
11258
11259    return do_strip(self, striptype);
11260}
11261
11262
11263PyDoc_STRVAR(strip__doc__,
11264             "S.strip([chars]) -> str\n\
11265\n\
11266Return a copy of the string S with leading and trailing\n\
11267whitespace removed.\n\
11268If chars is given and not None, remove characters in chars instead.");
11269
11270static PyObject *
11271unicode_strip(PyUnicodeObject *self, PyObject *args)
11272{
11273    if (PyTuple_GET_SIZE(args) == 0)
11274        return do_strip(self, BOTHSTRIP); /* Common case */
11275    else
11276        return do_argstrip(self, BOTHSTRIP, args);
11277}
11278
11279
11280PyDoc_STRVAR(lstrip__doc__,
11281             "S.lstrip([chars]) -> str\n\
11282\n\
11283Return a copy of the string S with leading whitespace removed.\n\
11284If chars is given and not None, remove characters in chars instead.");
11285
11286static PyObject *
11287unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11288{
11289    if (PyTuple_GET_SIZE(args) == 0)
11290        return do_strip(self, LEFTSTRIP); /* Common case */
11291    else
11292        return do_argstrip(self, LEFTSTRIP, args);
11293}
11294
11295
11296PyDoc_STRVAR(rstrip__doc__,
11297             "S.rstrip([chars]) -> str\n\
11298\n\
11299Return a copy of the string S with trailing whitespace removed.\n\
11300If chars is given and not None, remove characters in chars instead.");
11301
11302static PyObject *
11303unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11304{
11305    if (PyTuple_GET_SIZE(args) == 0)
11306        return do_strip(self, RIGHTSTRIP); /* Common case */
11307    else
11308        return do_argstrip(self, RIGHTSTRIP, args);
11309}
11310
11311
11312static PyObject*
11313unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
11314{
11315    PyUnicodeObject *u;
11316    Py_ssize_t nchars, n;
11317
11318    if (len < 1) {
11319        Py_INCREF(unicode_empty);
11320        return unicode_empty;
11321    }
11322
11323    if (len == 1 && PyUnicode_CheckExact(str)) {
11324        /* no repeat, return original string */
11325        Py_INCREF(str);
11326        return (PyObject*) str;
11327    }
11328
11329    if (PyUnicode_READY(str) == -1)
11330        return NULL;
11331
11332    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11333        PyErr_SetString(PyExc_OverflowError,
11334                        "repeated string is too long");
11335        return NULL;
11336    }
11337    nchars = len * PyUnicode_GET_LENGTH(str);
11338
11339    u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11340    if (!u)
11341        return NULL;
11342    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11343
11344    if (PyUnicode_GET_LENGTH(str) == 1) {
11345        const int kind = PyUnicode_KIND(str);
11346        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11347        void *to = PyUnicode_DATA(u);
11348        if (kind == PyUnicode_1BYTE_KIND)
11349            memset(to, (unsigned char)fill_char, len);
11350        else {
11351            for (n = 0; n < len; ++n)
11352                PyUnicode_WRITE(kind, to, n, fill_char);
11353        }
11354    }
11355    else {
11356        /* number of characters copied this far */
11357        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11358        const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11359        char *to = (char *) PyUnicode_DATA(u);
11360        Py_MEMCPY(to, PyUnicode_DATA(str),
11361                  PyUnicode_GET_LENGTH(str) * char_size);
11362        while (done < nchars) {
11363            n = (done <= nchars-done) ? done : nchars-done;
11364            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11365            done += n;
11366        }
11367    }
11368
11369    assert(_PyUnicode_CheckConsistency(u, 1));
11370    return (PyObject*) u;
11371}
11372
11373PyObject *
11374PyUnicode_Replace(PyObject *obj,
11375                  PyObject *subobj,
11376                  PyObject *replobj,
11377                  Py_ssize_t maxcount)
11378{
11379    PyObject *self;
11380    PyObject *str1;
11381    PyObject *str2;
11382    PyObject *result;
11383
11384    self = PyUnicode_FromObject(obj);
11385    if (self == NULL || PyUnicode_READY(self) == -1)
11386        return NULL;
11387    str1 = PyUnicode_FromObject(subobj);
11388    if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11389        Py_DECREF(self);
11390        return NULL;
11391    }
11392    str2 = PyUnicode_FromObject(replobj);
11393    if (str2 == NULL || PyUnicode_READY(str2)) {
11394        Py_DECREF(self);
11395        Py_DECREF(str1);
11396        return NULL;
11397    }
11398    result = replace(self, str1, str2, maxcount);
11399    Py_DECREF(self);
11400    Py_DECREF(str1);
11401    Py_DECREF(str2);
11402    return result;
11403}
11404
11405PyDoc_STRVAR(replace__doc__,
11406             "S.replace(old, new[, count]) -> str\n\
11407\n\
11408Return a copy of S with all occurrences of substring\n\
11409old replaced by new.  If the optional argument count is\n\
11410given, only the first count occurrences are replaced.");
11411
11412static PyObject*
11413unicode_replace(PyObject *self, PyObject *args)
11414{
11415    PyObject *str1;
11416    PyObject *str2;
11417    Py_ssize_t maxcount = -1;
11418    PyObject *result;
11419
11420    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11421        return NULL;
11422    if (!PyUnicode_READY(self) == -1)
11423        return NULL;
11424    str1 = PyUnicode_FromObject(str1);
11425    if (str1 == NULL || PyUnicode_READY(str1) == -1)
11426        return NULL;
11427    str2 = PyUnicode_FromObject(str2);
11428    if (str2 == NULL || PyUnicode_READY(str2) == -1) {
11429        Py_DECREF(str1);
11430        return NULL;
11431    }
11432
11433    result = replace(self, str1, str2, maxcount);
11434
11435    Py_DECREF(str1);
11436    Py_DECREF(str2);
11437    return result;
11438}
11439
/* Build the repr() of a str: the text enclosed in quotes, with
   backslashes, the chosen quote character, tab/newline/carriage return
   and every non-printable character written as an escape sequence.

   Works in two passes over the input: the first computes the exact
   output length and the largest character that will be copied verbatim,
   the second fills in the result allocated with PyUnicode_New().
   Returns NULL if the input cannot be made ready or PyUnicode_New()
   fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind;
    void *idata, *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 2; /* quotes */
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        switch (ch) {
        /* Quotes are counted unescaped here; the escapes are added
           below once the quote character has been chosen. */
        case '\'': squote++; osize++; break;
        case '"':  dquote++; osize++; break;
        case '\\': case '\t': case '\r': case '\n':
            osize += 2; break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                osize += 4; /* \xHH */
            else if (ch < 0x7f)
                osize++;
            else if (Py_UNICODE_ISPRINTABLE(ch)) {
                /* Copied verbatim, so it may raise the maximum character
                   of the output. */
                osize++;
                max = ch > max ? ch : max;
            }
            else if (ch < 0x100)
                osize += 4; /* \xHH */
            else if (ch < 0x10000)
                osize += 6; /* \uHHHH */
            else
                osize += 10; /* \UHHHHHHHH */
        }
    }

    /* Default to single quotes; switch to double quotes only when the
       text contains single quotes but no double quotes. */
    quote = '\'';
    if (squote) {
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Opening and closing quote */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);

    /* i indexes the input, o the next free output position */
    for (i = 0, o = 1; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

        /* Escape quotes and backslashes */
        if ((ch == quote) || (ch == '\\')) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, ch);
            continue;
        }

        /* Map special whitespace to '\t', '\n', '\r' */
        if (ch == '\t') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 't');
        }
        else if (ch == '\n') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'n');
        }
        else if (ch == '\r') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'r');
        }

        /* Map non-printable US ASCII to '\xhh' */
        else if (ch < ' ' || ch == 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'x');
            PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
            PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
        }

        /* Copy ASCII characters as-is */
        else if (ch < 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, ch);
        }

        /* Non-ASCII characters */
        else {
            /* Map Unicode whitespace and control characters
               (categories Z* and C* except ASCII space)
            */
            if (!Py_UNICODE_ISPRINTABLE(ch)) {
                /* Map 8-bit characters to '\xhh' */
                if (ch <= 0xff) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'x');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
                }
                /* Map 21-bit characters to '\U00xxxxxx' */
                else if (ch >= 0x10000) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'U');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
                }
                /* Map 16-bit characters to '\uxxxx' */
                else {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'u');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
                }
            }
            /* Copy characters as-is */
            else {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }
        }
    }
    /* Closing quote already added at the beginning */
    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
11590
11591PyDoc_STRVAR(rfind__doc__,
11592             "S.rfind(sub[, start[, end]]) -> int\n\
11593\n\
11594Return the highest index in S where substring sub is found,\n\
11595such that sub is contained within S[start:end].  Optional\n\
11596arguments start and end are interpreted as in slice notation.\n\
11597\n\
11598Return -1 on failure.");
11599
11600static PyObject *
11601unicode_rfind(PyObject *self, PyObject *args)
11602{
11603    PyUnicodeObject *substring;
11604    Py_ssize_t start;
11605    Py_ssize_t end;
11606    Py_ssize_t result;
11607
11608    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11609                                            &start, &end))
11610        return NULL;
11611
11612    if (PyUnicode_READY(self) == -1)
11613        return NULL;
11614    if (PyUnicode_READY(substring) == -1)
11615        return NULL;
11616
11617    result = any_find_slice(
11618        asciilib_rfind_slice, ucs1lib_rfind_slice,
11619        ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11620        self, (PyObject*)substring, start, end
11621        );
11622
11623    Py_DECREF(substring);
11624
11625    if (result == -2)
11626        return NULL;
11627
11628    return PyLong_FromSsize_t(result);
11629}
11630
11631PyDoc_STRVAR(rindex__doc__,
11632             "S.rindex(sub[, start[, end]]) -> int\n\
11633\n\
11634Like S.rfind() but raise ValueError when the substring is not found.");
11635
11636static PyObject *
11637unicode_rindex(PyObject *self, PyObject *args)
11638{
11639    PyUnicodeObject *substring;
11640    Py_ssize_t start;
11641    Py_ssize_t end;
11642    Py_ssize_t result;
11643
11644    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11645                                            &start, &end))
11646        return NULL;
11647
11648    if (PyUnicode_READY(self) == -1)
11649        return NULL;
11650    if (PyUnicode_READY(substring) == -1)
11651        return NULL;
11652
11653    result = any_find_slice(
11654        asciilib_rfind_slice, ucs1lib_rfind_slice,
11655        ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11656        self, (PyObject*)substring, start, end
11657        );
11658
11659    Py_DECREF(substring);
11660
11661    if (result == -2)
11662        return NULL;
11663
11664    if (result < 0) {
11665        PyErr_SetString(PyExc_ValueError, "substring not found");
11666        return NULL;
11667    }
11668
11669    return PyLong_FromSsize_t(result);
11670}
11671
11672PyDoc_STRVAR(rjust__doc__,
11673             "S.rjust(width[, fillchar]) -> str\n\
11674\n\
11675Return S right-justified in a string of length width. Padding is\n\
11676done using the specified fill character (default is a space).");
11677
11678static PyObject *
11679unicode_rjust(PyObject *self, PyObject *args)
11680{
11681    Py_ssize_t width;
11682    Py_UCS4 fillchar = ' ';
11683
11684    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
11685        return NULL;
11686
11687    if (PyUnicode_READY(self) == -1)
11688        return NULL;
11689
11690    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11691        Py_INCREF(self);
11692        return (PyObject*) self;
11693    }
11694
11695    return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
11696}
11697
11698PyObject *
11699PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11700{
11701    PyObject *result;
11702
11703    s = PyUnicode_FromObject(s);
11704    if (s == NULL)
11705        return NULL;
11706    if (sep != NULL) {
11707        sep = PyUnicode_FromObject(sep);
11708        if (sep == NULL) {
11709            Py_DECREF(s);
11710            return NULL;
11711        }
11712    }
11713
11714    result = split(s, sep, maxsplit);
11715
11716    Py_DECREF(s);
11717    Py_XDECREF(sep);
11718    return result;
11719}
11720
11721PyDoc_STRVAR(split__doc__,
11722             "S.split([sep[, maxsplit]]) -> list of strings\n\
11723\n\
11724Return a list of the words in S, using sep as the\n\
11725delimiter string.  If maxsplit is given, at most maxsplit\n\
11726splits are done. If sep is not specified or is None, any\n\
11727whitespace string is a separator and empty strings are\n\
11728removed from the result.");
11729
11730static PyObject*
11731unicode_split(PyObject *self, PyObject *args)
11732{
11733    PyObject *substring = Py_None;
11734    Py_ssize_t maxcount = -1;
11735
11736    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
11737        return NULL;
11738
11739    if (substring == Py_None)
11740        return split(self, NULL, maxcount);
11741    else if (PyUnicode_Check(substring))
11742        return split(self, substring, maxcount);
11743    else
11744        return PyUnicode_Split((PyObject *)self, substring, maxcount);
11745}
11746
11747PyObject *
11748PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11749{
11750    PyObject* str_obj;
11751    PyObject* sep_obj;
11752    PyObject* out;
11753    int kind1, kind2, kind;
11754    void *buf1 = NULL, *buf2 = NULL;
11755    Py_ssize_t len1, len2;
11756
11757    str_obj = PyUnicode_FromObject(str_in);
11758    if (!str_obj || PyUnicode_READY(str_obj) == -1)
11759        return NULL;
11760    sep_obj = PyUnicode_FromObject(sep_in);
11761    if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
11762        Py_DECREF(str_obj);
11763        return NULL;
11764    }
11765
11766    kind1 = PyUnicode_KIND(str_obj);
11767    kind2 = PyUnicode_KIND(sep_obj);
11768    kind = Py_MAX(kind1, kind2);
11769    buf1 = PyUnicode_DATA(str_obj);
11770    if (kind1 != kind)
11771        buf1 = _PyUnicode_AsKind(str_obj, kind);
11772    if (!buf1)
11773        goto onError;
11774    buf2 = PyUnicode_DATA(sep_obj);
11775    if (kind2 != kind)
11776        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11777    if (!buf2)
11778        goto onError;
11779    len1 = PyUnicode_GET_LENGTH(str_obj);
11780    len2 = PyUnicode_GET_LENGTH(sep_obj);
11781
11782    switch(PyUnicode_KIND(str_obj)) {
11783    case PyUnicode_1BYTE_KIND:
11784        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11785            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11786        else
11787            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11788        break;
11789    case PyUnicode_2BYTE_KIND:
11790        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11791        break;
11792    case PyUnicode_4BYTE_KIND:
11793        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11794        break;
11795    default:
11796        assert(0);
11797        out = 0;
11798    }
11799
11800    Py_DECREF(sep_obj);
11801    Py_DECREF(str_obj);
11802    if (kind1 != kind)
11803        PyMem_Free(buf1);
11804    if (kind2 != kind)
11805        PyMem_Free(buf2);
11806
11807    return out;
11808  onError:
11809    Py_DECREF(sep_obj);
11810    Py_DECREF(str_obj);
11811    if (kind1 != kind && buf1)
11812        PyMem_Free(buf1);
11813    if (kind2 != kind && buf2)
11814        PyMem_Free(buf2);
11815    return NULL;
11816}
11817
11818
11819PyObject *
11820PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11821{
11822    PyObject* str_obj;
11823    PyObject* sep_obj;
11824    PyObject* out;
11825    int kind1, kind2, kind;
11826    void *buf1 = NULL, *buf2 = NULL;
11827    Py_ssize_t len1, len2;
11828
11829    str_obj = PyUnicode_FromObject(str_in);
11830    if (!str_obj)
11831        return NULL;
11832    sep_obj = PyUnicode_FromObject(sep_in);
11833    if (!sep_obj) {
11834        Py_DECREF(str_obj);
11835        return NULL;
11836    }
11837
11838    kind1 = PyUnicode_KIND(str_in);
11839    kind2 = PyUnicode_KIND(sep_obj);
11840    kind = Py_MAX(kind1, kind2);
11841    buf1 = PyUnicode_DATA(str_in);
11842    if (kind1 != kind)
11843        buf1 = _PyUnicode_AsKind(str_in, kind);
11844    if (!buf1)
11845        goto onError;
11846    buf2 = PyUnicode_DATA(sep_obj);
11847    if (kind2 != kind)
11848        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11849    if (!buf2)
11850        goto onError;
11851    len1 = PyUnicode_GET_LENGTH(str_obj);
11852    len2 = PyUnicode_GET_LENGTH(sep_obj);
11853
11854    switch(PyUnicode_KIND(str_in)) {
11855    case PyUnicode_1BYTE_KIND:
11856        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11857            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11858        else
11859            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11860        break;
11861    case PyUnicode_2BYTE_KIND:
11862        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11863        break;
11864    case PyUnicode_4BYTE_KIND:
11865        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11866        break;
11867    default:
11868        assert(0);
11869        out = 0;
11870    }
11871
11872    Py_DECREF(sep_obj);
11873    Py_DECREF(str_obj);
11874    if (kind1 != kind)
11875        PyMem_Free(buf1);
11876    if (kind2 != kind)
11877        PyMem_Free(buf2);
11878
11879    return out;
11880  onError:
11881    Py_DECREF(sep_obj);
11882    Py_DECREF(str_obj);
11883    if (kind1 != kind && buf1)
11884        PyMem_Free(buf1);
11885    if (kind2 != kind && buf2)
11886        PyMem_Free(buf2);
11887    return NULL;
11888}
11889
11890PyDoc_STRVAR(partition__doc__,
11891             "S.partition(sep) -> (head, sep, tail)\n\
11892\n\
11893Search for the separator sep in S, and return the part before it,\n\
11894the separator itself, and the part after it.  If the separator is not\n\
11895found, return S and two empty strings.");
11896
11897static PyObject*
11898unicode_partition(PyObject *self, PyObject *separator)
11899{
11900    return PyUnicode_Partition(self, separator);
11901}
11902
11903PyDoc_STRVAR(rpartition__doc__,
11904             "S.rpartition(sep) -> (head, sep, tail)\n\
11905\n\
11906Search for the separator sep in S, starting at the end of S, and return\n\
11907the part before it, the separator itself, and the part after it.  If the\n\
11908separator is not found, return two empty strings and S.");
11909
11910static PyObject*
11911unicode_rpartition(PyObject *self, PyObject *separator)
11912{
11913    return PyUnicode_RPartition(self, separator);
11914}
11915
11916PyObject *
11917PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11918{
11919    PyObject *result;
11920
11921    s = PyUnicode_FromObject(s);
11922    if (s == NULL)
11923        return NULL;
11924    if (sep != NULL) {
11925        sep = PyUnicode_FromObject(sep);
11926        if (sep == NULL) {
11927            Py_DECREF(s);
11928            return NULL;
11929        }
11930    }
11931
11932    result = rsplit(s, sep, maxsplit);
11933
11934    Py_DECREF(s);
11935    Py_XDECREF(sep);
11936    return result;
11937}
11938
11939PyDoc_STRVAR(rsplit__doc__,
11940             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
11941\n\
11942Return a list of the words in S, using sep as the\n\
11943delimiter string, starting at the end of the string and\n\
11944working to the front.  If maxsplit is given, at most maxsplit\n\
11945splits are done. If sep is not specified, any whitespace string\n\
11946is a separator.");
11947
11948static PyObject*
11949unicode_rsplit(PyObject *self, PyObject *args)
11950{
11951    PyObject *substring = Py_None;
11952    Py_ssize_t maxcount = -1;
11953
11954    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
11955        return NULL;
11956
11957    if (substring == Py_None)
11958        return rsplit(self, NULL, maxcount);
11959    else if (PyUnicode_Check(substring))
11960        return rsplit(self, substring, maxcount);
11961    else
11962        return PyUnicode_RSplit(self, substring, maxcount);
11963}
11964
11965PyDoc_STRVAR(splitlines__doc__,
11966             "S.splitlines([keepends]) -> list of strings\n\
11967\n\
11968Return a list of the lines in S, breaking at line boundaries.\n\
11969Line breaks are not included in the resulting list unless keepends\n\
11970is given and true.");
11971
11972static PyObject*
11973unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
11974{
11975    static char *kwlist[] = {"keepends", 0};
11976    int keepends = 0;
11977
11978    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11979                                     kwlist, &keepends))
11980        return NULL;
11981
11982    return PyUnicode_Splitlines((PyObject *)self, keepends);
11983}
11984
11985static
11986PyObject *unicode_str(PyObject *self)
11987{
11988    if (PyUnicode_CheckExact(self)) {
11989        Py_INCREF(self);
11990        return self;
11991    } else
11992        /* Subtype -- return genuine unicode string with the same value. */
11993        return PyUnicode_Copy(self);
11994}
11995
11996PyDoc_STRVAR(swapcase__doc__,
11997             "S.swapcase() -> str\n\
11998\n\
11999Return a copy of S with uppercase characters converted to lowercase\n\
12000and vice versa.");
12001
12002static PyObject*
12003unicode_swapcase(PyObject *self)
12004{
12005    return fixup(self, fixswapcase);
12006}
12007
12008PyDoc_STRVAR(maketrans__doc__,
12009             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12010\n\
12011Return a translation table usable for str.translate().\n\
12012If there is only one argument, it must be a dictionary mapping Unicode\n\
12013ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12014Character keys will be then converted to ordinals.\n\
12015If there are two arguments, they must be strings of equal length, and\n\
12016in the resulting dictionary, each character in x will be mapped to the\n\
12017character at the same position in y. If there is a third argument, it\n\
12018must be a string, whose characters will be mapped to None in the result.");
12019
12020static PyObject*
12021unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12022{
12023    PyObject *x, *y = NULL, *z = NULL;
12024    PyObject *new = NULL, *key, *value;
12025    Py_ssize_t i = 0;
12026    int res;
12027
12028    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12029        return NULL;
12030    new = PyDict_New();
12031    if (!new)
12032        return NULL;
12033    if (y != NULL) {
12034        int x_kind, y_kind, z_kind;
12035        void *x_data, *y_data, *z_data;
12036
12037        /* x must be a string too, of equal length */
12038        if (!PyUnicode_Check(x)) {
12039            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12040                            "be a string if there is a second argument");
12041            goto err;
12042        }
12043        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12044            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12045                            "arguments must have equal length");
12046            goto err;
12047        }
12048        /* create entries for translating chars in x to those in y */
12049        x_kind = PyUnicode_KIND(x);
12050        y_kind = PyUnicode_KIND(y);
12051        x_data = PyUnicode_DATA(x);
12052        y_data = PyUnicode_DATA(y);
12053        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12054            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12055            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12056            if (!key || !value)
12057                goto err;
12058            res = PyDict_SetItem(new, key, value);
12059            Py_DECREF(key);
12060            Py_DECREF(value);
12061            if (res < 0)
12062                goto err;
12063        }
12064        /* create entries for deleting chars in z */
12065        if (z != NULL) {
12066            z_kind = PyUnicode_KIND(z);
12067            z_data = PyUnicode_DATA(z);
12068            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
12069                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12070                if (!key)
12071                    goto err;
12072                res = PyDict_SetItem(new, key, Py_None);
12073                Py_DECREF(key);
12074                if (res < 0)
12075                    goto err;
12076            }
12077        }
12078    } else {
12079        int kind;
12080        void *data;
12081
12082        /* x must be a dict */
12083        if (!PyDict_CheckExact(x)) {
12084            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12085                            "to maketrans it must be a dict");
12086            goto err;
12087        }
12088        /* copy entries into the new dict, converting string keys to int keys */
12089        while (PyDict_Next(x, &i, &key, &value)) {
12090            if (PyUnicode_Check(key)) {
12091                /* convert string keys to integer keys */
12092                PyObject *newkey;
12093                if (PyUnicode_GET_SIZE(key) != 1) {
12094                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12095                                    "table must be of length 1");
12096                    goto err;
12097                }
12098                kind = PyUnicode_KIND(key);
12099                data = PyUnicode_DATA(key);
12100                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12101                if (!newkey)
12102                    goto err;
12103                res = PyDict_SetItem(new, newkey, value);
12104                Py_DECREF(newkey);
12105                if (res < 0)
12106                    goto err;
12107            } else if (PyLong_Check(key)) {
12108                /* just keep integer keys */
12109                if (PyDict_SetItem(new, key, value) < 0)
12110                    goto err;
12111            } else {
12112                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12113                                "be strings or integers");
12114                goto err;
12115            }
12116        }
12117    }
12118    return new;
12119  err:
12120    Py_DECREF(new);
12121    return NULL;
12122}
12123
12124PyDoc_STRVAR(translate__doc__,
12125             "S.translate(table) -> str\n\
12126\n\
12127Return a copy of the string S, where all characters have been mapped\n\
12128through the given translation table, which must be a mapping of\n\
12129Unicode ordinals to Unicode ordinals, strings, or None.\n\
12130Unmapped characters are left untouched. Characters mapped to None\n\
12131are deleted.");
12132
12133static PyObject*
12134unicode_translate(PyObject *self, PyObject *table)
12135{
12136    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12137}
12138
12139PyDoc_STRVAR(upper__doc__,
12140             "S.upper() -> str\n\
12141\n\
12142Return a copy of S converted to uppercase.");
12143
12144static PyObject*
12145unicode_upper(PyObject *self)
12146{
12147    return fixup(self, fixupper);
12148}
12149
12150PyDoc_STRVAR(zfill__doc__,
12151             "S.zfill(width) -> str\n\
12152\n\
12153Pad a numeric string S with zeros on the left, to fill a field\n\
12154of the specified width. The string S is never truncated.");
12155
12156static PyObject *
12157unicode_zfill(PyObject *self, PyObject *args)
12158{
12159    Py_ssize_t fill;
12160    PyObject *u;
12161    Py_ssize_t width;
12162    int kind;
12163    void *data;
12164    Py_UCS4 chr;
12165
12166    if (PyUnicode_READY(self) == -1)
12167        return NULL;
12168
12169    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12170        return NULL;
12171
12172    if (PyUnicode_GET_LENGTH(self) >= width) {
12173        if (PyUnicode_CheckExact(self)) {
12174            Py_INCREF(self);
12175            return (PyObject*) self;
12176        }
12177        else
12178            return PyUnicode_Copy((PyObject*)self);
12179    }
12180
12181    fill = width - _PyUnicode_LENGTH(self);
12182
12183    u = pad(self, fill, 0, '0');
12184
12185    if (u == NULL)
12186        return NULL;
12187
12188    kind = PyUnicode_KIND(u);
12189    data = PyUnicode_DATA(u);
12190    chr = PyUnicode_READ(kind, data, fill);
12191
12192    if (chr == '+' || chr == '-') {
12193        /* move sign to beginning of string */
12194        PyUnicode_WRITE(kind, data, 0, chr);
12195        PyUnicode_WRITE(kind, data, fill, '0');
12196    }
12197
12198    assert(_PyUnicode_CheckConsistency(u, 1));
12199    return (PyObject*) u;
12200}
12201
#if 0
/* Compiled out.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12209
12210PyDoc_STRVAR(startswith__doc__,
12211             "S.startswith(prefix[, start[, end]]) -> bool\n\
12212\n\
12213Return True if S starts with the specified prefix, False otherwise.\n\
12214With optional start, test S beginning at that position.\n\
12215With optional end, stop comparing S at that position.\n\
12216prefix can also be a tuple of strings to try.");
12217
12218static PyObject *
12219unicode_startswith(PyUnicodeObject *self,
12220                   PyObject *args)
12221{
12222    PyObject *subobj;
12223    PyUnicodeObject *substring;
12224    Py_ssize_t start = 0;
12225    Py_ssize_t end = PY_SSIZE_T_MAX;
12226    int result;
12227
12228    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12229        return NULL;
12230    if (PyTuple_Check(subobj)) {
12231        Py_ssize_t i;
12232        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12233            substring = (PyUnicodeObject *)PyUnicode_FromObject(
12234                PyTuple_GET_ITEM(subobj, i));
12235            if (substring == NULL)
12236                return NULL;
12237            result = tailmatch(self, substring, start, end, -1);
12238            Py_DECREF(substring);
12239            if (result) {
12240                Py_RETURN_TRUE;
12241            }
12242        }
12243        /* nothing matched */
12244        Py_RETURN_FALSE;
12245    }
12246    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
12247    if (substring == NULL) {
12248        if (PyErr_ExceptionMatches(PyExc_TypeError))
12249            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12250                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12251        return NULL;
12252    }
12253    result = tailmatch(self, substring, start, end, -1);
12254    Py_DECREF(substring);
12255    return PyBool_FromLong(result);
12256}
12257
12258
12259PyDoc_STRVAR(endswith__doc__,
12260             "S.endswith(suffix[, start[, end]]) -> bool\n\
12261\n\
12262Return True if S ends with the specified suffix, False otherwise.\n\
12263With optional start, test S beginning at that position.\n\
12264With optional end, stop comparing S at that position.\n\
12265suffix can also be a tuple of strings to try.");
12266
12267static PyObject *
12268unicode_endswith(PyUnicodeObject *self,
12269                 PyObject *args)
12270{
12271    PyObject *subobj;
12272    PyUnicodeObject *substring;
12273    Py_ssize_t start = 0;
12274    Py_ssize_t end = PY_SSIZE_T_MAX;
12275    int result;
12276
12277    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12278        return NULL;
12279    if (PyTuple_Check(subobj)) {
12280        Py_ssize_t i;
12281        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12282            substring = (PyUnicodeObject *)PyUnicode_FromObject(
12283                PyTuple_GET_ITEM(subobj, i));
12284            if (substring == NULL)
12285                return NULL;
12286            result = tailmatch(self, substring, start, end, +1);
12287            Py_DECREF(substring);
12288            if (result) {
12289                Py_RETURN_TRUE;
12290            }
12291        }
12292        Py_RETURN_FALSE;
12293    }
12294    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
12295    if (substring == NULL) {
12296        if (PyErr_ExceptionMatches(PyExc_TypeError))
12297            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12298                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12299        return NULL;
12300    }
12301    result = tailmatch(self, substring, start, end, +1);
12302    Py_DECREF(substring);
12303    return PyBool_FromLong(result);
12304}
12305
12306#include "stringlib/unicode_format.h"
12307
12308PyDoc_STRVAR(format__doc__,
12309             "S.format(*args, **kwargs) -> str\n\
12310\n\
12311Return a formatted version of S, using substitutions from args and kwargs.\n\
12312The substitutions are identified by braces ('{' and '}').");
12313
12314PyDoc_STRVAR(format_map__doc__,
12315             "S.format_map(mapping) -> str\n\
12316\n\
12317Return a formatted version of S, using substitutions from mapping.\n\
12318The substitutions are identified by braces ('{' and '}').");
12319
12320static PyObject *
12321unicode__format__(PyObject* self, PyObject* args)
12322{
12323    PyObject *format_spec, *out;
12324
12325    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12326        return NULL;
12327
12328    out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
12329                                     PyUnicode_GET_LENGTH(format_spec));
12330    return out;
12331}
12332
12333PyDoc_STRVAR(p_format__doc__,
12334             "S.__format__(format_spec) -> str\n\
12335\n\
12336Return a formatted version of S as described by format_spec.");
12337
12338static PyObject *
12339unicode__sizeof__(PyUnicodeObject *v)
12340{
12341    Py_ssize_t size;
12342
12343    /* If it's a compact object, account for base structure +
12344       character data. */
12345    if (PyUnicode_IS_COMPACT_ASCII(v))
12346        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12347    else if (PyUnicode_IS_COMPACT(v))
12348        size = sizeof(PyCompactUnicodeObject) +
12349            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12350    else {
12351        /* If it is a two-block object, account for base object, and
12352           for character block if present. */
12353        size = sizeof(PyUnicodeObject);
12354        if (_PyUnicode_DATA_ANY(v))
12355            size += (PyUnicode_GET_LENGTH(v) + 1) *
12356                PyUnicode_CHARACTER_SIZE(v);
12357    }
12358    /* If the wstr pointer is present, account for it unless it is shared
12359       with the data pointer. Check if the data is not shared. */
12360    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12361        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12362    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12363        size += PyUnicode_UTF8_LENGTH(v) + 1;
12364
12365    return PyLong_FromSsize_t(size);
12366}
12367
12368PyDoc_STRVAR(sizeof__doc__,
12369             "S.__sizeof__() -> size of S in memory, in bytes");
12370
12371static PyObject *
12372unicode_getnewargs(PyObject *v)
12373{
12374    PyObject *copy = PyUnicode_Copy(v);
12375    if (!copy)
12376        return NULL;
12377    return Py_BuildValue("(N)", copy);
12378}
12379
12380static PyMethodDef unicode_methods[] = {
12381
12382    /* Order is according to common usage: often used methods should
12383       appear first, since lookup is done sequentially. */
12384
12385    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
12386    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12387    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
12388    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
12389    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12390    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12391    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12392    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12393    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12394    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12395    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
12396    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
12397    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12398    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12399    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
12400    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
12401    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12402    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12403    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
12404    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
12405    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
12406    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
12407    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
12408    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12409    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12410    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12411    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12412    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12413    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12414    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12415    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12416    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12417    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12418    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12419    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12420    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12421    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
12422    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
12423    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
12424    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
12425    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
12426    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
12427    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
12428    {"maketrans", (PyCFunction) unicode_maketrans,
12429     METH_VARARGS | METH_STATIC, maketrans__doc__},
12430    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
12431#if 0
12432    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
12433#endif
12434
12435#if 0
12436    /* These methods are just used for debugging the implementation. */
12437    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
12438#endif
12439
12440    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
12441    {NULL, NULL}
12442};
12443
12444static PyObject *
12445unicode_mod(PyObject *v, PyObject *w)
12446{
12447    if (!PyUnicode_Check(v))
12448        Py_RETURN_NOTIMPLEMENTED;
12449    return PyUnicode_Format(v, w);
12450}
12451
12452static PyNumberMethods unicode_as_number = {
12453    0,              /*nb_add*/
12454    0,              /*nb_subtract*/
12455    0,              /*nb_multiply*/
12456    unicode_mod,            /*nb_remainder*/
12457};
12458
12459static PySequenceMethods unicode_as_sequence = {
12460    (lenfunc) unicode_length,       /* sq_length */
12461    PyUnicode_Concat,           /* sq_concat */
12462    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
12463    (ssizeargfunc) unicode_getitem,     /* sq_item */
12464    0,                  /* sq_slice */
12465    0,                  /* sq_ass_item */
12466    0,                  /* sq_ass_slice */
12467    PyUnicode_Contains,         /* sq_contains */
12468};
12469
12470static PyObject*
12471unicode_subscript(PyUnicodeObject* self, PyObject* item)
12472{
12473    if (PyUnicode_READY(self) == -1)
12474        return NULL;
12475
12476    if (PyIndex_Check(item)) {
12477        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
12478        if (i == -1 && PyErr_Occurred())
12479            return NULL;
12480        if (i < 0)
12481            i += PyUnicode_GET_LENGTH(self);
12482        return unicode_getitem((PyObject*)self, i);
12483    } else if (PySlice_Check(item)) {
12484        Py_ssize_t start, stop, step, slicelength, cur, i;
12485        PyObject *result;
12486        void *src_data, *dest_data;
12487        int src_kind, dest_kind;
12488        Py_UCS4 ch, max_char, kind_limit;
12489
12490        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
12491                                 &start, &stop, &step, &slicelength) < 0) {
12492            return NULL;
12493        }
12494
12495        if (slicelength <= 0) {
12496            return PyUnicode_New(0, 0);
12497        } else if (start == 0 && step == 1 &&
12498                   slicelength == PyUnicode_GET_LENGTH(self) &&
12499                   PyUnicode_CheckExact(self)) {
12500            Py_INCREF(self);
12501            return (PyObject *)self;
12502        } else if (step == 1) {
12503            return PyUnicode_Substring((PyObject*)self,
12504                                       start, start + slicelength);
12505        }
12506        /* General case */
12507        max_char = 0;
12508        src_kind = PyUnicode_KIND(self);
12509        kind_limit = kind_maxchar_limit(src_kind);
12510        src_data = PyUnicode_DATA(self);
12511        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12512            ch = PyUnicode_READ(src_kind, src_data, cur);
12513            if (ch > max_char) {
12514                max_char = ch;
12515                if (max_char >= kind_limit)
12516                    break;
12517            }
12518        }
12519        result = PyUnicode_New(slicelength, max_char);
12520        if (result == NULL)
12521            return NULL;
12522        dest_kind = PyUnicode_KIND(result);
12523        dest_data = PyUnicode_DATA(result);
12524
12525        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12526            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12527            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
12528        }
12529        assert(_PyUnicode_CheckConsistency(result, 1));
12530        return result;
12531    } else {
12532        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12533        return NULL;
12534    }
12535}
12536
12537static PyMappingMethods unicode_as_mapping = {
12538    (lenfunc)unicode_length,        /* mp_length */
12539    (binaryfunc)unicode_subscript,  /* mp_subscript */
12540    (objobjargproc)0,           /* mp_ass_subscript */
12541};
12542
12543
12544/* Helpers for PyUnicode_Format() */
12545
12546static PyObject *
12547getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
12548{
12549    Py_ssize_t argidx = *p_argidx;
12550    if (argidx < arglen) {
12551        (*p_argidx)++;
12552        if (arglen < 0)
12553            return args;
12554        else
12555            return PyTuple_GetItem(args, argidx);
12556    }
12557    PyErr_SetString(PyExc_TypeError,
12558                    "not enough arguments for format string");
12559    return NULL;
12560}
12561
12562/* Returns a new reference to a PyUnicode object, or NULL on failure. */
12563
12564static PyObject *
12565formatfloat(PyObject *v, int flags, int prec, int type)
12566{
12567    char *p;
12568    PyObject *result;
12569    double x;
12570
12571    x = PyFloat_AsDouble(v);
12572    if (x == -1.0 && PyErr_Occurred())
12573        return NULL;
12574
12575    if (prec < 0)
12576        prec = 6;
12577
12578    p = PyOS_double_to_string(x, type, prec,
12579                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
12580    if (p == NULL)
12581        return NULL;
12582    result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
12583    PyMem_Free(p);
12584    return result;
12585}
12586
12587static PyObject*
12588formatlong(PyObject *val, int flags, int prec, int type)
12589{
12590    char *buf;
12591    int len;
12592    PyObject *str; /* temporary string object. */
12593    PyObject *result;
12594
12595    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12596    if (!str)
12597        return NULL;
12598    result = PyUnicode_DecodeASCII(buf, len, NULL);
12599    Py_DECREF(str);
12600    return result;
12601}
12602
12603static int
12604formatchar(Py_UCS4 *buf,
12605           size_t buflen,
12606           PyObject *v)
12607{
12608    /* presume that the buffer is at least 3 characters long */
12609    if (PyUnicode_Check(v)) {
12610        if (PyUnicode_GET_LENGTH(v) == 1) {
12611            buf[0] = PyUnicode_READ_CHAR(v, 0);
12612            buf[1] = '\0';
12613            return 1;
12614        }
12615        goto onError;
12616    }
12617    else {
12618        /* Integer input truncated to a character */
12619        long x;
12620        x = PyLong_AsLong(v);
12621        if (x == -1 && PyErr_Occurred())
12622            goto onError;
12623
12624        if (x < 0 || x > 0x10ffff) {
12625            PyErr_SetString(PyExc_OverflowError,
12626                            "%c arg not in range(0x110000)");
12627            return -1;
12628        }
12629
12630        buf[0] = (Py_UCS4) x;
12631        buf[1] = '\0';
12632        return 1;
12633    }
12634
12635  onError:
12636    PyErr_SetString(PyExc_TypeError,
12637                    "%c requires int or char");
12638    return -1;
12639}
12640
12641/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
12642   FORMATBUFLEN is the length of the buffer in which chars are formatted.
12643*/
12644#define FORMATBUFLEN (size_t)10
12645
12646PyObject *
12647PyUnicode_Format(PyObject *format, PyObject *args)
12648{
12649    void *fmt;
12650    int fmtkind;
12651    PyObject *result;
12652    Py_UCS4 *res, *res0;
12653    Py_UCS4 max;
12654    int kind;
12655    Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
12656    int args_owned = 0;
12657    PyObject *dict = NULL;
12658    PyUnicodeObject *uformat;
12659
12660    if (format == NULL || args == NULL) {
12661        PyErr_BadInternalCall();
12662        return NULL;
12663    }
12664    uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12665    if (uformat == NULL || PyUnicode_READY(uformat) == -1)
12666        return NULL;
12667    fmt = PyUnicode_DATA(uformat);
12668    fmtkind = PyUnicode_KIND(uformat);
12669    fmtcnt = PyUnicode_GET_LENGTH(uformat);
12670    fmtpos = 0;
12671
12672    reslen = rescnt = fmtcnt + 100;
12673    res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12674    if (res0 == NULL) {
12675        PyErr_NoMemory();
12676        goto onError;
12677    }
12678
12679    if (PyTuple_Check(args)) {
12680        arglen = PyTuple_Size(args);
12681        argidx = 0;
12682    }
12683    else {
12684        arglen = -1;
12685        argidx = -2;
12686    }
12687    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
12688        !PyUnicode_Check(args))
12689        dict = args;
12690
12691    while (--fmtcnt >= 0) {
12692        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12693            if (--rescnt < 0) {
12694                rescnt = fmtcnt + 100;
12695                reslen += rescnt;
12696                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12697                if (res0 == NULL){
12698                    PyErr_NoMemory();
12699                    goto onError;
12700                }
12701                res = res0 + reslen - rescnt;
12702                --rescnt;
12703            }
12704            *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12705        }
12706        else {
12707            /* Got a format specifier */
12708            int flags = 0;
12709            Py_ssize_t width = -1;
12710            int prec = -1;
12711            Py_UCS4 c = '\0';
12712            Py_UCS4 fill;
12713            int isnumok;
12714            PyObject *v = NULL;
12715            PyObject *temp = NULL;
12716            void *pbuf;
12717            Py_ssize_t pindex;
12718            Py_UNICODE sign;
12719            Py_ssize_t len, len1;
12720            Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
12721
12722            fmtpos++;
12723            if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12724                Py_ssize_t keystart;
12725                Py_ssize_t keylen;
12726                PyObject *key;
12727                int pcount = 1;
12728
12729                if (dict == NULL) {
12730                    PyErr_SetString(PyExc_TypeError,
12731                                    "format requires a mapping");
12732                    goto onError;
12733                }
12734                ++fmtpos;
12735                --fmtcnt;
12736                keystart = fmtpos;
12737                /* Skip over balanced parentheses */
12738                while (pcount > 0 && --fmtcnt >= 0) {
12739                    if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
12740                        --pcount;
12741                    else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
12742                        ++pcount;
12743                    fmtpos++;
12744                }
12745                keylen = fmtpos - keystart - 1;
12746                if (fmtcnt < 0 || pcount > 0) {
12747                    PyErr_SetString(PyExc_ValueError,
12748                                    "incomplete format key");
12749                    goto onError;
12750                }
12751                key = PyUnicode_Substring((PyObject*)uformat,
12752                                          keystart, keystart + keylen);
12753                if (key == NULL)
12754                    goto onError;
12755                if (args_owned) {
12756                    Py_DECREF(args);
12757                    args_owned = 0;
12758                }
12759                args = PyObject_GetItem(dict, key);
12760                Py_DECREF(key);
12761                if (args == NULL) {
12762                    goto onError;
12763                }
12764                args_owned = 1;
12765                arglen = -1;
12766                argidx = -2;
12767            }
12768            while (--fmtcnt >= 0) {
12769                switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
12770                case '-': flags |= F_LJUST; continue;
12771                case '+': flags |= F_SIGN; continue;
12772                case ' ': flags |= F_BLANK; continue;
12773                case '#': flags |= F_ALT; continue;
12774                case '0': flags |= F_ZERO; continue;
12775                }
12776                break;
12777            }
12778            if (c == '*') {
12779                v = getnextarg(args, arglen, &argidx);
12780                if (v == NULL)
12781                    goto onError;
12782                if (!PyLong_Check(v)) {
12783                    PyErr_SetString(PyExc_TypeError,
12784                                    "* wants int");
12785                    goto onError;
12786                }
12787                width = PyLong_AsLong(v);
12788                if (width == -1 && PyErr_Occurred())
12789                    goto onError;
12790                if (width < 0) {
12791                    flags |= F_LJUST;
12792                    width = -width;
12793                }
12794                if (--fmtcnt >= 0)
12795                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12796            }
12797            else if (c >= '0' && c <= '9') {
12798                width = c - '0';
12799                while (--fmtcnt >= 0) {
12800                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12801                    if (c < '0' || c > '9')
12802                        break;
12803                    if ((width*10) / 10 != width) {
12804                        PyErr_SetString(PyExc_ValueError,
12805                                        "width too big");
12806                        goto onError;
12807                    }
12808                    width = width*10 + (c - '0');
12809                }
12810            }
12811            if (c == '.') {
12812                prec = 0;
12813                if (--fmtcnt >= 0)
12814                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12815                if (c == '*') {
12816                    v = getnextarg(args, arglen, &argidx);
12817                    if (v == NULL)
12818                        goto onError;
12819                    if (!PyLong_Check(v)) {
12820                        PyErr_SetString(PyExc_TypeError,
12821                                        "* wants int");
12822                        goto onError;
12823                    }
12824                    prec = PyLong_AsLong(v);
12825                    if (prec == -1 && PyErr_Occurred())
12826                        goto onError;
12827                    if (prec < 0)
12828                        prec = 0;
12829                    if (--fmtcnt >= 0)
12830                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12831                }
12832                else if (c >= '0' && c <= '9') {
12833                    prec = c - '0';
12834                    while (--fmtcnt >= 0) {
12835                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12836                        if (c < '0' || c > '9')
12837                            break;
12838                        if ((prec*10) / 10 != prec) {
12839                            PyErr_SetString(PyExc_ValueError,
12840                                            "prec too big");
12841                            goto onError;
12842                        }
12843                        prec = prec*10 + (c - '0');
12844                    }
12845                }
12846            } /* prec */
12847            if (fmtcnt >= 0) {
12848                if (c == 'h' || c == 'l' || c == 'L') {
12849                    if (--fmtcnt >= 0)
12850                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12851                }
12852            }
12853            if (fmtcnt < 0) {
12854                PyErr_SetString(PyExc_ValueError,
12855                                "incomplete format");
12856                goto onError;
12857            }
12858            if (c != '%') {
12859                v = getnextarg(args, arglen, &argidx);
12860                if (v == NULL)
12861                    goto onError;
12862            }
12863            sign = 0;
12864            fill = ' ';
12865            switch (c) {
12866
12867            case '%':
12868                pbuf = formatbuf;
12869                kind = PyUnicode_4BYTE_KIND;
12870                /* presume that buffer length is at least 1 */
12871                PyUnicode_WRITE(kind, pbuf, 0, '%');
12872                len = 1;
12873                break;
12874
12875            case 's':
12876            case 'r':
12877            case 'a':
12878                if (PyUnicode_CheckExact(v) && c == 's') {
12879                    temp = v;
12880                    Py_INCREF(temp);
12881                }
12882                else {
12883                    if (c == 's')
12884                        temp = PyObject_Str(v);
12885                    else if (c == 'r')
12886                        temp = PyObject_Repr(v);
12887                    else
12888                        temp = PyObject_ASCII(v);
12889                    if (temp == NULL)
12890                        goto onError;
12891                    if (PyUnicode_Check(temp))
12892                        /* nothing to do */;
12893                    else {
12894                        Py_DECREF(temp);
12895                        PyErr_SetString(PyExc_TypeError,
12896                                        "%s argument has non-string str()");
12897                        goto onError;
12898                    }
12899                }
12900                if (PyUnicode_READY(temp) == -1) {
12901                    Py_CLEAR(temp);
12902                    goto onError;
12903                }
12904                pbuf = PyUnicode_DATA(temp);
12905                kind = PyUnicode_KIND(temp);
12906                len = PyUnicode_GET_LENGTH(temp);
12907                if (prec >= 0 && len > prec)
12908                    len = prec;
12909                break;
12910
12911            case 'i':
12912            case 'd':
12913            case 'u':
12914            case 'o':
12915            case 'x':
12916            case 'X':
12917                isnumok = 0;
12918                if (PyNumber_Check(v)) {
12919                    PyObject *iobj=NULL;
12920
12921                    if (PyLong_Check(v)) {
12922                        iobj = v;
12923                        Py_INCREF(iobj);
12924                    }
12925                    else {
12926                        iobj = PyNumber_Long(v);
12927                    }
12928                    if (iobj!=NULL) {
12929                        if (PyLong_Check(iobj)) {
12930                            isnumok = 1;
12931                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
12932                            Py_DECREF(iobj);
12933                            if (!temp)
12934                                goto onError;
12935                            if (PyUnicode_READY(temp) == -1) {
12936                                Py_CLEAR(temp);
12937                                goto onError;
12938                            }
12939                            pbuf = PyUnicode_DATA(temp);
12940                            kind = PyUnicode_KIND(temp);
12941                            len = PyUnicode_GET_LENGTH(temp);
12942                            sign = 1;
12943                        }
12944                        else {
12945                            Py_DECREF(iobj);
12946                        }
12947                    }
12948                }
12949                if (!isnumok) {
12950                    PyErr_Format(PyExc_TypeError,
12951                                 "%%%c format: a number is required, "
12952                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12953                    goto onError;
12954                }
12955                if (flags & F_ZERO)
12956                    fill = '0';
12957                break;
12958
12959            case 'e':
12960            case 'E':
12961            case 'f':
12962            case 'F':
12963            case 'g':
12964            case 'G':
12965                temp = formatfloat(v, flags, prec, c);
12966                if (!temp)
12967                    goto onError;
12968                if (PyUnicode_READY(temp) == -1) {
12969                    Py_CLEAR(temp);
12970                    goto onError;
12971                }
12972                pbuf = PyUnicode_DATA(temp);
12973                kind = PyUnicode_KIND(temp);
12974                len = PyUnicode_GET_LENGTH(temp);
12975                sign = 1;
12976                if (flags & F_ZERO)
12977                    fill = '0';
12978                break;
12979
12980            case 'c':
12981                pbuf = formatbuf;
12982                kind = PyUnicode_4BYTE_KIND;
12983                len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
12984                if (len < 0)
12985                    goto onError;
12986                break;
12987
12988            default:
12989                PyErr_Format(PyExc_ValueError,
12990                             "unsupported format character '%c' (0x%x) "
12991                             "at index %zd",
12992                             (31<=c && c<=126) ? (char)c : '?',
12993                             (int)c,
12994                             fmtpos - 1);
12995                goto onError;
12996            }
12997            /* pbuf is initialized here. */
12998            pindex = 0;
12999            if (sign) {
13000                if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
13001                    PyUnicode_READ(kind, pbuf, pindex) == '+') {
13002                    sign = PyUnicode_READ(kind, pbuf, pindex++);
13003                    len--;
13004                }
13005                else if (flags & F_SIGN)
13006                    sign = '+';
13007                else if (flags & F_BLANK)
13008                    sign = ' ';
13009                else
13010                    sign = 0;
13011            }
13012            if (width < len)
13013                width = len;
13014            if (rescnt - (sign != 0) < width) {
13015                reslen -= rescnt;
13016                rescnt = width + fmtcnt + 100;
13017                reslen += rescnt;
13018                if (reslen < 0) {
13019                    Py_XDECREF(temp);
13020                    PyErr_NoMemory();
13021                    goto onError;
13022                }
13023                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13024                if (res0 == 0) {
13025                    PyErr_NoMemory();
13026                    Py_XDECREF(temp);
13027                    goto onError;
13028                }
13029                res = res0 + reslen - rescnt;
13030            }
13031            if (sign) {
13032                if (fill != ' ')
13033                    *res++ = sign;
13034                rescnt--;
13035                if (width > len)
13036                    width--;
13037            }
13038            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13039                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13040                assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13041                if (fill != ' ') {
13042                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13043                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13044                }
13045                rescnt -= 2;
13046                width -= 2;
13047                if (width < 0)
13048                    width = 0;
13049                len -= 2;
13050            }
13051            if (width > len && !(flags & F_LJUST)) {
13052                do {
13053                    --rescnt;
13054                    *res++ = fill;
13055                } while (--width > len);
13056            }
13057            if (fill == ' ') {
13058                if (sign)
13059                    *res++ = sign;
13060                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13061                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13062                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13063                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13064                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13065                }
13066            }
13067            /* Copy all characters, preserving len */
13068            len1 = len;
13069            while (len1--) {
13070                *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13071                rescnt--;
13072            }
13073            while (--width >= len) {
13074                --rescnt;
13075                *res++ = ' ';
13076            }
13077            if (dict && (argidx < arglen) && c != '%') {
13078                PyErr_SetString(PyExc_TypeError,
13079                                "not all arguments converted during string formatting");
13080                Py_XDECREF(temp);
13081                goto onError;
13082            }
13083            Py_XDECREF(temp);
13084        } /* '%' */
13085    } /* until end */
13086    if (argidx < arglen && !dict) {
13087        PyErr_SetString(PyExc_TypeError,
13088                        "not all arguments converted during string formatting");
13089        goto onError;
13090    }
13091
13092
13093    for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13094        if (*res > max)
13095            max = *res;
13096    result = PyUnicode_New(reslen - rescnt, max);
13097    if (!result)
13098        goto onError;
13099    kind = PyUnicode_KIND(result);
13100    for (res = res0; res < res0+reslen-rescnt; res++)
13101        PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13102    PyMem_Free(res0);
13103    if (args_owned) {
13104        Py_DECREF(args);
13105    }
13106    Py_DECREF(uformat);
13107    assert(_PyUnicode_CheckConsistency(result, 1));
13108    return (PyObject *)result;
13109
13110  onError:
13111    PyMem_Free(res0);
13112    Py_DECREF(uformat);
13113    if (args_owned) {
13114        Py_DECREF(args);
13115    }
13116    return NULL;
13117}
13118
13119static PyObject *
13120unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13121
13122static PyObject *
13123unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13124{
13125    PyObject *x = NULL;
13126    static char *kwlist[] = {"object", "encoding", "errors", 0};
13127    char *encoding = NULL;
13128    char *errors = NULL;
13129
13130    if (type != &PyUnicode_Type)
13131        return unicode_subtype_new(type, args, kwds);
13132    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
13133                                     kwlist, &x, &encoding, &errors))
13134        return NULL;
13135    if (x == NULL)
13136        return (PyObject *)PyUnicode_New(0, 0);
13137    if (encoding == NULL && errors == NULL)
13138        return PyObject_Str(x);
13139    else
13140        return PyUnicode_FromEncodedObject(x, encoding, errors);
13141}
13142
13143static PyObject *
13144unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13145{
13146    PyUnicodeObject *unicode, *self;
13147    Py_ssize_t length, char_size;
13148    int share_wstr, share_utf8;
13149    unsigned int kind;
13150    void *data;
13151
13152    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13153
13154    unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13155    if (unicode == NULL)
13156        return NULL;
13157    assert(_PyUnicode_CHECK(unicode));
13158    if (PyUnicode_READY(unicode))
13159        return NULL;
13160
13161    self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13162    if (self == NULL) {
13163        Py_DECREF(unicode);
13164        return NULL;
13165    }
13166    kind = PyUnicode_KIND(unicode);
13167    length = PyUnicode_GET_LENGTH(unicode);
13168
13169    _PyUnicode_LENGTH(self) = length;
13170#ifdef Py_DEBUG
13171    _PyUnicode_HASH(self) = -1;
13172#else
13173    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13174#endif
13175    _PyUnicode_STATE(self).interned = 0;
13176    _PyUnicode_STATE(self).kind = kind;
13177    _PyUnicode_STATE(self).compact = 0;
13178    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13179    _PyUnicode_STATE(self).ready = 1;
13180    _PyUnicode_WSTR(self) = NULL;
13181    _PyUnicode_UTF8_LENGTH(self) = 0;
13182    _PyUnicode_UTF8(self) = NULL;
13183    _PyUnicode_WSTR_LENGTH(self) = 0;
13184    _PyUnicode_DATA_ANY(self) = NULL;
13185
13186    share_utf8 = 0;
13187    share_wstr = 0;
13188    if (kind == PyUnicode_1BYTE_KIND) {
13189        char_size = 1;
13190        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13191            share_utf8 = 1;
13192    }
13193    else if (kind == PyUnicode_2BYTE_KIND) {
13194        char_size = 2;
13195        if (sizeof(wchar_t) == 2)
13196            share_wstr = 1;
13197    }
13198    else {
13199        assert(kind == PyUnicode_4BYTE_KIND);
13200        char_size = 4;
13201        if (sizeof(wchar_t) == 4)
13202            share_wstr = 1;
13203    }
13204
13205    /* Ensure we won't overflow the length. */
13206    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13207        PyErr_NoMemory();
13208        goto onError;
13209    }
13210    data = PyObject_MALLOC((length + 1) * char_size);
13211    if (data == NULL) {
13212        PyErr_NoMemory();
13213        goto onError;
13214    }
13215
13216    _PyUnicode_DATA_ANY(self) = data;
13217    if (share_utf8) {
13218        _PyUnicode_UTF8_LENGTH(self) = length;
13219        _PyUnicode_UTF8(self) = data;
13220    }
13221    if (share_wstr) {
13222        _PyUnicode_WSTR_LENGTH(self) = length;
13223        _PyUnicode_WSTR(self) = (wchar_t *)data;
13224    }
13225
13226    Py_MEMCPY(data, PyUnicode_DATA(unicode),
13227              PyUnicode_KIND_SIZE(kind, length + 1));
13228    Py_DECREF(unicode);
13229    assert(_PyUnicode_CheckConsistency(self, 1));
13230#ifdef Py_DEBUG
13231    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13232#endif
13233    return (PyObject *)self;
13234
13235onError:
13236    Py_DECREF(unicode);
13237    Py_DECREF(self);
13238    return NULL;
13239}
13240
13241PyDoc_STRVAR(unicode_doc,
13242             "str(string[, encoding[, errors]]) -> str\n\
13243\n\
13244Create a new string object from the given encoded string.\n\
13245encoding defaults to the current default string encoding.\n\
13246errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
13247
13248static PyObject *unicode_iter(PyObject *seq);
13249
13250PyTypeObject PyUnicode_Type = {
13251    PyVarObject_HEAD_INIT(&PyType_Type, 0)
13252    "str",              /* tp_name */
13253    sizeof(PyUnicodeObject),        /* tp_size */
13254    0,                  /* tp_itemsize */
13255    /* Slots */
13256    (destructor)unicode_dealloc,    /* tp_dealloc */
13257    0,                  /* tp_print */
13258    0,                  /* tp_getattr */
13259    0,                  /* tp_setattr */
13260    0,                  /* tp_reserved */
13261    unicode_repr,           /* tp_repr */
13262    &unicode_as_number,         /* tp_as_number */
13263    &unicode_as_sequence,       /* tp_as_sequence */
13264    &unicode_as_mapping,        /* tp_as_mapping */
13265    (hashfunc) unicode_hash,        /* tp_hash*/
13266    0,                  /* tp_call*/
13267    (reprfunc) unicode_str,     /* tp_str */
13268    PyObject_GenericGetAttr,        /* tp_getattro */
13269    0,                  /* tp_setattro */
13270    0,                  /* tp_as_buffer */
13271    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
13272    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
13273    unicode_doc,            /* tp_doc */
13274    0,                  /* tp_traverse */
13275    0,                  /* tp_clear */
13276    PyUnicode_RichCompare,      /* tp_richcompare */
13277    0,                  /* tp_weaklistoffset */
13278    unicode_iter,           /* tp_iter */
13279    0,                  /* tp_iternext */
13280    unicode_methods,            /* tp_methods */
13281    0,                  /* tp_members */
13282    0,                  /* tp_getset */
13283    &PyBaseObject_Type,         /* tp_base */
13284    0,                  /* tp_dict */
13285    0,                  /* tp_descr_get */
13286    0,                  /* tp_descr_set */
13287    0,                  /* tp_dictoffset */
13288    0,                  /* tp_init */
13289    0,                  /* tp_alloc */
13290    unicode_new,            /* tp_new */
13291    PyObject_Del,           /* tp_free */
13292};
13293
13294/* Initialize the Unicode implementation */
13295
13296void _PyUnicode_Init(void)
13297{
13298    int i;
13299
13300    /* XXX - move this array to unicodectype.c ? */
13301    Py_UCS2 linebreak[] = {
13302        0x000A, /* LINE FEED */
13303        0x000D, /* CARRIAGE RETURN */
13304        0x001C, /* FILE SEPARATOR */
13305        0x001D, /* GROUP SEPARATOR */
13306        0x001E, /* RECORD SEPARATOR */
13307        0x0085, /* NEXT LINE */
13308        0x2028, /* LINE SEPARATOR */
13309        0x2029, /* PARAGRAPH SEPARATOR */
13310    };
13311
13312    /* Init the implementation */
13313    unicode_empty = PyUnicode_New(0, 0);
13314    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
13315    if (!unicode_empty)
13316        Py_FatalError("Can't create empty string");
13317
13318    for (i = 0; i < 256; i++)
13319        unicode_latin1[i] = NULL;
13320    if (PyType_Ready(&PyUnicode_Type) < 0)
13321        Py_FatalError("Can't initialize 'unicode'");
13322
13323    /* initialize the linebreak bloom filter */
13324    bloom_linebreak = make_bloom_mask(
13325        PyUnicode_2BYTE_KIND, linebreak,
13326        Py_ARRAY_LENGTH(linebreak));
13327
13328    PyType_Ready(&EncodingMapType);
13329}
13330
13331/* Finalize the Unicode implementation */
13332
/* Clear the free list of the Unicode implementation.

   Nothing is released here; the function always reports that zero
   entries were freed. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13338
13339void
13340_PyUnicode_Fini(void)
13341{
13342    int i;
13343
13344    Py_XDECREF(unicode_empty);
13345    unicode_empty = NULL;
13346
13347    for (i = 0; i < 256; i++) {
13348        if (unicode_latin1[i]) {
13349            Py_DECREF(unicode_latin1[i]);
13350            unicode_latin1[i] = NULL;
13351        }
13352    }
13353    (void)PyUnicode_ClearFreeList();
13354}
13355
13356void
13357PyUnicode_InternInPlace(PyObject **p)
13358{
13359    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13360    PyObject *t;
13361#ifdef Py_DEBUG
13362    assert(s != NULL);
13363    assert(_PyUnicode_CHECK(s));
13364#else
13365    if (s == NULL || !PyUnicode_Check(s))
13366        return;
13367#endif
13368    /* If it's a subclass, we don't really know what putting
13369       it in the interned dict might do. */
13370    if (!PyUnicode_CheckExact(s))
13371        return;
13372    if (PyUnicode_CHECK_INTERNED(s))
13373        return;
13374    if (_PyUnicode_READY_REPLACE(p)) {
13375        assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
13376        return;
13377    }
13378    s = (PyUnicodeObject *)(*p);
13379    if (interned == NULL) {
13380        interned = PyDict_New();
13381        if (interned == NULL) {
13382            PyErr_Clear(); /* Don't leave an exception */
13383            return;
13384        }
13385    }
13386    /* It might be that the GetItem call fails even
13387       though the key is present in the dictionary,
13388       namely when this happens during a stack overflow. */
13389    Py_ALLOW_RECURSION
13390        t = PyDict_GetItem(interned, (PyObject *)s);
13391    Py_END_ALLOW_RECURSION
13392
13393        if (t) {
13394            Py_INCREF(t);
13395            Py_DECREF(*p);
13396            *p = t;
13397            return;
13398        }
13399
13400    PyThreadState_GET()->recursion_critical = 1;
13401    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13402        PyErr_Clear();
13403        PyThreadState_GET()->recursion_critical = 0;
13404        return;
13405    }
13406    PyThreadState_GET()->recursion_critical = 0;
13407    /* The two references in interned are not counted by refcnt.
13408       The deallocator will take care of this */
13409    Py_REFCNT(s) -= 2;
13410    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
13411}
13412
13413void
13414PyUnicode_InternImmortal(PyObject **p)
13415{
13416    PyUnicodeObject *u = (PyUnicodeObject *)*p;
13417
13418    PyUnicode_InternInPlace(p);
13419    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
13420        _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
13421        Py_INCREF(*p);
13422    }
13423}
13424
13425PyObject *
13426PyUnicode_InternFromString(const char *cp)
13427{
13428    PyObject *s = PyUnicode_FromString(cp);
13429    if (s == NULL)
13430        return NULL;
13431    PyUnicode_InternInPlace(&s);
13432    return s;
13433}
13434
13435void
13436_Py_ReleaseInternedUnicodeStrings(void)
13437{
13438    PyObject *keys;
13439    PyUnicodeObject *s;
13440    Py_ssize_t i, n;
13441    Py_ssize_t immortal_size = 0, mortal_size = 0;
13442
13443    if (interned == NULL || !PyDict_Check(interned))
13444        return;
13445    keys = PyDict_Keys(interned);
13446    if (keys == NULL || !PyList_Check(keys)) {
13447        PyErr_Clear();
13448        return;
13449    }
13450
13451    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13452       detector, interned unicode strings are not forcibly deallocated;
13453       rather, we give them their stolen references back, and then clear
13454       and DECREF the interned dict. */
13455
13456    n = PyList_GET_SIZE(keys);
13457    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
13458            n);
13459    for (i = 0; i < n; i++) {
13460        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
13461        if (PyUnicode_READY(s) == -1) {
13462            assert(0 && "could not ready string");
13463            fprintf(stderr, "could not ready string\n");
13464        }
13465        switch (PyUnicode_CHECK_INTERNED(s)) {
13466        case SSTATE_NOT_INTERNED:
13467            /* XXX Shouldn't happen */
13468            break;
13469        case SSTATE_INTERNED_IMMORTAL:
13470            Py_REFCNT(s) += 1;
13471            immortal_size += PyUnicode_GET_LENGTH(s);
13472            break;
13473        case SSTATE_INTERNED_MORTAL:
13474            Py_REFCNT(s) += 2;
13475            mortal_size += PyUnicode_GET_LENGTH(s);
13476            break;
13477        default:
13478            Py_FatalError("Inconsistent interned string state.");
13479        }
13480        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
13481    }
13482    fprintf(stderr, "total size of all interned strings: "
13483            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13484            "mortal/immortal\n", mortal_size, immortal_size);
13485    Py_DECREF(keys);
13486    PyDict_Clear(interned);
13487    Py_DECREF(interned);
13488    interned = NULL;
13489}
13490
13491
13492/********************* Unicode Iterator **************************/
13493
13494typedef struct {
13495    PyObject_HEAD
13496    Py_ssize_t it_index;
13497    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
13498} unicodeiterobject;
13499
13500static void
13501unicodeiter_dealloc(unicodeiterobject *it)
13502{
13503    _PyObject_GC_UNTRACK(it);
13504    Py_XDECREF(it->it_seq);
13505    PyObject_GC_Del(it);
13506}
13507
13508static int
13509unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13510{
13511    Py_VISIT(it->it_seq);
13512    return 0;
13513}
13514
13515static PyObject *
13516unicodeiter_next(unicodeiterobject *it)
13517{
13518    PyUnicodeObject *seq;
13519    PyObject *item;
13520
13521    assert(it != NULL);
13522    seq = it->it_seq;
13523    if (seq == NULL)
13524        return NULL;
13525    assert(_PyUnicode_CHECK(seq));
13526
13527    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13528        int kind = PyUnicode_KIND(seq);
13529        void *data = PyUnicode_DATA(seq);
13530        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13531        item = PyUnicode_FromOrdinal(chr);
13532        if (item != NULL)
13533            ++it->it_index;
13534        return item;
13535    }
13536
13537    Py_DECREF(seq);
13538    it->it_seq = NULL;
13539    return NULL;
13540}
13541
13542static PyObject *
13543unicodeiter_len(unicodeiterobject *it)
13544{
13545    Py_ssize_t len = 0;
13546    if (it->it_seq)
13547        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13548    return PyLong_FromSsize_t(len);
13549}
13550
13551PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13552
13553static PyMethodDef unicodeiter_methods[] = {
13554    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
13555     length_hint_doc},
13556    {NULL,      NULL}       /* sentinel */
13557};
13558
13559PyTypeObject PyUnicodeIter_Type = {
13560    PyVarObject_HEAD_INIT(&PyType_Type, 0)
13561    "str_iterator",         /* tp_name */
13562    sizeof(unicodeiterobject),      /* tp_basicsize */
13563    0,                  /* tp_itemsize */
13564    /* methods */
13565    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
13566    0,                  /* tp_print */
13567    0,                  /* tp_getattr */
13568    0,                  /* tp_setattr */
13569    0,                  /* tp_reserved */
13570    0,                  /* tp_repr */
13571    0,                  /* tp_as_number */
13572    0,                  /* tp_as_sequence */
13573    0,                  /* tp_as_mapping */
13574    0,                  /* tp_hash */
13575    0,                  /* tp_call */
13576    0,                  /* tp_str */
13577    PyObject_GenericGetAttr,        /* tp_getattro */
13578    0,                  /* tp_setattro */
13579    0,                  /* tp_as_buffer */
13580    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13581    0,                  /* tp_doc */
13582    (traverseproc)unicodeiter_traverse, /* tp_traverse */
13583    0,                  /* tp_clear */
13584    0,                  /* tp_richcompare */
13585    0,                  /* tp_weaklistoffset */
13586    PyObject_SelfIter,          /* tp_iter */
13587    (iternextfunc)unicodeiter_next,     /* tp_iternext */
13588    unicodeiter_methods,            /* tp_methods */
13589    0,
13590};
13591
13592static PyObject *
13593unicode_iter(PyObject *seq)
13594{
13595    unicodeiterobject *it;
13596
13597    if (!PyUnicode_Check(seq)) {
13598        PyErr_BadInternalCall();
13599        return NULL;
13600    }
13601    if (PyUnicode_READY(seq) == -1)
13602        return NULL;
13603    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13604    if (it == NULL)
13605        return NULL;
13606    it->it_index = 0;
13607    Py_INCREF(seq);
13608    it->it_seq = (PyUnicodeObject *)seq;
13609    _PyObject_GC_TRACK(it);
13610    return (PyObject *)it;
13611}
13612
13613#define UNIOP(x) Py_UNICODE_##x
13614#define UNIOP_t Py_UNICODE
13615#include "uniops.h"
13616#undef UNIOP
13617#undef UNIOP_t
13618#define UNIOP(x) Py_UCS4_##x
13619#define UNIOP_t Py_UCS4
13620#include "uniops.h"
13621#undef UNIOP
13622#undef UNIOP_t
13623
13624Py_UNICODE*
13625PyUnicode_AsUnicodeCopy(PyObject *object)
13626{
13627    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13628    Py_UNICODE *copy;
13629    Py_ssize_t size;
13630
13631    if (!PyUnicode_Check(unicode)) {
13632        PyErr_BadArgument();
13633        return NULL;
13634    }
13635    /* Ensure we won't overflow the size. */
13636    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13637        PyErr_NoMemory();
13638        return NULL;
13639    }
13640    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13641    size *= sizeof(Py_UNICODE);
13642    copy = PyMem_Malloc(size);
13643    if (copy == NULL) {
13644        PyErr_NoMemory();
13645        return NULL;
13646    }
13647    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13648    return copy;
13649}
13650
13651/* A _string module, to export formatter_parser and formatter_field_name_split
13652   to the string.Formatter class implemented in Python. */
13653
13654static PyMethodDef _string_methods[] = {
13655    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13656     METH_O, PyDoc_STR("split the argument as a field name")},
13657    {"formatter_parser", (PyCFunction) formatter_parser,
13658     METH_O, PyDoc_STR("parse the argument as a format string")},
13659    {NULL, NULL}
13660};
13661
13662static struct PyModuleDef _string_module = {
13663    PyModuleDef_HEAD_INIT,
13664    "_string",
13665    PyDoc_STR("string helper module"),
13666    0,
13667    _string_methods,
13668    NULL,
13669    NULL,
13670    NULL,
13671    NULL
13672};
13673
13674PyMODINIT_FUNC
13675PyInit__string(void)
13676{
13677    return PyModule_Create(&_string_module);
13678}
13679
13680
13681#ifdef __cplusplus
13682}
13683#endif
13684