unicodeobject.c revision 794d567b173e4cc10ad233aeb8743283ea9c3e6b
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44
45#ifdef MS_WINDOWS
46#include <windows.h>
47#endif
48
#if defined(Py_DEBUG)
#  define DONT_MAKE_RESULT_READY
#endif

/* Upper bound on the number of entries kept in the Unicode object
   free list. */

#define PyUnicode_MAXFREELIST       1024

/* Size threshold for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory, which saves malloc() calls for small
   Unicode objects.

   Worst case, this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively disables the feature.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection: big endian when WORDS_BIGENDIAN is defined,
   little endian otherwise. */

#if !defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
84/* --- Globals ------------------------------------------------------------
85
86   The globals are initialized by the _PyUnicode_Init() API and should
87   not be used before calling that API.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Sanity check used by the accessor macros below: the structural
   consistency check in Py_DEBUG builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 buffer of a ready string.  For compact
   ASCII strings this is the memory located right after the
   PyASCIIObject header; otherwise it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string.  For compact ASCII strings
   this is the length field of the PyASCIIObject header; otherwise it
   is the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked field accessors; all of them expand to lvalues. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Kind and length accessors that assert _PyUnicode_CHECK() but, unlike
   the PyUnicode_UTF8*() macros above, do not assert that the string is
   ready. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* Local replacement for PyUnicode_READY: evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same shape as PyUnicode_READY, but takes a pointer to the object
   pointer and delegates to _PyUnicode_ReadyReplace() when the string is
   not ready yet. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*p_obj)),                  \
     (PyUnicode_IS_READY(*p_obj) ?                      \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))

/* True if the utf8 pointer is the same pointer as the string data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same pointer as its string
   data, i.e. no separate wchar_t buffer is held.  The macro only refers
   to its own argument, so it can be applied to any expression and not
   just to a variable that happens to be called `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its
   utf8 pointer is non-NULL, and that pointer differs from the string
   data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the string is not ready yet or wstr differs from the string data
   pointer.  The readiness test comes first so that PyUnicode_DATA() is
   only evaluated for ready strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Element-wise conversion between character arrays of different widths.

   from_type and to_type must be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters
   (end is exclusive).  to points to the destination buffer, which is
   accessed as "to_type *"; every source character is cast to to_type
   and stored at the matching position. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_++ = (to_type)*iter_++;                 \
        }                                               \
    } while (0)

/* The Unicode string has been modified: invalidate the cached hash by
   resetting it to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
190
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Put differently, the actual reference count of a string is:
   s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance.  The
   allocators in this file return a new reference to it for zero-length
   requests once it is non-NULL. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well (one slot per code point 0..255). */
static PyObject *unicode_latin1[256];
210
/* Lookup table for fast detection of the most frequent whitespace
   characters.  Indexed by code point 0..127; an entry is 1 for:

     0x09 CHARACTER TABULATION    0x1C FILE SEPARATOR
     0x0A LINE FEED               0x1D GROUP SEPARATOR
     0x0B LINE TABULATION         0x1E RECORD SEPARATOR
     0x0C FORM FEED               0x1F UNIT SEPARATOR
     0x0D CARRIAGE RETURN         0x20 SPACE

   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
241
/* Forward declarations of static helpers that are referenced before
   their definitions. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static void copy_characters(
    PyObject *to, Py_ssize_t to_start,
    PyObject *from, Py_ssize_t from_start,
    Py_ssize_t how_many);
#ifdef Py_DEBUG
/* Only declared in debug builds; used by _PyUnicode_CheckConsistency(). */
static int unicode_is_singleton(PyObject *unicode);
#endif

/* NOTE(review): presumably invokes the codec error callback for the
   range [startpos, endpos) and reports the resume position through
   *newpos -- confirm against the definition. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

/* NOTE(review): presumably raises an encode error describing the range
   [startpos, endpos) -- confirm against the definition. */
static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
265
/* Lookup table for fast detection of ASCII line break characters.
   Indexed by code point 0..127; an entry is 1 for:

     0x0A LINE FEED               0x1C FILE SEPARATOR
     0x0B LINE TABULATION         0x1D GROUP SEPARATOR
     0x0C FORM FEED               0x1E RECORD SEPARATOR
     0x0D CARRIAGE RETURN

   and 0 for everything else.  The table is only ever read (see
   BLOOM_LINEBREAK), so it is declared const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
293
294/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
295   This function is kept for backward compatibility with the old API. */
296Py_UNICODE
297PyUnicode_GetMax(void)
298{
299#ifdef Py_UNICODE_WIDE
300    return 0x10FFFF;
301#else
302    /* This is actually an illegal character, so it should
303       not be passed to unichr. */
304    return 0xFFFF;
305#endif
306}
307
#ifdef Py_DEBUG
/* Debug-build invariant checker for a unicode object.

   Asserts that the state bits (kind, compact, ascii, ready, interned)
   and the data/utf8/wstr pointers and lengths of `op` agree with each
   other.  When check_content is non-zero it additionally scans the
   characters to verify that the narrowest suitable kind is used and,
   for objects that unicode_is_singleton() does not report as
   singletons, that the hash is still unset (-1).

   Always returns 1, so the call can be wrapped in assert(); a violated
   invariant triggers the failing assert() inside this function. */
int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII: only the PyASCIIObject fields exist */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        } else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready: only wstr is set */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* legacy string that is ready: separate data buffer */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing buffer must come with a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        /* find the largest code point actually stored */
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND)
            assert(maxchar >= 0x100);
        else
            assert(maxchar >= 0x10000);
    }
    if (check_content && !unicode_is_singleton((PyObject*)ascii))
        assert(ascii->hash == -1);
    return 1;
}
#endif
413
414/* --- Bloom Filters ----------------------------------------------------- */
415
/* Support for simple "bloom filters" over Unicode characters.  To keep
   things simple, a single bitmask is used; each character maps to the
   bit whose index is the character's low bits, i.e.
   ch & (BLOOM_WIDTH - 1). */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the largest of 128, 64 and 32 that does not exceed
   LONG_BIT; narrower longs are rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue);
   BLOOM evaluates to non-zero if the bit for ch is set in mask.  A set
   bit only means "possibly present": different characters can map to
   the same bit. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: a table lookup for ch < 128, otherwise the bloom
   mask is used as a cheap pre-filter before Py_UNICODE_ISLINEBREAK().
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
442
443Py_LOCAL_INLINE(BLOOM_MASK)
444make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
445{
446    /* calculate simple bloom-style bitmask for a given unicode string */
447
448    BLOOM_MASK mask;
449    Py_ssize_t i;
450
451    mask = 0;
452    for (i = 0; i < len; i++)
453        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
454
455    return mask;
456}
457
/* Membership test for chr in the string str, where mask is a bloom mask
   for str: the mask is checked first, and PyUnicode_FindChar() is only
   called over the whole string when the bit for chr is set. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
461
462/* --- Unicode Object ----------------------------------------------------- */
463
/* Forward declaration; the definition is not part of this section.
   NOTE(review): presumably applies fixfct to (a copy of) self and
   returns the resulting string -- confirm against the definition. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
466
467Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
468                                 Py_ssize_t size, Py_UCS4 ch,
469                                 int direction)
470{
471    /* like wcschr, but doesn't stop at NULL characters */
472    Py_ssize_t i;
473    if (direction == 1) {
474        for(i = 0; i < size; i++)
475            if (PyUnicode_READ(kind, s, i) == ch)
476                return (char*)s + kind * i;
477    }
478    else {
479        for(i = size-1; i >= 0; i--)
480            if (PyUnicode_READ(kind, s, i) == ch)
481                return (char*)s + kind * i;
482    }
483    return NULL;
484}
485
486static PyObject*
487resize_compact(PyObject *unicode, Py_ssize_t length)
488{
489    Py_ssize_t char_size;
490    Py_ssize_t struct_size;
491    Py_ssize_t new_size;
492    int share_wstr;
493
494    assert(PyUnicode_IS_READY(unicode));
495    char_size = PyUnicode_KIND(unicode);
496    if (PyUnicode_IS_COMPACT_ASCII(unicode))
497        struct_size = sizeof(PyASCIIObject);
498    else
499        struct_size = sizeof(PyCompactUnicodeObject);
500    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
501
502    _Py_DEC_REFTOTAL;
503    _Py_ForgetReference(unicode);
504
505    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
506        PyErr_NoMemory();
507        return NULL;
508    }
509    new_size = (struct_size + (length + 1) * char_size);
510
511    unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
512    if (unicode == NULL) {
513        PyObject_Del(unicode);
514        PyErr_NoMemory();
515        return NULL;
516    }
517    _Py_NewReference(unicode);
518    _PyUnicode_LENGTH(unicode) = length;
519    if (share_wstr) {
520        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
521        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
522            _PyUnicode_WSTR_LENGTH(unicode) = length;
523    }
524    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
525                    length, 0);
526    return unicode;
527}
528
529static int
530resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
531{
532    wchar_t *wstr;
533    assert(!PyUnicode_IS_COMPACT(unicode));
534    assert(Py_REFCNT(unicode) == 1);
535
536    _PyUnicode_DIRTY(unicode);
537
538    if (PyUnicode_IS_READY(unicode)) {
539        Py_ssize_t char_size;
540        Py_ssize_t new_size;
541        int share_wstr, share_utf8;
542        void *data;
543
544        data = _PyUnicode_DATA_ANY(unicode);
545        assert(data != NULL);
546        char_size = PyUnicode_KIND(unicode);
547        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
548        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
549        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
550        {
551            PyObject_DEL(_PyUnicode_UTF8(unicode));
552            _PyUnicode_UTF8(unicode) = NULL;
553            _PyUnicode_UTF8_LENGTH(unicode) = 0;
554        }
555
556        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
557            PyErr_NoMemory();
558            return -1;
559        }
560        new_size = (length + 1) * char_size;
561
562        data = (PyObject *)PyObject_REALLOC(data, new_size);
563        if (data == NULL) {
564            PyErr_NoMemory();
565            return -1;
566        }
567        _PyUnicode_DATA_ANY(unicode) = data;
568        if (share_wstr) {
569            _PyUnicode_WSTR(unicode) = data;
570            _PyUnicode_WSTR_LENGTH(unicode) = length;
571        }
572        if (share_utf8) {
573            _PyUnicode_UTF8(unicode) = data;
574            _PyUnicode_UTF8_LENGTH(unicode) = length;
575        }
576        _PyUnicode_LENGTH(unicode) = length;
577        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
578        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
579            assert(_PyUnicode_CheckConsistency(unicode, 0));
580            return 0;
581        }
582    }
583    assert(_PyUnicode_WSTR(unicode) != NULL);
584
585    /* check for integer overflow */
586    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
587        PyErr_NoMemory();
588        return -1;
589    }
590    wstr =  _PyUnicode_WSTR(unicode);
591    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
592    if (!wstr) {
593        PyErr_NoMemory();
594        return -1;
595    }
596    _PyUnicode_WSTR(unicode) = wstr;
597    _PyUnicode_WSTR(unicode)[length] = 0;
598    _PyUnicode_WSTR_LENGTH(unicode) = length;
599    assert(_PyUnicode_CheckConsistency(unicode, 0));
600    return 0;
601}
602
603static PyObject*
604resize_copy(PyObject *unicode, Py_ssize_t length)
605{
606    Py_ssize_t copy_length;
607    if (PyUnicode_IS_COMPACT(unicode)) {
608        PyObject *copy;
609        assert(PyUnicode_IS_READY(unicode));
610
611        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
612        if (copy == NULL)
613            return NULL;
614
615        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
616        copy_characters(copy, 0, unicode, 0, copy_length);
617        return copy;
618    }
619    else {
620        PyUnicodeObject *w;
621        assert(_PyUnicode_WSTR(unicode) != NULL);
622        assert(_PyUnicode_DATA_ANY(unicode) == NULL);
623        w = _PyUnicode_New(length);
624        if (w == NULL)
625            return NULL;
626        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
627        copy_length = Py_MIN(copy_length, length);
628        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
629                        copy_length);
630        return (PyObject*)w;
631    }
632}
633
/* We allocate one more character (not just one byte) than requested to
   make sure the string is U+0000 terminated; some code (e.g.
   new_identifier) relies on that.

   XXX This allocator could be enhanced further by ensuring that the
   free list never shrinks below a size of 1.

*/
642
#ifdef Py_DEBUG
/* Debug-build statistic: incremented by _PyUnicode_New() each time it
   gets past its argument checks and the shared empty string shortcut. */
int unicode_old_new_calls = 0;
#endif
646
647static PyUnicodeObject *
648_PyUnicode_New(Py_ssize_t length)
649{
650    register PyUnicodeObject *unicode;
651    size_t new_size;
652
653    /* Optimization for empty strings */
654    if (length == 0 && unicode_empty != NULL) {
655        Py_INCREF(unicode_empty);
656        return (PyUnicodeObject*)unicode_empty;
657    }
658
659    /* Ensure we won't overflow the size. */
660    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
661        return (PyUnicodeObject *)PyErr_NoMemory();
662    }
663    if (length < 0) {
664        PyErr_SetString(PyExc_SystemError,
665                        "Negative size passed to _PyUnicode_New");
666        return NULL;
667    }
668
669#ifdef Py_DEBUG
670    ++unicode_old_new_calls;
671#endif
672
673    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
674    if (unicode == NULL)
675        return NULL;
676    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
677    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
678    if (!_PyUnicode_WSTR(unicode)) {
679        PyErr_NoMemory();
680        goto onError;
681    }
682
683    /* Initialize the first element to guard against cases where
684     * the caller fails before initializing str -- unicode_resize()
685     * reads str[0], and the Keep-Alive optimization can keep memory
686     * allocated for str alive across a call to unicode_dealloc(unicode).
687     * We don't want unicode_resize to read uninitialized memory in
688     * that case.
689     */
690    _PyUnicode_WSTR(unicode)[0] = 0;
691    _PyUnicode_WSTR(unicode)[length] = 0;
692    _PyUnicode_WSTR_LENGTH(unicode) = length;
693    _PyUnicode_HASH(unicode) = -1;
694    _PyUnicode_STATE(unicode).interned = 0;
695    _PyUnicode_STATE(unicode).kind = 0;
696    _PyUnicode_STATE(unicode).compact = 0;
697    _PyUnicode_STATE(unicode).ready = 0;
698    _PyUnicode_STATE(unicode).ascii = 0;
699    _PyUnicode_DATA_ANY(unicode) = NULL;
700    _PyUnicode_LENGTH(unicode) = 0;
701    _PyUnicode_UTF8(unicode) = NULL;
702    _PyUnicode_UTF8_LENGTH(unicode) = 0;
703    return unicode;
704
705  onError:
706    /* XXX UNREF/NEWREF interface should be more symmetrical */
707    _Py_DEC_REFTOTAL;
708    _Py_ForgetReference((PyObject *)unicode);
709    PyObject_Del(unicode);
710    return NULL;
711}
712
713static const char*
714unicode_kind_name(PyObject *unicode)
715{
716    /* don't check consistency: unicode_kind_name() is called from
717       _PyUnicode_Dump() */
718    if (!PyUnicode_IS_COMPACT(unicode))
719    {
720        if (!PyUnicode_IS_READY(unicode))
721            return "wstr";
722        switch(PyUnicode_KIND(unicode))
723        {
724        case PyUnicode_1BYTE_KIND:
725            if (PyUnicode_IS_ASCII(unicode))
726                return "legacy ascii";
727            else
728                return "legacy latin1";
729        case PyUnicode_2BYTE_KIND:
730            return "legacy UCS2";
731        case PyUnicode_4BYTE_KIND:
732            return "legacy UCS4";
733        default:
734            return "<legacy invalid kind>";
735        }
736    }
737    assert(PyUnicode_IS_READY(unicode));
738    switch(PyUnicode_KIND(unicode))
739    {
740    case PyUnicode_1BYTE_KIND:
741        if (PyUnicode_IS_ASCII(unicode))
742            return "ascii";
743        else
744            return "latin1";
745    case PyUnicode_2BYTE_KIND:
746        return "UCS2";
747    case PyUnicode_4BYTE_KIND:
748        return "UCS4";
749    default:
750        return "<invalid compact kind>";
751    }
752}
753
754#ifdef Py_DEBUG
/* Debug-build statistic: incremented by PyUnicode_New() for every
   request that is not served by the shared empty string. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Function form of the PyUnicode_DATA() macro.  Before returning, it
   prints the object address, the compact / compact-ASCII flags and the
   candidate data addresses to stdout. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
774
775void
776_PyUnicode_Dump(PyObject *op)
777{
778    PyASCIIObject *ascii = (PyASCIIObject *)op;
779    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
780    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
781    void *data;
782    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
783    if (ascii->state.compact)
784        data = (compact + 1);
785    else
786        data = unicode->data.any;
787    if (ascii->wstr == data)
788        printf("shared ");
789    printf("wstr=%p", ascii->wstr);
790    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
791        printf(" (%zu), ", compact->wstr_length);
792        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
793            printf("shared ");
794        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
795    }
796    printf(", data=%p\n", data);
797}
798#endif
799
800PyObject *
801PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
802{
803    PyObject *obj;
804    PyCompactUnicodeObject *unicode;
805    void *data;
806    int kind_state;
807    int is_sharing, is_ascii;
808    Py_ssize_t char_size;
809    Py_ssize_t struct_size;
810
811    /* Optimization for empty strings */
812    if (size == 0 && unicode_empty != NULL) {
813        Py_INCREF(unicode_empty);
814        return unicode_empty;
815    }
816
817#ifdef Py_DEBUG
818    ++unicode_new_new_calls;
819#endif
820
821    is_ascii = 0;
822    is_sharing = 0;
823    struct_size = sizeof(PyCompactUnicodeObject);
824    if (maxchar < 128) {
825        kind_state = PyUnicode_1BYTE_KIND;
826        char_size = 1;
827        is_ascii = 1;
828        struct_size = sizeof(PyASCIIObject);
829    }
830    else if (maxchar < 256) {
831        kind_state = PyUnicode_1BYTE_KIND;
832        char_size = 1;
833    }
834    else if (maxchar < 65536) {
835        kind_state = PyUnicode_2BYTE_KIND;
836        char_size = 2;
837        if (sizeof(wchar_t) == 2)
838            is_sharing = 1;
839    }
840    else {
841        kind_state = PyUnicode_4BYTE_KIND;
842        char_size = 4;
843        if (sizeof(wchar_t) == 4)
844            is_sharing = 1;
845    }
846
847    /* Ensure we won't overflow the size. */
848    if (size < 0) {
849        PyErr_SetString(PyExc_SystemError,
850                        "Negative size passed to PyUnicode_New");
851        return NULL;
852    }
853    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
854        return PyErr_NoMemory();
855
856    /* Duplicated allocation code from _PyObject_New() instead of a call to
857     * PyObject_New() so we are able to allocate space for the object and
858     * it's data buffer.
859     */
860    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
861    if (obj == NULL)
862        return PyErr_NoMemory();
863    obj = PyObject_INIT(obj, &PyUnicode_Type);
864    if (obj == NULL)
865        return NULL;
866
867    unicode = (PyCompactUnicodeObject *)obj;
868    if (is_ascii)
869        data = ((PyASCIIObject*)obj) + 1;
870    else
871        data = unicode + 1;
872    _PyUnicode_LENGTH(unicode) = size;
873    _PyUnicode_HASH(unicode) = -1;
874    _PyUnicode_STATE(unicode).interned = 0;
875    _PyUnicode_STATE(unicode).kind = kind_state;
876    _PyUnicode_STATE(unicode).compact = 1;
877    _PyUnicode_STATE(unicode).ready = 1;
878    _PyUnicode_STATE(unicode).ascii = is_ascii;
879    if (is_ascii) {
880        ((char*)data)[size] = 0;
881        _PyUnicode_WSTR(unicode) = NULL;
882    }
883    else if (kind_state == PyUnicode_1BYTE_KIND) {
884        ((char*)data)[size] = 0;
885        _PyUnicode_WSTR(unicode) = NULL;
886        _PyUnicode_WSTR_LENGTH(unicode) = 0;
887        unicode->utf8 = NULL;
888        unicode->utf8_length = 0;
889        }
890    else {
891        unicode->utf8 = NULL;
892        unicode->utf8_length = 0;
893        if (kind_state == PyUnicode_2BYTE_KIND)
894            ((Py_UCS2*)data)[size] = 0;
895        else /* kind_state == PyUnicode_4BYTE_KIND */
896            ((Py_UCS4*)data)[size] = 0;
897        if (is_sharing) {
898            _PyUnicode_WSTR_LENGTH(unicode) = size;
899            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
900        }
901        else {
902            _PyUnicode_WSTR_LENGTH(unicode) = 0;
903            _PyUnicode_WSTR(unicode) = NULL;
904        }
905    }
906    assert(_PyUnicode_CheckConsistency(unicode, 0));
907    return obj;
908}
909
910#if SIZEOF_WCHAR_T == 2
911/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
912   will decode surrogate pairs, the other conversions are implemented as macros
913   for efficiency.
914
915   This function assumes that unicode can hold one more code point than wstr
916   characters for a terminating null character. */
917static void
918unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
919                              PyUnicodeObject *unicode)
920{
921    const wchar_t *iter;
922    Py_UCS4 *ucs4_out;
923
924    assert(unicode != NULL);
925    assert(_PyUnicode_CHECK(unicode));
926    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
927    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
928
929    for (iter = begin; iter < end; ) {
930        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
931                           _PyUnicode_GET_LENGTH(unicode)));
932        if (*iter >= 0xD800 && *iter <= 0xDBFF
933            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
934        {
935            *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
936            iter += 2;
937        }
938        else {
939            *ucs4_out++ = *iter;
940            iter++;
941        }
942    }
943    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
944                        _PyUnicode_GET_LENGTH(unicode)));
945
946}
947#endif
948
949static int
950_PyUnicode_Dirty(PyObject *unicode)
951{
952    assert(_PyUnicode_CHECK(unicode));
953    if (Py_REFCNT(unicode) != 1) {
954        PyErr_SetString(PyExc_SystemError,
955                        "Cannot modify a string having more than 1 reference");
956        return -1;
957    }
958    _PyUnicode_DIRTY(unicode);
959    return 0;
960}
961
962static int
963_copy_characters(PyObject *to, Py_ssize_t to_start,
964                 PyObject *from, Py_ssize_t from_start,
965                 Py_ssize_t how_many, int check_maxchar)
966{
967    unsigned int from_kind, to_kind;
968    void *from_data, *to_data;
969    int fast;
970
971    assert(PyUnicode_Check(from));
972    assert(PyUnicode_Check(to));
973    assert(PyUnicode_IS_READY(from));
974    assert(PyUnicode_IS_READY(to));
975
976    assert(PyUnicode_GET_LENGTH(from) >= how_many);
977    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
978    assert(0 <= how_many);
979
980    if (how_many == 0)
981        return 0;
982
983    from_kind = PyUnicode_KIND(from);
984    from_data = PyUnicode_DATA(from);
985    to_kind = PyUnicode_KIND(to);
986    to_data = PyUnicode_DATA(to);
987
988#ifdef Py_DEBUG
989    if (!check_maxchar
990        && (from_kind > to_kind
991            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
992    {
993        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
994        Py_UCS4 ch;
995        Py_ssize_t i;
996        for (i=0; i < how_many; i++) {
997            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
998            assert(ch <= to_maxchar);
999        }
1000    }
1001#endif
1002    fast = (from_kind == to_kind);
1003    if (check_maxchar
1004        && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1005    {
1006        /* deny latin1 => ascii */
1007        fast = 0;
1008    }
1009
1010    if (fast) {
1011        Py_MEMCPY((char*)to_data + to_kind * to_start,
1012                  (char*)from_data + from_kind * from_start,
1013                  to_kind * how_many);
1014    }
1015    else if (from_kind == PyUnicode_1BYTE_KIND
1016             && to_kind == PyUnicode_2BYTE_KIND)
1017    {
1018        _PyUnicode_CONVERT_BYTES(
1019            Py_UCS1, Py_UCS2,
1020            PyUnicode_1BYTE_DATA(from) + from_start,
1021            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1022            PyUnicode_2BYTE_DATA(to) + to_start
1023            );
1024    }
1025    else if (from_kind == PyUnicode_1BYTE_KIND
1026             && to_kind == PyUnicode_4BYTE_KIND)
1027    {
1028        _PyUnicode_CONVERT_BYTES(
1029            Py_UCS1, Py_UCS4,
1030            PyUnicode_1BYTE_DATA(from) + from_start,
1031            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1032            PyUnicode_4BYTE_DATA(to) + to_start
1033            );
1034    }
1035    else if (from_kind == PyUnicode_2BYTE_KIND
1036             && to_kind == PyUnicode_4BYTE_KIND)
1037    {
1038        _PyUnicode_CONVERT_BYTES(
1039            Py_UCS2, Py_UCS4,
1040            PyUnicode_2BYTE_DATA(from) + from_start,
1041            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1042            PyUnicode_4BYTE_DATA(to) + to_start
1043            );
1044    }
1045    else {
1046        /* check if max_char(from substring) <= max_char(to) */
1047        if (from_kind > to_kind
1048                /* latin1 => ascii */
1049            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1050        {
1051            /* slow path to check for character overflow */
1052            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1053            Py_UCS4 ch;
1054            Py_ssize_t i;
1055
1056#ifdef Py_DEBUG
1057            for (i=0; i < how_many; i++) {
1058                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1059                assert(ch <= to_maxchar);
1060                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1061            }
1062#else
1063            if (!check_maxchar) {
1064                for (i=0; i < how_many; i++) {
1065                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1066                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1067                }
1068            }
1069            else {
1070                for (i=0; i < how_many; i++) {
1071                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1072                    if (ch > to_maxchar)
1073                        return 1;
1074                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1075                }
1076            }
1077#endif
1078        }
1079        else {
1080            assert(0 && "inconsistent state");
1081            return 1;
1082        }
1083    }
1084    return 0;
1085}
1086
1087static void
1088copy_characters(PyObject *to, Py_ssize_t to_start,
1089                       PyObject *from, Py_ssize_t from_start,
1090                       Py_ssize_t how_many)
1091{
1092    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1093}
1094
1095Py_ssize_t
1096PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1097                         PyObject *from, Py_ssize_t from_start,
1098                         Py_ssize_t how_many)
1099{
1100    int err;
1101
1102    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1103        PyErr_BadInternalCall();
1104        return -1;
1105    }
1106
1107    if (PyUnicode_READY(from))
1108        return -1;
1109    if (PyUnicode_READY(to))
1110        return -1;
1111
1112    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1113    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1114        PyErr_Format(PyExc_SystemError,
1115                     "Cannot write %zi characters at %zi "
1116                     "in a string of %zi characters",
1117                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1118        return -1;
1119    }
1120
1121    if (how_many == 0)
1122        return 0;
1123
1124    if (_PyUnicode_Dirty(to))
1125        return -1;
1126
1127    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1128    if (err) {
1129        PyErr_Format(PyExc_SystemError,
1130                     "Cannot copy %s characters "
1131                     "into a string of %s characters",
1132                     unicode_kind_name(from),
1133                     unicode_kind_name(to));
1134        return -1;
1135    }
1136    return how_many;
1137}
1138
1139/* Find the maximum code point and count the number of surrogate pairs so a
1140   correct string length can be computed before converting a string to UCS4.
1141   This function counts single surrogates as a character and not as a pair.
1142
1143   Return 0 on success, or -1 on error. */
1144static int
1145find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1146                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1147{
1148    const wchar_t *iter;
1149
1150    assert(num_surrogates != NULL && maxchar != NULL);
1151    *num_surrogates = 0;
1152    *maxchar = 0;
1153
1154    for (iter = begin; iter < end; ) {
1155        if (*iter > *maxchar) {
1156            *maxchar = *iter;
1157#if SIZEOF_WCHAR_T != 2
1158            if (*maxchar >= 0x10000)
1159                return 0;
1160#endif
1161        }
1162#if SIZEOF_WCHAR_T == 2
1163        if (*iter >= 0xD800 && *iter <= 0xDBFF
1164            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1165        {
1166            Py_UCS4 surrogate_val;
1167            surrogate_val = (((iter[0] & 0x3FF)<<10)
1168                             | (iter[1] & 0x3FF)) + 0x10000;
1169            ++(*num_surrogates);
1170            if (surrogate_val > *maxchar)
1171                *maxchar = surrogate_val;
1172            iter += 2;
1173        }
1174        else
1175            iter++;
1176#else
1177        iter++;
1178#endif
1179    }
1180    return 0;
1181}
1182
1183#ifdef Py_DEBUG
1184int unicode_ready_calls = 0;
1185#endif
1186
1187static int
1188unicode_ready(PyObject **p_obj, int replace)
1189{
1190    PyUnicodeObject *unicode;
1191    wchar_t *end;
1192    Py_UCS4 maxchar = 0;
1193    Py_ssize_t num_surrogates;
1194#if SIZEOF_WCHAR_T == 2
1195    Py_ssize_t length_wo_surrogates;
1196#endif
1197
1198    assert(p_obj != NULL);
1199    unicode = (PyUnicodeObject *)*p_obj;
1200
1201    /* _PyUnicode_Ready() is only intended for old-style API usage where
1202       strings were created using _PyObject_New() and where no canonical
1203       representation (the str field) has been set yet aka strings
1204       which are not yet ready. */
1205    assert(_PyUnicode_CHECK(unicode));
1206    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1207    assert(_PyUnicode_WSTR(unicode) != NULL);
1208    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1209    assert(_PyUnicode_UTF8(unicode) == NULL);
1210    /* Actually, it should neither be interned nor be anything else: */
1211    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1212
1213#ifdef Py_DEBUG
1214    ++unicode_ready_calls;
1215#endif
1216
1217#ifdef Py_DEBUG
1218    assert(!replace || Py_REFCNT(unicode) == 1);
1219#else
1220    if (replace && Py_REFCNT(unicode) != 1)
1221        replace = 0;
1222#endif
1223    if (replace) {
1224        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1225        wchar_t *wstr = _PyUnicode_WSTR(unicode);
1226        /* Optimization for empty strings */
1227        if (len == 0) {
1228            Py_INCREF(unicode_empty);
1229            Py_DECREF(*p_obj);
1230            *p_obj = unicode_empty;
1231            return 0;
1232        }
1233        if (len == 1 && wstr[0] < 256) {
1234            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1235            if (latin1_char == NULL)
1236                return -1;
1237            Py_DECREF(*p_obj);
1238            *p_obj = latin1_char;
1239            return 0;
1240        }
1241    }
1242
1243    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1244    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1245                                &maxchar, &num_surrogates) == -1)
1246        return -1;
1247
1248    if (maxchar < 256) {
1249        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1250        if (!_PyUnicode_DATA_ANY(unicode)) {
1251            PyErr_NoMemory();
1252            return -1;
1253        }
1254        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1255                                _PyUnicode_WSTR(unicode), end,
1256                                PyUnicode_1BYTE_DATA(unicode));
1257        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1258        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1259        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1260        if (maxchar < 128) {
1261            _PyUnicode_STATE(unicode).ascii = 1;
1262            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1263            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1264        }
1265        else {
1266            _PyUnicode_STATE(unicode).ascii = 0;
1267            _PyUnicode_UTF8(unicode) = NULL;
1268            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1269        }
1270        PyObject_FREE(_PyUnicode_WSTR(unicode));
1271        _PyUnicode_WSTR(unicode) = NULL;
1272        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1273    }
1274    /* In this case we might have to convert down from 4-byte native
1275       wchar_t to 2-byte unicode. */
1276    else if (maxchar < 65536) {
1277        assert(num_surrogates == 0 &&
1278               "FindMaxCharAndNumSurrogatePairs() messed up");
1279
1280#if SIZEOF_WCHAR_T == 2
1281        /* We can share representations and are done. */
1282        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1283        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1284        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1285        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1286        _PyUnicode_UTF8(unicode) = NULL;
1287        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1288#else
1289        /* sizeof(wchar_t) == 4 */
1290        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1291            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1292        if (!_PyUnicode_DATA_ANY(unicode)) {
1293            PyErr_NoMemory();
1294            return -1;
1295        }
1296        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1297                                _PyUnicode_WSTR(unicode), end,
1298                                PyUnicode_2BYTE_DATA(unicode));
1299        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1300        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1301        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1302        _PyUnicode_UTF8(unicode) = NULL;
1303        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1304        PyObject_FREE(_PyUnicode_WSTR(unicode));
1305        _PyUnicode_WSTR(unicode) = NULL;
1306        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1307#endif
1308    }
1309    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1310    else {
1311#if SIZEOF_WCHAR_T == 2
1312        /* in case the native representation is 2-bytes, we need to allocate a
1313           new normalized 4-byte version. */
1314        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1315        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1316        if (!_PyUnicode_DATA_ANY(unicode)) {
1317            PyErr_NoMemory();
1318            return -1;
1319        }
1320        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1321        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1322        _PyUnicode_UTF8(unicode) = NULL;
1323        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1324        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1325        _PyUnicode_STATE(unicode).ready = 1;
1326        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1327        PyObject_FREE(_PyUnicode_WSTR(unicode));
1328        _PyUnicode_WSTR(unicode) = NULL;
1329        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1330#else
1331        assert(num_surrogates == 0);
1332
1333        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1334        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1335        _PyUnicode_UTF8(unicode) = NULL;
1336        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1337        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1338#endif
1339        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1340    }
1341    _PyUnicode_STATE(unicode).ready = 1;
1342    assert(_PyUnicode_CheckConsistency(unicode, 1));
1343    return 0;
1344}
1345
1346int
1347_PyUnicode_ReadyReplace(PyObject **op)
1348{
1349    return unicode_ready(op, 1);
1350}
1351
1352int
1353_PyUnicode_Ready(PyObject *op)
1354{
1355    return unicode_ready(&op, 0);
1356}
1357
1358static void
1359unicode_dealloc(register PyUnicodeObject *unicode)
1360{
1361    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1362    case SSTATE_NOT_INTERNED:
1363        break;
1364
1365    case SSTATE_INTERNED_MORTAL:
1366        /* revive dead object temporarily for DelItem */
1367        Py_REFCNT(unicode) = 3;
1368        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1369            Py_FatalError(
1370                "deletion of interned string failed");
1371        break;
1372
1373    case SSTATE_INTERNED_IMMORTAL:
1374        Py_FatalError("Immortal interned string died.");
1375
1376    default:
1377        Py_FatalError("Inconsistent interned string state.");
1378    }
1379
1380    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1381        PyObject_DEL(_PyUnicode_WSTR(unicode));
1382    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1383        PyObject_DEL(_PyUnicode_UTF8(unicode));
1384
1385    if (PyUnicode_IS_COMPACT(unicode)) {
1386        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1387    }
1388    else {
1389        if (_PyUnicode_DATA_ANY(unicode))
1390            PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1391        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1392    }
1393}
1394
1395#ifdef Py_DEBUG
1396static int
1397unicode_is_singleton(PyObject *unicode)
1398{
1399    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1400    if (unicode == unicode_empty)
1401        return 1;
1402    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1403    {
1404        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1405        if (ch < 256 && unicode_latin1[ch] == unicode)
1406            return 1;
1407    }
1408    return 0;
1409}
1410#endif
1411
1412static int
1413unicode_resizable(PyObject *unicode)
1414{
1415    if (Py_REFCNT(unicode) != 1)
1416        return 0;
1417    if (PyUnicode_CHECK_INTERNED(unicode))
1418        return 0;
1419#ifdef Py_DEBUG
1420    /* singleton refcount is greater than 1 */
1421    assert(!unicode_is_singleton(unicode));
1422#endif
1423    return 1;
1424}
1425
1426static int
1427unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1428{
1429    PyObject *unicode;
1430    Py_ssize_t old_length;
1431
1432    assert(p_unicode != NULL);
1433    unicode = *p_unicode;
1434
1435    assert(unicode != NULL);
1436    assert(PyUnicode_Check(unicode));
1437    assert(0 <= length);
1438
1439    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1440        old_length = PyUnicode_WSTR_LENGTH(unicode);
1441    else
1442        old_length = PyUnicode_GET_LENGTH(unicode);
1443    if (old_length == length)
1444        return 0;
1445
1446    if (!unicode_resizable(unicode)) {
1447        PyObject *copy = resize_copy(unicode, length);
1448        if (copy == NULL)
1449            return -1;
1450        Py_DECREF(*p_unicode);
1451        *p_unicode = copy;
1452        return 0;
1453    }
1454
1455    if (PyUnicode_IS_COMPACT(unicode)) {
1456        *p_unicode = resize_compact(unicode, length);
1457        if (*p_unicode == NULL)
1458            return -1;
1459        assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
1460        return 0;
1461    }
1462    return resize_inplace((PyUnicodeObject*)unicode, length);
1463}
1464
1465int
1466PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1467{
1468    PyObject *unicode;
1469    if (p_unicode == NULL) {
1470        PyErr_BadInternalCall();
1471        return -1;
1472    }
1473    unicode = *p_unicode;
1474    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1475        || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1476    {
1477        PyErr_BadInternalCall();
1478        return -1;
1479    }
1480    return unicode_resize(p_unicode, length);
1481}
1482
1483static PyObject*
1484get_latin1_char(unsigned char ch)
1485{
1486    PyObject *unicode = unicode_latin1[ch];
1487    if (!unicode) {
1488        unicode = PyUnicode_New(1, ch);
1489        if (!unicode)
1490            return NULL;
1491        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1492        assert(_PyUnicode_CheckConsistency(unicode, 1));
1493        unicode_latin1[ch] = unicode;
1494    }
1495    Py_INCREF(unicode);
1496    return unicode;
1497}
1498
1499PyObject *
1500PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1501{
1502    PyUnicodeObject *unicode;
1503    Py_UCS4 maxchar = 0;
1504    Py_ssize_t num_surrogates;
1505
1506    if (u == NULL)
1507        return (PyObject*)_PyUnicode_New(size);
1508
1509    /* If the Unicode data is known at construction time, we can apply
1510       some optimizations which share commonly used objects. */
1511
1512    /* Optimization for empty strings */
1513    if (size == 0 && unicode_empty != NULL) {
1514        Py_INCREF(unicode_empty);
1515        return unicode_empty;
1516    }
1517
1518    /* Single character Unicode objects in the Latin-1 range are
1519       shared when using this constructor */
1520    if (size == 1 && *u < 256)
1521        return get_latin1_char((unsigned char)*u);
1522
1523    /* If not empty and not single character, copy the Unicode data
1524       into the new object */
1525    if (find_maxchar_surrogates(u, u + size,
1526                                &maxchar, &num_surrogates) == -1)
1527        return NULL;
1528
1529    unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1530                                                maxchar);
1531    if (!unicode)
1532        return NULL;
1533
1534    switch (PyUnicode_KIND(unicode)) {
1535    case PyUnicode_1BYTE_KIND:
1536        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1537                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1538        break;
1539    case PyUnicode_2BYTE_KIND:
1540#if Py_UNICODE_SIZE == 2
1541        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1542#else
1543        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1544                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1545#endif
1546        break;
1547    case PyUnicode_4BYTE_KIND:
1548#if SIZEOF_WCHAR_T == 2
1549        /* This is the only case which has to process surrogates, thus
1550           a simple copy loop is not enough and we need a function. */
1551        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1552#else
1553        assert(num_surrogates == 0);
1554        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1555#endif
1556        break;
1557    default:
1558        assert(0 && "Impossible state");
1559    }
1560
1561    assert(_PyUnicode_CheckConsistency(unicode, 1));
1562    return (PyObject *)unicode;
1563}
1564
1565PyObject *
1566PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1567{
1568    PyUnicodeObject *unicode;
1569
1570    if (size < 0) {
1571        PyErr_SetString(PyExc_SystemError,
1572                        "Negative size passed to PyUnicode_FromStringAndSize");
1573        return NULL;
1574    }
1575
1576    /* If the Unicode data is known at construction time, we can apply
1577       some optimizations which share commonly used objects.
1578       Also, this means the input must be UTF-8, so fall back to the
1579       UTF-8 decoder at the end. */
1580    if (u != NULL) {
1581
1582        /* Optimization for empty strings */
1583        if (size == 0 && unicode_empty != NULL) {
1584            Py_INCREF(unicode_empty);
1585            return unicode_empty;
1586        }
1587
1588        /* Single characters are shared when using this constructor.
1589           Restrict to ASCII, since the input must be UTF-8. */
1590        if (size == 1 && Py_CHARMASK(*u) < 128)
1591            return get_latin1_char(Py_CHARMASK(*u));
1592
1593        return PyUnicode_DecodeUTF8(u, size, NULL);
1594    }
1595
1596    unicode = _PyUnicode_New(size);
1597    if (!unicode)
1598        return NULL;
1599
1600    return (PyObject *)unicode;
1601}
1602
1603PyObject *
1604PyUnicode_FromString(const char *u)
1605{
1606    size_t size = strlen(u);
1607    if (size > PY_SSIZE_T_MAX) {
1608        PyErr_SetString(PyExc_OverflowError, "input too long");
1609        return NULL;
1610    }
1611
1612    return PyUnicode_FromStringAndSize(u, size);
1613}
1614
1615PyObject *
1616_PyUnicode_FromId(_Py_Identifier *id)
1617{
1618    if (!id->object) {
1619        id->object = PyUnicode_FromString(id->string);
1620        if (!id->object)
1621            return NULL;
1622        PyUnicode_InternInPlace(&id->object);
1623        assert(!id->next);
1624        id->next = static_strings;
1625        static_strings = id;
1626    }
1627    Py_INCREF(id->object);
1628    return id->object;
1629}
1630
1631void
1632_PyUnicode_ClearStaticStrings()
1633{
1634    _Py_Identifier *i;
1635    for (i = static_strings; i; i = i->next) {
1636        Py_DECREF(i->object);
1637        i->object = NULL;
1638        i->next = NULL;
1639    }
1640}
1641
1642static PyObject*
1643unicode_fromascii(const unsigned char* s, Py_ssize_t size)
1644{
1645    PyObject *res;
1646#ifdef Py_DEBUG
1647    const unsigned char *p;
1648    const unsigned char *end = s + size;
1649    for (p=s; p < end; p++) {
1650        assert(*p < 128);
1651    }
1652#endif
1653    if (size == 1)
1654        return get_latin1_char(s[0]);
1655    res = PyUnicode_New(size, 127);
1656    if (!res)
1657        return NULL;
1658    memcpy(PyUnicode_1BYTE_DATA(res), s, size);
1659    return res;
1660}
1661
1662static Py_UCS4
1663kind_maxchar_limit(unsigned int kind)
1664{
1665    switch(kind) {
1666    case PyUnicode_1BYTE_KIND:
1667        return 0x80;
1668    case PyUnicode_2BYTE_KIND:
1669        return 0x100;
1670    case PyUnicode_4BYTE_KIND:
1671        return 0x10000;
1672    default:
1673        assert(0 && "invalid kind");
1674        return 0x10ffff;
1675    }
1676}
1677
1678static PyObject*
1679_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1680{
1681    PyObject *res;
1682    unsigned char max_char = 127;
1683    Py_ssize_t i;
1684
1685    assert(size >= 0);
1686    if (size == 1)
1687        return get_latin1_char(u[0]);
1688    for (i = 0; i < size; i++) {
1689        if (u[i] & 0x80) {
1690            max_char = 255;
1691            break;
1692        }
1693    }
1694    res = PyUnicode_New(size, max_char);
1695    if (!res)
1696        return NULL;
1697    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1698    assert(_PyUnicode_CheckConsistency(res, 1));
1699    return res;
1700}
1701
1702static PyObject*
1703_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1704{
1705    PyObject *res;
1706    Py_UCS2 max_char = 0;
1707    Py_ssize_t i;
1708
1709    assert(size >= 0);
1710    if (size == 1 && u[0] < 256)
1711        return get_latin1_char(u[0]);
1712    for (i = 0; i < size; i++) {
1713        if (u[i] > max_char) {
1714            max_char = u[i];
1715            if (max_char >= 256)
1716                break;
1717        }
1718    }
1719    res = PyUnicode_New(size, max_char);
1720    if (!res)
1721        return NULL;
1722    if (max_char >= 256)
1723        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1724    else
1725        for (i = 0; i < size; i++)
1726            PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1727    assert(_PyUnicode_CheckConsistency(res, 1));
1728    return res;
1729}
1730
1731static PyObject*
1732_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1733{
1734    PyObject *res;
1735    Py_UCS4 max_char = 0;
1736    Py_ssize_t i;
1737
1738    assert(size >= 0);
1739    if (size == 1 && u[0] < 256)
1740        return get_latin1_char(u[0]);
1741    for (i = 0; i < size; i++) {
1742        if (u[i] > max_char) {
1743            max_char = u[i];
1744            if (max_char >= 0x10000)
1745                break;
1746        }
1747    }
1748    res = PyUnicode_New(size, max_char);
1749    if (!res)
1750        return NULL;
1751    if (max_char >= 0x10000)
1752        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1753    else {
1754        int kind = PyUnicode_KIND(res);
1755        void *data = PyUnicode_DATA(res);
1756        for (i = 0; i < size; i++)
1757            PyUnicode_WRITE(kind, data, i, u[i]);
1758    }
1759    assert(_PyUnicode_CheckConsistency(res, 1));
1760    return res;
1761}
1762
1763PyObject*
1764PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1765{
1766    switch(kind) {
1767    case PyUnicode_1BYTE_KIND:
1768        return _PyUnicode_FromUCS1(buffer, size);
1769    case PyUnicode_2BYTE_KIND:
1770        return _PyUnicode_FromUCS2(buffer, size);
1771    case PyUnicode_4BYTE_KIND:
1772        return _PyUnicode_FromUCS4(buffer, size);
1773    default:
1774        assert(0 && "invalid kind");
1775        PyErr_SetString(PyExc_SystemError, "invalid kind");
1776        return NULL;
1777    }
1778}
1779
1780/* Ensure that a string uses the most efficient storage, if it is not the
1781   case: create a new string with of the right kind. Write NULL into *p_unicode
1782   on error. */
1783void
1784unicode_adjust_maxchar(PyObject **p_unicode)
1785{
1786    PyObject *unicode, *copy;
1787    Py_UCS4 max_char;
1788    Py_ssize_t i, len;
1789    unsigned int kind;
1790
1791    assert(p_unicode != NULL);
1792    unicode = *p_unicode;
1793    assert(PyUnicode_IS_READY(unicode));
1794    if (PyUnicode_IS_ASCII(unicode))
1795        return;
1796
1797    len = PyUnicode_GET_LENGTH(unicode);
1798    kind = PyUnicode_KIND(unicode);
1799    if (kind == PyUnicode_1BYTE_KIND) {
1800        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
1801        for (i = 0; i < len; i++) {
1802            if (u[i] & 0x80)
1803                return;
1804        }
1805        max_char = 127;
1806    }
1807    else if (kind == PyUnicode_2BYTE_KIND) {
1808        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
1809        max_char = 0;
1810        for (i = 0; i < len; i++) {
1811            if (u[i] > max_char) {
1812                max_char = u[i];
1813                if (max_char >= 256)
1814                    return;
1815            }
1816        }
1817    }
1818    else {
1819        const Py_UCS4 *u;
1820        assert(kind == PyUnicode_4BYTE_KIND);
1821        u = PyUnicode_4BYTE_DATA(unicode);
1822        max_char = 0;
1823        for (i = 0; i < len; i++) {
1824            if (u[i] > max_char) {
1825                max_char = u[i];
1826                if (max_char >= 0x10000)
1827                    return;
1828            }
1829        }
1830    }
1831    assert(max_char < PyUnicode_MAX_CHAR_VALUE(unicode));
1832    copy = PyUnicode_New(len, max_char);
1833    copy_characters(copy, 0, unicode, 0, len);
1834    Py_DECREF(unicode);
1835    *p_unicode = copy;
1836}
1837
1838PyObject*
1839PyUnicode_Copy(PyObject *unicode)
1840{
1841    Py_ssize_t size;
1842    PyObject *copy;
1843    void *data;
1844
1845    if (!PyUnicode_Check(unicode)) {
1846        PyErr_BadInternalCall();
1847        return NULL;
1848    }
1849    if (PyUnicode_READY(unicode))
1850        return NULL;
1851
1852    size = PyUnicode_GET_LENGTH(unicode);
1853    copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1854    if (!copy)
1855        return NULL;
1856    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1857
1858    data = PyUnicode_DATA(unicode);
1859    switch (PyUnicode_KIND(unicode))
1860    {
1861    case PyUnicode_1BYTE_KIND:
1862        memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1863        break;
1864    case PyUnicode_2BYTE_KIND:
1865        memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1866        break;
1867    case PyUnicode_4BYTE_KIND:
1868        memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1869        break;
1870    default:
1871        assert(0);
1872        break;
1873    }
1874    assert(_PyUnicode_CheckConsistency(copy, 1));
1875    return copy;
1876}
1877
1878
1879/* Widen Unicode objects to larger buffers. Don't write terminating null
1880   character. Return NULL on error. */
1881
1882void*
1883_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1884{
1885    Py_ssize_t len;
1886    void *result;
1887    unsigned int skind;
1888
1889    if (PyUnicode_READY(s))
1890        return NULL;
1891
1892    len = PyUnicode_GET_LENGTH(s);
1893    skind = PyUnicode_KIND(s);
1894    if (skind >= kind) {
1895        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
1896        return NULL;
1897    }
1898    switch(kind) {
1899    case PyUnicode_2BYTE_KIND:
1900        result = PyMem_Malloc(len * sizeof(Py_UCS2));
1901        if (!result)
1902            return PyErr_NoMemory();
1903        assert(skind == PyUnicode_1BYTE_KIND);
1904        _PyUnicode_CONVERT_BYTES(
1905            Py_UCS1, Py_UCS2,
1906            PyUnicode_1BYTE_DATA(s),
1907            PyUnicode_1BYTE_DATA(s) + len,
1908            result);
1909        return result;
1910    case PyUnicode_4BYTE_KIND:
1911        result = PyMem_Malloc(len * sizeof(Py_UCS4));
1912        if (!result)
1913            return PyErr_NoMemory();
1914        if (skind == PyUnicode_2BYTE_KIND) {
1915            _PyUnicode_CONVERT_BYTES(
1916                Py_UCS2, Py_UCS4,
1917                PyUnicode_2BYTE_DATA(s),
1918                PyUnicode_2BYTE_DATA(s) + len,
1919                result);
1920        }
1921        else {
1922            assert(skind == PyUnicode_1BYTE_KIND);
1923            _PyUnicode_CONVERT_BYTES(
1924                Py_UCS1, Py_UCS4,
1925                PyUnicode_1BYTE_DATA(s),
1926                PyUnicode_1BYTE_DATA(s) + len,
1927                result);
1928        }
1929        return result;
1930    default:
1931        break;
1932    }
1933    PyErr_SetString(PyExc_SystemError, "invalid kind");
1934    return NULL;
1935}
1936
1937static Py_UCS4*
1938as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1939        int copy_null)
1940{
1941    int kind;
1942    void *data;
1943    Py_ssize_t len, targetlen;
1944    if (PyUnicode_READY(string) == -1)
1945        return NULL;
1946    kind = PyUnicode_KIND(string);
1947    data = PyUnicode_DATA(string);
1948    len = PyUnicode_GET_LENGTH(string);
1949    targetlen = len;
1950    if (copy_null)
1951        targetlen++;
1952    if (!target) {
1953        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1954            PyErr_NoMemory();
1955            return NULL;
1956        }
1957        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1958        if (!target) {
1959            PyErr_NoMemory();
1960            return NULL;
1961        }
1962    }
1963    else {
1964        if (targetsize < targetlen) {
1965            PyErr_Format(PyExc_SystemError,
1966                         "string is longer than the buffer");
1967            if (copy_null && 0 < targetsize)
1968                target[0] = 0;
1969            return NULL;
1970        }
1971    }
1972    if (kind != PyUnicode_4BYTE_KIND) {
1973        Py_ssize_t i;
1974        for (i = 0; i < len; i++)
1975            target[i] = PyUnicode_READ(kind, data, i);
1976    }
1977    else
1978        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1979    if (copy_null)
1980        target[len] = 0;
1981    return target;
1982}
1983
1984Py_UCS4*
1985PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1986                 int copy_null)
1987{
1988    if (target == NULL || targetsize < 1) {
1989        PyErr_BadInternalCall();
1990        return NULL;
1991    }
1992    return as_ucs4(string, target, targetsize, copy_null);
1993}
1994
1995Py_UCS4*
1996PyUnicode_AsUCS4Copy(PyObject *string)
1997{
1998    return as_ucs4(string, NULL, 0, 1);
1999}
2000
2001#ifdef HAVE_WCHAR_H
2002
2003PyObject *
2004PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2005{
2006    if (w == NULL) {
2007        if (size == 0)
2008            return PyUnicode_New(0, 0);
2009        PyErr_BadInternalCall();
2010        return NULL;
2011    }
2012
2013    if (size == -1) {
2014        size = wcslen(w);
2015    }
2016
2017    return PyUnicode_FromUnicode(w, size);
2018}
2019
2020#endif /* HAVE_WCHAR_H */
2021
2022static void
2023makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2024        int zeropad, int width, int precision, char c)
2025{
2026    *fmt++ = '%';
2027    if (width) {
2028        if (zeropad)
2029            *fmt++ = '0';
2030        fmt += sprintf(fmt, "%d", width);
2031    }
2032    if (precision)
2033        fmt += sprintf(fmt, ".%d", precision);
2034    if (longflag)
2035        *fmt++ = 'l';
2036    else if (longlongflag) {
2037        /* longlongflag should only ever be nonzero on machines with
2038           HAVE_LONG_LONG defined */
2039#ifdef HAVE_LONG_LONG
2040        char *f = PY_FORMAT_LONG_LONG;
2041        while (*f)
2042            *fmt++ = *f++;
2043#else
2044        /* we shouldn't ever get here */
2045        assert(0);
2046        *fmt++ = 'l';
2047#endif
2048    }
2049    else if (size_tflag) {
2050        char *f = PY_FORMAT_SIZE_T;
2051        while (*f)
2052            *fmt++ = *f++;
2053    }
2054    *fmt++ = c;
2055    *fmt = '\0';
2056}
2057
/* Helper for PyUnicode_FromFormatV(): parse the width, precision and size
   modifier that follow a '%' in a format string.

   `f` must point to the '%'.  The parsed width and precision (0 when
   absent) and the l / ll / z modifier flags are stored through the
   corresponding output pointers; any of them may be NULL.  A modifier is
   only recognized when it is directly followed by 'd', 'u' or 'i'.

   Return a pointer to the character the caller should treat as the
   conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%', then parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu, and the size_t flag. */
    if (*f == 'l') {
        const char next = f[1];

        if (next == 'd' || next == 'u' || next == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (next == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    /* report the results the caller asked for */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2122
2123/* maximum number of characters required for output of %ld.  21 characters
2124   allows for 64-bit integers (in decimal) and an optional sign. */
2125#define MAX_LONG_CHARS 21
2126/* maximum number of characters required for output of %lld.
2127   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2128   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2129#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2130
2131PyObject *
2132PyUnicode_FromFormatV(const char *format, va_list vargs)
2133{
2134    va_list count;
2135    Py_ssize_t callcount = 0;
2136    PyObject **callresults = NULL;
2137    PyObject **callresult = NULL;
2138    Py_ssize_t n = 0;
2139    int width = 0;
2140    int precision = 0;
2141    int zeropad;
2142    const char* f;
2143    PyObject *string;
2144    /* used by sprintf */
2145    char fmt[61]; /* should be enough for %0width.precisionlld */
2146    Py_UCS4 maxchar = 127; /* result is ASCII by default */
2147    Py_UCS4 argmaxchar;
2148    Py_ssize_t numbersize = 0;
2149    char *numberresults = NULL;
2150    char *numberresult = NULL;
2151    Py_ssize_t i;
2152    int kind;
2153    void *data;
2154
2155    Py_VA_COPY(count, vargs);
2156    /* step 1: count the number of %S/%R/%A/%s format specifications
2157     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2158     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2159     * result in an array)
2160     * also estimate a upper bound for all the number formats in the string,
2161     * numbers will be formatted in step 3 and be kept in a '\0'-separated
2162     * buffer before putting everything together. */
2163    for (f = format; *f; f++) {
2164        if (*f == '%') {
2165            int longlongflag;
2166            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2167            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2168            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2169                ++callcount;
2170
2171            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
2172#ifdef HAVE_LONG_LONG
2173                if (longlongflag) {
2174                    if (width < MAX_LONG_LONG_CHARS)
2175                        width = MAX_LONG_LONG_CHARS;
2176                }
2177                else
2178#endif
2179                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2180                       including sign.  Decimal takes the most space.  This
2181                       isn't enough for octal.  If a width is specified we
2182                       need more (which we allocate later). */
2183                    if (width < MAX_LONG_CHARS)
2184                        width = MAX_LONG_CHARS;
2185
2186                /* account for the size + '\0' to separate numbers
2187                   inside of the numberresults buffer */
2188                numbersize += (width + 1);
2189            }
2190        }
2191        else if ((unsigned char)*f > 127) {
2192            PyErr_Format(PyExc_ValueError,
2193                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2194                "string, got a non-ASCII byte: 0x%02x",
2195                (unsigned char)*f);
2196            return NULL;
2197        }
2198    }
2199    /* step 2: allocate memory for the results of
2200     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2201    if (callcount) {
2202        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2203        if (!callresults) {
2204            PyErr_NoMemory();
2205            return NULL;
2206        }
2207        callresult = callresults;
2208    }
2209    /* step 2.5: allocate memory for the results of formating numbers */
2210    if (numbersize) {
2211        numberresults = PyObject_Malloc(numbersize);
2212        if (!numberresults) {
2213            PyErr_NoMemory();
2214            goto fail;
2215        }
2216        numberresult = numberresults;
2217    }
2218
2219    /* step 3: format numbers and figure out how large a buffer we need */
2220    for (f = format; *f; f++) {
2221        if (*f == '%') {
2222            const char* p;
2223            int longflag;
2224            int longlongflag;
2225            int size_tflag;
2226            int numprinted;
2227
2228            p = f;
2229            zeropad = (f[1] == '0');
2230            f = parse_format_flags(f, &width, &precision,
2231                                   &longflag, &longlongflag, &size_tflag);
2232            switch (*f) {
2233            case 'c':
2234            {
2235                Py_UCS4 ordinal = va_arg(count, int);
2236                maxchar = Py_MAX(maxchar, ordinal);
2237                n++;
2238                break;
2239            }
2240            case '%':
2241                n++;
2242                break;
2243            case 'i':
2244            case 'd':
2245                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2246                        width, precision, *f);
2247                if (longflag)
2248                    numprinted = sprintf(numberresult, fmt,
2249                                         va_arg(count, long));
2250#ifdef HAVE_LONG_LONG
2251                else if (longlongflag)
2252                    numprinted = sprintf(numberresult, fmt,
2253                                         va_arg(count, PY_LONG_LONG));
2254#endif
2255                else if (size_tflag)
2256                    numprinted = sprintf(numberresult, fmt,
2257                                         va_arg(count, Py_ssize_t));
2258                else
2259                    numprinted = sprintf(numberresult, fmt,
2260                                         va_arg(count, int));
2261                n += numprinted;
2262                /* advance by +1 to skip over the '\0' */
2263                numberresult += (numprinted + 1);
2264                assert(*(numberresult - 1) == '\0');
2265                assert(*(numberresult - 2) != '\0');
2266                assert(numprinted >= 0);
2267                assert(numberresult <= numberresults + numbersize);
2268                break;
2269            case 'u':
2270                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2271                        width, precision, 'u');
2272                if (longflag)
2273                    numprinted = sprintf(numberresult, fmt,
2274                                         va_arg(count, unsigned long));
2275#ifdef HAVE_LONG_LONG
2276                else if (longlongflag)
2277                    numprinted = sprintf(numberresult, fmt,
2278                                         va_arg(count, unsigned PY_LONG_LONG));
2279#endif
2280                else if (size_tflag)
2281                    numprinted = sprintf(numberresult, fmt,
2282                                         va_arg(count, size_t));
2283                else
2284                    numprinted = sprintf(numberresult, fmt,
2285                                         va_arg(count, unsigned int));
2286                n += numprinted;
2287                numberresult += (numprinted + 1);
2288                assert(*(numberresult - 1) == '\0');
2289                assert(*(numberresult - 2) != '\0');
2290                assert(numprinted >= 0);
2291                assert(numberresult <= numberresults + numbersize);
2292                break;
2293            case 'x':
2294                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2295                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2296                n += numprinted;
2297                numberresult += (numprinted + 1);
2298                assert(*(numberresult - 1) == '\0');
2299                assert(*(numberresult - 2) != '\0');
2300                assert(numprinted >= 0);
2301                assert(numberresult <= numberresults + numbersize);
2302                break;
2303            case 'p':
2304                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2305                /* %p is ill-defined:  ensure leading 0x. */
2306                if (numberresult[1] == 'X')
2307                    numberresult[1] = 'x';
2308                else if (numberresult[1] != 'x') {
2309                    memmove(numberresult + 2, numberresult,
2310                            strlen(numberresult) + 1);
2311                    numberresult[0] = '0';
2312                    numberresult[1] = 'x';
2313                    numprinted += 2;
2314                }
2315                n += numprinted;
2316                numberresult += (numprinted + 1);
2317                assert(*(numberresult - 1) == '\0');
2318                assert(*(numberresult - 2) != '\0');
2319                assert(numprinted >= 0);
2320                assert(numberresult <= numberresults + numbersize);
2321                break;
2322            case 's':
2323            {
2324                /* UTF-8 */
2325                const char *s = va_arg(count, const char*);
2326                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2327                if (!str)
2328                    goto fail;
2329                /* since PyUnicode_DecodeUTF8 returns already flexible
2330                   unicode objects, there is no need to call ready on them */
2331                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2332                maxchar = Py_MAX(maxchar, argmaxchar);
2333                n += PyUnicode_GET_LENGTH(str);
2334                /* Remember the str and switch to the next slot */
2335                *callresult++ = str;
2336                break;
2337            }
2338            case 'U':
2339            {
2340                PyObject *obj = va_arg(count, PyObject *);
2341                assert(obj && _PyUnicode_CHECK(obj));
2342                if (PyUnicode_READY(obj) == -1)
2343                    goto fail;
2344                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2345                maxchar = Py_MAX(maxchar, argmaxchar);
2346                n += PyUnicode_GET_LENGTH(obj);
2347                break;
2348            }
2349            case 'V':
2350            {
2351                PyObject *obj = va_arg(count, PyObject *);
2352                const char *str = va_arg(count, const char *);
2353                PyObject *str_obj;
2354                assert(obj || str);
2355                assert(!obj || _PyUnicode_CHECK(obj));
2356                if (obj) {
2357                    if (PyUnicode_READY(obj) == -1)
2358                        goto fail;
2359                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2360                    maxchar = Py_MAX(maxchar, argmaxchar);
2361                    n += PyUnicode_GET_LENGTH(obj);
2362                    *callresult++ = NULL;
2363                }
2364                else {
2365                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2366                    if (!str_obj)
2367                        goto fail;
2368                    if (PyUnicode_READY(str_obj)) {
2369                        Py_DECREF(str_obj);
2370                        goto fail;
2371                    }
2372                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2373                    maxchar = Py_MAX(maxchar, argmaxchar);
2374                    n += PyUnicode_GET_LENGTH(str_obj);
2375                    *callresult++ = str_obj;
2376                }
2377                break;
2378            }
2379            case 'S':
2380            {
2381                PyObject *obj = va_arg(count, PyObject *);
2382                PyObject *str;
2383                assert(obj);
2384                str = PyObject_Str(obj);
2385                if (!str || PyUnicode_READY(str) == -1)
2386                    goto fail;
2387                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2388                maxchar = Py_MAX(maxchar, argmaxchar);
2389                n += PyUnicode_GET_LENGTH(str);
2390                /* Remember the str and switch to the next slot */
2391                *callresult++ = str;
2392                break;
2393            }
2394            case 'R':
2395            {
2396                PyObject *obj = va_arg(count, PyObject *);
2397                PyObject *repr;
2398                assert(obj);
2399                repr = PyObject_Repr(obj);
2400                if (!repr || PyUnicode_READY(repr) == -1)
2401                    goto fail;
2402                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2403                maxchar = Py_MAX(maxchar, argmaxchar);
2404                n += PyUnicode_GET_LENGTH(repr);
2405                /* Remember the repr and switch to the next slot */
2406                *callresult++ = repr;
2407                break;
2408            }
2409            case 'A':
2410            {
2411                PyObject *obj = va_arg(count, PyObject *);
2412                PyObject *ascii;
2413                assert(obj);
2414                ascii = PyObject_ASCII(obj);
2415                if (!ascii || PyUnicode_READY(ascii) == -1)
2416                    goto fail;
2417                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2418                maxchar = Py_MAX(maxchar, argmaxchar);
2419                n += PyUnicode_GET_LENGTH(ascii);
2420                /* Remember the repr and switch to the next slot */
2421                *callresult++ = ascii;
2422                break;
2423            }
2424            default:
2425                /* if we stumble upon an unknown
2426                   formatting code, copy the rest of
2427                   the format string to the output
2428                   string. (we cannot just skip the
2429                   code, since there's no way to know
2430                   what's in the argument list) */
2431                n += strlen(p);
2432                goto expand;
2433            }
2434        } else
2435            n++;
2436    }
2437  expand:
2438    /* step 4: fill the buffer */
2439    /* Since we've analyzed how much space we need,
2440       we don't have to resize the string.
2441       There can be no errors beyond this point. */
2442    string = PyUnicode_New(n, maxchar);
2443    if (!string)
2444        goto fail;
2445    kind = PyUnicode_KIND(string);
2446    data = PyUnicode_DATA(string);
2447    callresult = callresults;
2448    numberresult = numberresults;
2449
2450    for (i = 0, f = format; *f; f++) {
2451        if (*f == '%') {
2452            const char* p;
2453
2454            p = f;
2455            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2456            /* checking for == because the last argument could be a empty
2457               string, which causes i to point to end, the assert at the end of
2458               the loop */
2459            assert(i <= PyUnicode_GET_LENGTH(string));
2460
2461            switch (*f) {
2462            case 'c':
2463            {
2464                const int ordinal = va_arg(vargs, int);
2465                PyUnicode_WRITE(kind, data, i++, ordinal);
2466                break;
2467            }
2468            case 'i':
2469            case 'd':
2470            case 'u':
2471            case 'x':
2472            case 'p':
2473                /* unused, since we already have the result */
2474                if (*f == 'p')
2475                    (void) va_arg(vargs, void *);
2476                else
2477                    (void) va_arg(vargs, int);
2478                /* extract the result from numberresults and append. */
2479                for (; *numberresult; ++i, ++numberresult)
2480                    PyUnicode_WRITE(kind, data, i, *numberresult);
2481                /* skip over the separating '\0' */
2482                assert(*numberresult == '\0');
2483                numberresult++;
2484                assert(numberresult <= numberresults + numbersize);
2485                break;
2486            case 's':
2487            {
2488                /* unused, since we already have the result */
2489                Py_ssize_t size;
2490                (void) va_arg(vargs, char *);
2491                size = PyUnicode_GET_LENGTH(*callresult);
2492                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2493                copy_characters(string, i, *callresult, 0, size);
2494                i += size;
2495                /* We're done with the unicode()/repr() => forget it */
2496                Py_DECREF(*callresult);
2497                /* switch to next unicode()/repr() result */
2498                ++callresult;
2499                break;
2500            }
2501            case 'U':
2502            {
2503                PyObject *obj = va_arg(vargs, PyObject *);
2504                Py_ssize_t size;
2505                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2506                size = PyUnicode_GET_LENGTH(obj);
2507                copy_characters(string, i, obj, 0, size);
2508                i += size;
2509                break;
2510            }
2511            case 'V':
2512            {
2513                Py_ssize_t size;
2514                PyObject *obj = va_arg(vargs, PyObject *);
2515                va_arg(vargs, const char *);
2516                if (obj) {
2517                    size = PyUnicode_GET_LENGTH(obj);
2518                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2519                    copy_characters(string, i, obj, 0, size);
2520                    i += size;
2521                } else {
2522                    size = PyUnicode_GET_LENGTH(*callresult);
2523                    assert(PyUnicode_KIND(*callresult) <=
2524                           PyUnicode_KIND(string));
2525                    copy_characters(string, i, *callresult, 0, size);
2526                    i += size;
2527                    Py_DECREF(*callresult);
2528                }
2529                ++callresult;
2530                break;
2531            }
2532            case 'S':
2533            case 'R':
2534            case 'A':
2535            {
2536                Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2537                /* unused, since we already have the result */
2538                (void) va_arg(vargs, PyObject *);
2539                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2540                copy_characters(string, i, *callresult, 0,  size);
2541                i += size;
2542                /* We're done with the unicode()/repr() => forget it */
2543                Py_DECREF(*callresult);
2544                /* switch to next unicode()/repr() result */
2545                ++callresult;
2546                break;
2547            }
2548            case '%':
2549                PyUnicode_WRITE(kind, data, i++, '%');
2550                break;
2551            default:
2552                for (; *p; ++p, ++i)
2553                    PyUnicode_WRITE(kind, data, i, *p);
2554                assert(i == PyUnicode_GET_LENGTH(string));
2555                goto end;
2556            }
2557        }
2558        else {
2559            assert(i < PyUnicode_GET_LENGTH(string));
2560            PyUnicode_WRITE(kind, data, i++, *f);
2561        }
2562    }
2563    assert(i == PyUnicode_GET_LENGTH(string));
2564
2565  end:
2566    if (callresults)
2567        PyObject_Free(callresults);
2568    if (numberresults)
2569        PyObject_Free(numberresults);
2570    assert(_PyUnicode_CheckConsistency(string, 1));
2571    return (PyObject *)string;
2572  fail:
2573    if (callresults) {
2574        PyObject **callresult2 = callresults;
2575        while (callresult2 < callresult) {
2576            Py_XDECREF(*callresult2);
2577            ++callresult2;
2578        }
2579        PyObject_Free(callresults);
2580    }
2581    if (numberresults)
2582        PyObject_Free(numberresults);
2583    return NULL;
2584}
2585
2586PyObject *
2587PyUnicode_FromFormat(const char *format, ...)
2588{
2589    PyObject* ret;
2590    va_list vargs;
2591
2592#ifdef HAVE_STDARG_PROTOTYPES
2593    va_start(vargs, format);
2594#else
2595    va_start(vargs);
2596#endif
2597    ret = PyUnicode_FromFormatV(format, vargs);
2598    va_end(vargs);
2599    return ret;
2600}
2601
2602#ifdef HAVE_WCHAR_H
2603
2604/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2605   convert a Unicode object to a wide character string.
2606
2607   - If w is NULL: return the number of wide characters (including the null
2608     character) required to convert the unicode object. Ignore size argument.
2609
2610   - Otherwise: return the number of wide characters (excluding the null
2611     character) written into w. Write at most size wide characters (including
2612     the null character). */
2613static Py_ssize_t
2614unicode_aswidechar(PyUnicodeObject *unicode,
2615                   wchar_t *w,
2616                   Py_ssize_t size)
2617{
2618    Py_ssize_t res;
2619    const wchar_t *wstr;
2620
2621    wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2622    if (wstr == NULL)
2623        return -1;
2624
2625    if (w != NULL) {
2626        if (size > res)
2627            size = res + 1;
2628        else
2629            res = size;
2630        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2631        return res;
2632    }
2633    else
2634        return res + 1;
2635}
2636
2637Py_ssize_t
2638PyUnicode_AsWideChar(PyObject *unicode,
2639                     wchar_t *w,
2640                     Py_ssize_t size)
2641{
2642    if (unicode == NULL) {
2643        PyErr_BadInternalCall();
2644        return -1;
2645    }
2646    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
2647}
2648
2649wchar_t*
2650PyUnicode_AsWideCharString(PyObject *unicode,
2651                           Py_ssize_t *size)
2652{
2653    wchar_t* buffer;
2654    Py_ssize_t buflen;
2655
2656    if (unicode == NULL) {
2657        PyErr_BadInternalCall();
2658        return NULL;
2659    }
2660
2661    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
2662    if (buflen == -1)
2663        return NULL;
2664    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2665        PyErr_NoMemory();
2666        return NULL;
2667    }
2668
2669    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2670    if (buffer == NULL) {
2671        PyErr_NoMemory();
2672        return NULL;
2673    }
2674    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
2675    if (buflen == -1)
2676        return NULL;
2677    if (size != NULL)
2678        *size = buflen;
2679    return buffer;
2680}
2681
2682#endif /* HAVE_WCHAR_H */
2683
2684PyObject *
2685PyUnicode_FromOrdinal(int ordinal)
2686{
2687    PyObject *v;
2688    if (ordinal < 0 || ordinal > 0x10ffff) {
2689        PyErr_SetString(PyExc_ValueError,
2690                        "chr() arg not in range(0x110000)");
2691        return NULL;
2692    }
2693
2694    if (ordinal < 256)
2695        return get_latin1_char(ordinal);
2696
2697    v = PyUnicode_New(1, ordinal);
2698    if (v == NULL)
2699        return NULL;
2700    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2701    assert(_PyUnicode_CheckConsistency(v, 1));
2702    return v;
2703}
2704
2705PyObject *
2706PyUnicode_FromObject(register PyObject *obj)
2707{
2708    /* XXX Perhaps we should make this API an alias of
2709       PyObject_Str() instead ?! */
2710    if (PyUnicode_CheckExact(obj)) {
2711        if (PyUnicode_READY(obj))
2712            return NULL;
2713        Py_INCREF(obj);
2714        return obj;
2715    }
2716    if (PyUnicode_Check(obj)) {
2717        /* For a Unicode subtype that's not a Unicode object,
2718           return a true Unicode object with the same data. */
2719        return PyUnicode_Copy(obj);
2720    }
2721    PyErr_Format(PyExc_TypeError,
2722                 "Can't convert '%.100s' object to str implicitly",
2723                 Py_TYPE(obj)->tp_name);
2724    return NULL;
2725}
2726
2727PyObject *
2728PyUnicode_FromEncodedObject(register PyObject *obj,
2729                            const char *encoding,
2730                            const char *errors)
2731{
2732    Py_buffer buffer;
2733    PyObject *v;
2734
2735    if (obj == NULL) {
2736        PyErr_BadInternalCall();
2737        return NULL;
2738    }
2739
2740    /* Decoding bytes objects is the most common case and should be fast */
2741    if (PyBytes_Check(obj)) {
2742        if (PyBytes_GET_SIZE(obj) == 0) {
2743            Py_INCREF(unicode_empty);
2744            v = unicode_empty;
2745        }
2746        else {
2747            v = PyUnicode_Decode(
2748                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2749                    encoding, errors);
2750        }
2751        return v;
2752    }
2753
2754    if (PyUnicode_Check(obj)) {
2755        PyErr_SetString(PyExc_TypeError,
2756                        "decoding str is not supported");
2757        return NULL;
2758    }
2759
2760    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2761    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2762        PyErr_Format(PyExc_TypeError,
2763                     "coercing to str: need bytes, bytearray "
2764                     "or buffer-like object, %.80s found",
2765                     Py_TYPE(obj)->tp_name);
2766        return NULL;
2767    }
2768
2769    if (buffer.len == 0) {
2770        Py_INCREF(unicode_empty);
2771        v = unicode_empty;
2772    }
2773    else
2774        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2775
2776    PyBuffer_Release(&buffer);
2777    return v;
2778}
2779
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   `lower` receives the NUL-terminated result and must provide room for
   `lower_len` bytes (terminator included).

   Return 1 on success.  Return 0 on error: the encoding is longer than
   lower_len-1, or lower_len is 0 so that not even the terminator fits.
   On error the contents of `lower` are unspecified (it may hold a
   partial, unterminated copy). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* lower_len is unsigned: without this guard "lower_len - 1" would wrap
       around and l_end would point far outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last writable slot; it is reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2812
2813PyObject *
2814PyUnicode_Decode(const char *s,
2815                 Py_ssize_t size,
2816                 const char *encoding,
2817                 const char *errors)
2818{
2819    PyObject *buffer = NULL, *unicode;
2820    Py_buffer info;
2821    char lower[11];  /* Enough for any encoding shortcut */
2822
2823    if (encoding == NULL)
2824        return PyUnicode_DecodeUTF8(s, size, errors);
2825
2826    /* Shortcuts for common default encodings */
2827    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2828        if ((strcmp(lower, "utf-8") == 0) ||
2829            (strcmp(lower, "utf8") == 0))
2830            return PyUnicode_DecodeUTF8(s, size, errors);
2831        else if ((strcmp(lower, "latin-1") == 0) ||
2832                 (strcmp(lower, "latin1") == 0) ||
2833                 (strcmp(lower, "iso-8859-1") == 0))
2834            return PyUnicode_DecodeLatin1(s, size, errors);
2835#ifdef HAVE_MBCS
2836        else if (strcmp(lower, "mbcs") == 0)
2837            return PyUnicode_DecodeMBCS(s, size, errors);
2838#endif
2839        else if (strcmp(lower, "ascii") == 0)
2840            return PyUnicode_DecodeASCII(s, size, errors);
2841        else if (strcmp(lower, "utf-16") == 0)
2842            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2843        else if (strcmp(lower, "utf-32") == 0)
2844            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2845    }
2846
2847    /* Decode via the codec registry */
2848    buffer = NULL;
2849    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2850        goto onError;
2851    buffer = PyMemoryView_FromBuffer(&info);
2852    if (buffer == NULL)
2853        goto onError;
2854    unicode = PyCodec_Decode(buffer, encoding, errors);
2855    if (unicode == NULL)
2856        goto onError;
2857    if (!PyUnicode_Check(unicode)) {
2858        PyErr_Format(PyExc_TypeError,
2859                     "decoder did not return a str object (type=%.400s)",
2860                     Py_TYPE(unicode)->tp_name);
2861        Py_DECREF(unicode);
2862        goto onError;
2863    }
2864    Py_DECREF(buffer);
2865#ifndef DONT_MAKE_RESULT_READY
2866    if (_PyUnicode_READY_REPLACE(&unicode)) {
2867        Py_DECREF(unicode);
2868        return NULL;
2869    }
2870#endif
2871    assert(_PyUnicode_CheckConsistency(unicode, 1));
2872    return unicode;
2873
2874  onError:
2875    Py_XDECREF(buffer);
2876    return NULL;
2877}
2878
2879PyObject *
2880PyUnicode_AsDecodedObject(PyObject *unicode,
2881                          const char *encoding,
2882                          const char *errors)
2883{
2884    PyObject *v;
2885
2886    if (!PyUnicode_Check(unicode)) {
2887        PyErr_BadArgument();
2888        goto onError;
2889    }
2890
2891    if (encoding == NULL)
2892        encoding = PyUnicode_GetDefaultEncoding();
2893
2894    /* Decode via the codec registry */
2895    v = PyCodec_Decode(unicode, encoding, errors);
2896    if (v == NULL)
2897        goto onError;
2898    assert(_PyUnicode_CheckConsistency(v, 1));
2899    return v;
2900
2901  onError:
2902    return NULL;
2903}
2904
2905PyObject *
2906PyUnicode_AsDecodedUnicode(PyObject *unicode,
2907                           const char *encoding,
2908                           const char *errors)
2909{
2910    PyObject *v;
2911
2912    if (!PyUnicode_Check(unicode)) {
2913        PyErr_BadArgument();
2914        goto onError;
2915    }
2916
2917    if (encoding == NULL)
2918        encoding = PyUnicode_GetDefaultEncoding();
2919
2920    /* Decode via the codec registry */
2921    v = PyCodec_Decode(unicode, encoding, errors);
2922    if (v == NULL)
2923        goto onError;
2924    if (!PyUnicode_Check(v)) {
2925        PyErr_Format(PyExc_TypeError,
2926                     "decoder did not return a str object (type=%.400s)",
2927                     Py_TYPE(v)->tp_name);
2928        Py_DECREF(v);
2929        goto onError;
2930    }
2931    assert(_PyUnicode_CheckConsistency(v, 1));
2932    return v;
2933
2934  onError:
2935    return NULL;
2936}
2937
2938PyObject *
2939PyUnicode_Encode(const Py_UNICODE *s,
2940                 Py_ssize_t size,
2941                 const char *encoding,
2942                 const char *errors)
2943{
2944    PyObject *v, *unicode;
2945
2946    unicode = PyUnicode_FromUnicode(s, size);
2947    if (unicode == NULL)
2948        return NULL;
2949    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2950    Py_DECREF(unicode);
2951    return v;
2952}
2953
2954PyObject *
2955PyUnicode_AsEncodedObject(PyObject *unicode,
2956                          const char *encoding,
2957                          const char *errors)
2958{
2959    PyObject *v;
2960
2961    if (!PyUnicode_Check(unicode)) {
2962        PyErr_BadArgument();
2963        goto onError;
2964    }
2965
2966    if (encoding == NULL)
2967        encoding = PyUnicode_GetDefaultEncoding();
2968
2969    /* Encode via the codec registry */
2970    v = PyCodec_Encode(unicode, encoding, errors);
2971    if (v == NULL)
2972        goto onError;
2973    return v;
2974
2975  onError:
2976    return NULL;
2977}
2978
2979PyObject *
2980PyUnicode_EncodeFSDefault(PyObject *unicode)
2981{
2982#ifdef HAVE_MBCS
2983    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2984                                PyUnicode_GET_SIZE(unicode),
2985                                NULL);
2986#elif defined(__APPLE__)
2987    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
2988#else
2989    PyInterpreterState *interp = PyThreadState_GET()->interp;
2990    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2991       cannot use it to encode and decode filenames before it is loaded. Load
2992       the Python codec requires to encode at least its own filename. Use the C
2993       version of the locale codec until the codec registry is initialized and
2994       the Python codec is loaded.
2995
2996       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2997       cannot only rely on it: check also interp->fscodec_initialized for
2998       subinterpreters. */
2999    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3000        return PyUnicode_AsEncodedString(unicode,
3001                                         Py_FileSystemDefaultEncoding,
3002                                         "surrogateescape");
3003    }
3004    else {
3005        /* locale encoding with surrogateescape */
3006        wchar_t *wchar;
3007        char *bytes;
3008        PyObject *bytes_obj;
3009        size_t error_pos;
3010
3011        wchar = PyUnicode_AsWideCharString(unicode, NULL);
3012        if (wchar == NULL)
3013            return NULL;
3014        bytes = _Py_wchar2char(wchar, &error_pos);
3015        if (bytes == NULL) {
3016            if (error_pos != (size_t)-1) {
3017                char *errmsg = strerror(errno);
3018                PyObject *exc = NULL;
3019                if (errmsg == NULL)
3020                    errmsg = "Py_wchar2char() failed";
3021                raise_encode_exception(&exc,
3022                    "filesystemencoding",
3023                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3024                    error_pos, error_pos+1,
3025                    errmsg);
3026                Py_XDECREF(exc);
3027            }
3028            else
3029                PyErr_NoMemory();
3030            PyMem_Free(wchar);
3031            return NULL;
3032        }
3033        PyMem_Free(wchar);
3034
3035        bytes_obj = PyBytes_FromString(bytes);
3036        PyMem_Free(bytes);
3037        return bytes_obj;
3038    }
3039#endif
3040}
3041
3042PyObject *
3043PyUnicode_AsEncodedString(PyObject *unicode,
3044                          const char *encoding,
3045                          const char *errors)
3046{
3047    PyObject *v;
3048    char lower[11];  /* Enough for any encoding shortcut */
3049
3050    if (!PyUnicode_Check(unicode)) {
3051        PyErr_BadArgument();
3052        return NULL;
3053    }
3054
3055    if (encoding == NULL) {
3056        if (errors == NULL || strcmp(errors, "strict") == 0)
3057            return _PyUnicode_AsUTF8String(unicode, NULL);
3058        else
3059            return _PyUnicode_AsUTF8String(unicode, errors);
3060    }
3061
3062    /* Shortcuts for common default encodings */
3063    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3064        if ((strcmp(lower, "utf-8") == 0) ||
3065            (strcmp(lower, "utf8") == 0))
3066        {
3067            if (errors == NULL || strcmp(errors, "strict") == 0)
3068                return _PyUnicode_AsUTF8String(unicode, NULL);
3069            else
3070                return _PyUnicode_AsUTF8String(unicode, errors);
3071        }
3072        else if ((strcmp(lower, "latin-1") == 0) ||
3073                 (strcmp(lower, "latin1") == 0) ||
3074                 (strcmp(lower, "iso-8859-1") == 0))
3075            return _PyUnicode_AsLatin1String(unicode, errors);
3076#ifdef HAVE_MBCS
3077        else if (strcmp(lower, "mbcs") == 0)
3078            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3079                                        PyUnicode_GET_SIZE(unicode),
3080                                        errors);
3081#endif
3082        else if (strcmp(lower, "ascii") == 0)
3083            return _PyUnicode_AsASCIIString(unicode, errors);
3084    }
3085
3086    /* Encode via the codec registry */
3087    v = PyCodec_Encode(unicode, encoding, errors);
3088    if (v == NULL)
3089        return NULL;
3090
3091    /* The normal path */
3092    if (PyBytes_Check(v))
3093        return v;
3094
3095    /* If the codec returns a buffer, raise a warning and convert to bytes */
3096    if (PyByteArray_Check(v)) {
3097        int error;
3098        PyObject *b;
3099
3100        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3101            "encoder %s returned bytearray instead of bytes",
3102            encoding);
3103        if (error) {
3104            Py_DECREF(v);
3105            return NULL;
3106        }
3107
3108        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3109        Py_DECREF(v);
3110        return b;
3111    }
3112
3113    PyErr_Format(PyExc_TypeError,
3114                 "encoder did not return a bytes object (type=%.400s)",
3115                 Py_TYPE(v)->tp_name);
3116    Py_DECREF(v);
3117    return NULL;
3118}
3119
3120PyObject *
3121PyUnicode_AsEncodedUnicode(PyObject *unicode,
3122                           const char *encoding,
3123                           const char *errors)
3124{
3125    PyObject *v;
3126
3127    if (!PyUnicode_Check(unicode)) {
3128        PyErr_BadArgument();
3129        goto onError;
3130    }
3131
3132    if (encoding == NULL)
3133        encoding = PyUnicode_GetDefaultEncoding();
3134
3135    /* Encode via the codec registry */
3136    v = PyCodec_Encode(unicode, encoding, errors);
3137    if (v == NULL)
3138        goto onError;
3139    if (!PyUnicode_Check(v)) {
3140        PyErr_Format(PyExc_TypeError,
3141                     "encoder did not return an str object (type=%.400s)",
3142                     Py_TYPE(v)->tp_name);
3143        Py_DECREF(v);
3144        goto onError;
3145    }
3146    return v;
3147
3148  onError:
3149    return NULL;
3150}
3151
3152PyObject*
3153PyUnicode_DecodeFSDefault(const char *s) {
3154    Py_ssize_t size = (Py_ssize_t)strlen(s);
3155    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3156}
3157
3158PyObject*
3159PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3160{
3161#ifdef HAVE_MBCS
3162    return PyUnicode_DecodeMBCS(s, size, NULL);
3163#elif defined(__APPLE__)
3164    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3165#else
3166    PyInterpreterState *interp = PyThreadState_GET()->interp;
3167    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3168       cannot use it to encode and decode filenames before it is loaded. Load
3169       the Python codec requires to encode at least its own filename. Use the C
3170       version of the locale codec until the codec registry is initialized and
3171       the Python codec is loaded.
3172
3173       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3174       cannot only rely on it: check also interp->fscodec_initialized for
3175       subinterpreters. */
3176    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3177        return PyUnicode_Decode(s, size,
3178                                Py_FileSystemDefaultEncoding,
3179                                "surrogateescape");
3180    }
3181    else {
3182        /* locale encoding with surrogateescape */
3183        wchar_t *wchar;
3184        PyObject *unicode;
3185        size_t len;
3186
3187        if (s[size] != '\0' || size != strlen(s)) {
3188            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3189            return NULL;
3190        }
3191
3192        wchar = _Py_char2wchar(s, &len);
3193        if (wchar == NULL)
3194            return PyErr_NoMemory();
3195
3196        unicode = PyUnicode_FromWideChar(wchar, len);
3197        PyMem_Free(wchar);
3198        return unicode;
3199    }
3200#endif
3201}
3202
3203
3204int
3205PyUnicode_FSConverter(PyObject* arg, void* addr)
3206{
3207    PyObject *output = NULL;
3208    Py_ssize_t size;
3209    void *data;
3210    if (arg == NULL) {
3211        Py_DECREF(*(PyObject**)addr);
3212        return 1;
3213    }
3214    if (PyBytes_Check(arg)) {
3215        output = arg;
3216        Py_INCREF(output);
3217    }
3218    else {
3219        arg = PyUnicode_FromObject(arg);
3220        if (!arg)
3221            return 0;
3222        output = PyUnicode_EncodeFSDefault(arg);
3223        Py_DECREF(arg);
3224        if (!output)
3225            return 0;
3226        if (!PyBytes_Check(output)) {
3227            Py_DECREF(output);
3228            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3229            return 0;
3230        }
3231    }
3232    size = PyBytes_GET_SIZE(output);
3233    data = PyBytes_AS_STRING(output);
3234    if (size != strlen(data)) {
3235        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3236        Py_DECREF(output);
3237        return 0;
3238    }
3239    *(PyObject**)addr = output;
3240    return Py_CLEANUP_SUPPORTED;
3241}
3242
3243
3244int
3245PyUnicode_FSDecoder(PyObject* arg, void* addr)
3246{
3247    PyObject *output = NULL;
3248    if (arg == NULL) {
3249        Py_DECREF(*(PyObject**)addr);
3250        return 1;
3251    }
3252    if (PyUnicode_Check(arg)) {
3253        if (PyUnicode_READY(arg))
3254            return 0;
3255        output = arg;
3256        Py_INCREF(output);
3257    }
3258    else {
3259        arg = PyBytes_FromObject(arg);
3260        if (!arg)
3261            return 0;
3262        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3263                                                  PyBytes_GET_SIZE(arg));
3264        Py_DECREF(arg);
3265        if (!output)
3266            return 0;
3267        if (!PyUnicode_Check(output)) {
3268            Py_DECREF(output);
3269            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3270            return 0;
3271        }
3272    }
3273    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3274                 PyUnicode_GET_LENGTH(output), 0, 1)) {
3275        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3276        Py_DECREF(output);
3277        return 0;
3278    }
3279    *(PyObject**)addr = output;
3280    return Py_CLEANUP_SUPPORTED;
3281}
3282
3283
3284char*
3285PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3286{
3287    PyObject *bytes;
3288    PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3289
3290    if (!PyUnicode_Check(unicode)) {
3291        PyErr_BadArgument();
3292        return NULL;
3293    }
3294    if (PyUnicode_READY(u) == -1)
3295        return NULL;
3296
3297    if (PyUnicode_UTF8(unicode) == NULL) {
3298        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3299        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3300        if (bytes == NULL)
3301            return NULL;
3302        _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3303        if (_PyUnicode_UTF8(u) == NULL) {
3304            Py_DECREF(bytes);
3305            return NULL;
3306        }
3307        _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3308        Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
3309        Py_DECREF(bytes);
3310    }
3311
3312    if (psize)
3313        *psize = PyUnicode_UTF8_LENGTH(unicode);
3314    return PyUnicode_UTF8(unicode);
3315}
3316
3317char*
3318PyUnicode_AsUTF8(PyObject *unicode)
3319{
3320    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3321}
3322
3323#ifdef Py_DEBUG
3324int unicode_as_unicode_calls = 0;
3325#endif
3326
3327
3328Py_UNICODE *
3329PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3330{
3331    PyUnicodeObject *u;
3332    const unsigned char *one_byte;
3333#if SIZEOF_WCHAR_T == 4
3334    const Py_UCS2 *two_bytes;
3335#else
3336    const Py_UCS4 *four_bytes;
3337    const Py_UCS4 *ucs4_end;
3338    Py_ssize_t num_surrogates;
3339#endif
3340    wchar_t *w;
3341    wchar_t *wchar_end;
3342
3343    if (!PyUnicode_Check(unicode)) {
3344        PyErr_BadArgument();
3345        return NULL;
3346    }
3347    u = (PyUnicodeObject*)unicode;
3348    if (_PyUnicode_WSTR(u) == NULL) {
3349        /* Non-ASCII compact unicode object */
3350        assert(_PyUnicode_KIND(u) != 0);
3351        assert(PyUnicode_IS_READY(u));
3352
3353#ifdef Py_DEBUG
3354        ++unicode_as_unicode_calls;
3355#endif
3356
3357        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3358#if SIZEOF_WCHAR_T == 2
3359            four_bytes = PyUnicode_4BYTE_DATA(u);
3360            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3361            num_surrogates = 0;
3362
3363            for (; four_bytes < ucs4_end; ++four_bytes) {
3364                if (*four_bytes > 0xFFFF)
3365                    ++num_surrogates;
3366            }
3367
3368            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3369                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3370            if (!_PyUnicode_WSTR(u)) {
3371                PyErr_NoMemory();
3372                return NULL;
3373            }
3374            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3375
3376            w = _PyUnicode_WSTR(u);
3377            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3378            four_bytes = PyUnicode_4BYTE_DATA(u);
3379            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3380                if (*four_bytes > 0xFFFF) {
3381                    /* encode surrogate pair in this case */
3382                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3383                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3384                }
3385                else
3386                    *w = *four_bytes;
3387
3388                if (w > wchar_end) {
3389                    assert(0 && "Miscalculated string end");
3390                }
3391            }
3392            *w = 0;
3393#else
3394            /* sizeof(wchar_t) == 4 */
3395            Py_FatalError("Impossible unicode object state, wstr and str "
3396                          "should share memory already.");
3397            return NULL;
3398#endif
3399        }
3400        else {
3401            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3402                                                  (_PyUnicode_LENGTH(u) + 1));
3403            if (!_PyUnicode_WSTR(u)) {
3404                PyErr_NoMemory();
3405                return NULL;
3406            }
3407            if (!PyUnicode_IS_COMPACT_ASCII(u))
3408                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3409            w = _PyUnicode_WSTR(u);
3410            wchar_end = w + _PyUnicode_LENGTH(u);
3411
3412            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3413                one_byte = PyUnicode_1BYTE_DATA(u);
3414                for (; w < wchar_end; ++one_byte, ++w)
3415                    *w = *one_byte;
3416                /* null-terminate the wstr */
3417                *w = 0;
3418            }
3419            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3420#if SIZEOF_WCHAR_T == 4
3421                two_bytes = PyUnicode_2BYTE_DATA(u);
3422                for (; w < wchar_end; ++two_bytes, ++w)
3423                    *w = *two_bytes;
3424                /* null-terminate the wstr */
3425                *w = 0;
3426#else
3427                /* sizeof(wchar_t) == 2 */
3428                PyObject_FREE(_PyUnicode_WSTR(u));
3429                _PyUnicode_WSTR(u) = NULL;
3430                Py_FatalError("Impossible unicode object state, wstr "
3431                              "and str should share memory already.");
3432                return NULL;
3433#endif
3434            }
3435            else {
3436                assert(0 && "This should never happen.");
3437            }
3438        }
3439    }
3440    if (size != NULL)
3441        *size = PyUnicode_WSTR_LENGTH(u);
3442    return _PyUnicode_WSTR(u);
3443}
3444
3445Py_UNICODE *
3446PyUnicode_AsUnicode(PyObject *unicode)
3447{
3448    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3449}
3450
3451
3452Py_ssize_t
3453PyUnicode_GetSize(PyObject *unicode)
3454{
3455    if (!PyUnicode_Check(unicode)) {
3456        PyErr_BadArgument();
3457        goto onError;
3458    }
3459    return PyUnicode_GET_SIZE(unicode);
3460
3461  onError:
3462    return -1;
3463}
3464
3465Py_ssize_t
3466PyUnicode_GetLength(PyObject *unicode)
3467{
3468    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3469        PyErr_BadArgument();
3470        return -1;
3471    }
3472
3473    return PyUnicode_GET_LENGTH(unicode);
3474}
3475
3476Py_UCS4
3477PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3478{
3479    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3480        PyErr_BadArgument();
3481        return (Py_UCS4)-1;
3482    }
3483    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3484        PyErr_SetString(PyExc_IndexError, "string index out of range");
3485        return (Py_UCS4)-1;
3486    }
3487    return PyUnicode_READ_CHAR(unicode, index);
3488}
3489
3490int
3491PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3492{
3493    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3494        PyErr_BadArgument();
3495        return -1;
3496    }
3497    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3498        PyErr_SetString(PyExc_IndexError, "string index out of range");
3499        return -1;
3500    }
3501    if (_PyUnicode_Dirty(unicode))
3502        return -1;
3503    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3504                    index, ch);
3505    return 0;
3506}
3507
/* Name of the default encoding; always "utf-8".  The returned string has
   static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3513
3514/* create or adjust a UnicodeDecodeError */
3515static void
3516make_decode_exception(PyObject **exceptionObject,
3517                      const char *encoding,
3518                      const char *input, Py_ssize_t length,
3519                      Py_ssize_t startpos, Py_ssize_t endpos,
3520                      const char *reason)
3521{
3522    if (*exceptionObject == NULL) {
3523        *exceptionObject = PyUnicodeDecodeError_Create(
3524            encoding, input, length, startpos, endpos, reason);
3525    }
3526    else {
3527        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3528            goto onError;
3529        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3530            goto onError;
3531        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3532            goto onError;
3533    }
3534    return;
3535
3536onError:
3537    Py_DECREF(*exceptionObject);
3538    *exceptionObject = NULL;
3539}
3540
3541/* error handling callback helper:
3542   build arguments, call the callback and check the arguments,
3543   if no exception occurred, copy the replacement to the output
3544   and adjust various state variables.
3545   return 0 on success, -1 on error
3546*/
3547
3548static int
3549unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3550                                 const char *encoding, const char *reason,
3551                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3552                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3553                                 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
3554{
3555    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3556
3557    PyObject *restuple = NULL;
3558    PyObject *repunicode = NULL;
3559    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
3560    Py_ssize_t insize;
3561    Py_ssize_t requiredsize;
3562    Py_ssize_t newpos;
3563    const Py_UNICODE *repptr;
3564    PyObject *inputobj = NULL;
3565    Py_ssize_t repsize;
3566    int res = -1;
3567
3568    if (*errorHandler == NULL) {
3569        *errorHandler = PyCodec_LookupError(errors);
3570        if (*errorHandler == NULL)
3571            goto onError;
3572    }
3573
3574    make_decode_exception(exceptionObject,
3575        encoding,
3576        *input, *inend - *input,
3577        *startinpos, *endinpos,
3578        reason);
3579    if (*exceptionObject == NULL)
3580        goto onError;
3581
3582    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3583    if (restuple == NULL)
3584        goto onError;
3585    if (!PyTuple_Check(restuple)) {
3586        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3587        goto onError;
3588    }
3589    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3590        goto onError;
3591
3592    /* Copy back the bytes variables, which might have been modified by the
3593       callback */
3594    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3595    if (!inputobj)
3596        goto onError;
3597    if (!PyBytes_Check(inputobj)) {
3598        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3599    }
3600    *input = PyBytes_AS_STRING(inputobj);
3601    insize = PyBytes_GET_SIZE(inputobj);
3602    *inend = *input + insize;
3603    /* we can DECREF safely, as the exception has another reference,
3604       so the object won't go away. */
3605    Py_DECREF(inputobj);
3606
3607    if (newpos<0)
3608        newpos = insize+newpos;
3609    if (newpos<0 || newpos>insize) {
3610        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3611        goto onError;
3612    }
3613
3614    /* need more space? (at least enough for what we
3615       have+the replacement+the rest of the string (starting
3616       at the new input position), so we won't have to check space
3617       when there are no errors in the rest of the string) */
3618    repptr = PyUnicode_AS_UNICODE(repunicode);
3619    repsize = PyUnicode_GET_SIZE(repunicode);
3620    requiredsize = *outpos + repsize + insize-newpos;
3621    if (requiredsize > outsize) {
3622        if (requiredsize<2*outsize)
3623            requiredsize = 2*outsize;
3624        if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
3625            goto onError;
3626        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
3627    }
3628    *endinpos = newpos;
3629    *inptr = *input + newpos;
3630    Py_UNICODE_COPY(*outptr, repptr, repsize);
3631    *outptr += repsize;
3632    *outpos += repsize;
3633
3634    /* we made it! */
3635    res = 0;
3636
3637  onError:
3638    Py_XDECREF(restuple);
3639    return res;
3640}
3641
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Base-64 helper macros.  Note that each of them may evaluate its
   argument more than once. */

/* IS_BASE64: true if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('A' <= (c) && (c) <= 'Z') ||              \
     ('a' <= (c) && (c) <= 'z'))

/* FROM_BASE64: base-64 value (0..63) of c.  Only meaningful when
   IS_BASE64(c) holds; any other character yields 63. */

#define FROM_BASE64(c)                                  \
    (('a' <= (c) && (c) <= 'z') ? 26 + ((c) - 'a') :    \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('0' <= (c) && (c) <= '9') ? 52 + ((c) - '0') :    \
     ((c) == '+') ? 62 : 63)

/* TO_BASE64: base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 63])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                        \
    ((c) != '+' && (c) <= 127)
3676
3677/* The UTF-7 encoder treats ASCII characters differently according to
3678 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3679 * the above).  See RFC2152.  This array identifies these different
3680 * sets:
3681 * 0 : "Set D"
3682 *     alphanumeric and '(),-./:?
3683 * 1 : "Set O"
3684 *     !"#$%&*;<=>@[]^_`{|}
3685 * 2 : "whitespace"
3686 *     ht nl cr sp
3687 * 3 : special (must be base64 encoded)
3688 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3689 */
3690
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
   7-bit character.  The table is only ever read, so it is const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3710
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - the character; evaluated several times, so it must not
 *            have side effects.  NUL and anything >= 128 is never direct.
 * directO  - non-zero if Set O characters (category 1) stand for themselves.
 * directWS - non-zero if whitespace (category 2) stands for itself.
 *
 * Both flags are parenthesized in the expansion so that compound
 * expressions such as `a || b` bind as intended. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
3722
3723PyObject *
3724PyUnicode_DecodeUTF7(const char *s,
3725                     Py_ssize_t size,
3726                     const char *errors)
3727{
3728    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3729}
3730
3731/* The decoder.  The only state we preserve is our read position,
3732 * i.e. how many characters we have consumed.  So if we end in the
3733 * middle of a shift sequence we have to back off the read position
3734 * and the output to the beginning of the sequence, otherwise we lose
3735 * all the shift state (seen bits, number of bits seen, high
3736 * surrogate). */
3737
3738PyObject *
3739PyUnicode_DecodeUTF7Stateful(const char *s,
3740                             Py_ssize_t size,
3741                             const char *errors,
3742                             Py_ssize_t *consumed)
3743{
3744    const char *starts = s;
3745    Py_ssize_t startinpos;
3746    Py_ssize_t endinpos;
3747    Py_ssize_t outpos;
3748    const char *e;
3749    PyUnicodeObject *unicode;
3750    Py_UNICODE *p;
3751    const char *errmsg = "";
3752    int inShift = 0;
3753    Py_UNICODE *shiftOutStart;
3754    unsigned int base64bits = 0;
3755    unsigned long base64buffer = 0;
3756    Py_UNICODE surrogate = 0;
3757    PyObject *errorHandler = NULL;
3758    PyObject *exc = NULL;
3759
3760    unicode = _PyUnicode_New(size);
3761    if (!unicode)
3762        return NULL;
3763    if (size == 0) {
3764        if (consumed)
3765            *consumed = 0;
3766        return (PyObject *)unicode;
3767    }
3768
3769    p = PyUnicode_AS_UNICODE(unicode);
3770    shiftOutStart = p;
3771    e = s + size;
3772
3773    while (s < e) {
3774        Py_UNICODE ch;
3775      restart:
3776        ch = (unsigned char) *s;
3777
3778        if (inShift) { /* in a base-64 section */
3779            if (IS_BASE64(ch)) { /* consume a base-64 character */
3780                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3781                base64bits += 6;
3782                s++;
3783                if (base64bits >= 16) {
3784                    /* we have enough bits for a UTF-16 value */
3785                    Py_UNICODE outCh = (Py_UNICODE)
3786                                       (base64buffer >> (base64bits-16));
3787                    base64bits -= 16;
3788                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3789                    if (surrogate) {
3790                        /* expecting a second surrogate */
3791                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3792#ifdef Py_UNICODE_WIDE
3793                            *p++ = (((surrogate & 0x3FF)<<10)
3794                                    | (outCh & 0x3FF)) + 0x10000;
3795#else
3796                            *p++ = surrogate;
3797                            *p++ = outCh;
3798#endif
3799                            surrogate = 0;
3800                        }
3801                        else {
3802                            surrogate = 0;
3803                            errmsg = "second surrogate missing";
3804                            goto utf7Error;
3805                        }
3806                    }
3807                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3808                        /* first surrogate */
3809                        surrogate = outCh;
3810                    }
3811                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3812                        errmsg = "unexpected second surrogate";
3813                        goto utf7Error;
3814                    }
3815                    else {
3816                        *p++ = outCh;
3817                    }
3818                }
3819            }
3820            else { /* now leaving a base-64 section */
3821                inShift = 0;
3822                s++;
3823                if (surrogate) {
3824                    errmsg = "second surrogate missing at end of shift sequence";
3825                    goto utf7Error;
3826                }
3827                if (base64bits > 0) { /* left-over bits */
3828                    if (base64bits >= 6) {
3829                        /* We've seen at least one base-64 character */
3830                        errmsg = "partial character in shift sequence";
3831                        goto utf7Error;
3832                    }
3833                    else {
3834                        /* Some bits remain; they should be zero */
3835                        if (base64buffer != 0) {
3836                            errmsg = "non-zero padding bits in shift sequence";
3837                            goto utf7Error;
3838                        }
3839                    }
3840                }
3841                if (ch != '-') {
3842                    /* '-' is absorbed; other terminating
3843                       characters are preserved */
3844                    *p++ = ch;
3845                }
3846            }
3847        }
3848        else if ( ch == '+' ) {
3849            startinpos = s-starts;
3850            s++; /* consume '+' */
3851            if (s < e && *s == '-') { /* '+-' encodes '+' */
3852                s++;
3853                *p++ = '+';
3854            }
3855            else { /* begin base64-encoded section */
3856                inShift = 1;
3857                shiftOutStart = p;
3858                base64bits = 0;
3859            }
3860        }
3861        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
3862            *p++ = ch;
3863            s++;
3864        }
3865        else {
3866            startinpos = s-starts;
3867            s++;
3868            errmsg = "unexpected special character";
3869            goto utf7Error;
3870        }
3871        continue;
3872utf7Error:
3873        outpos = p-PyUnicode_AS_UNICODE(unicode);
3874        endinpos = s-starts;
3875        if (unicode_decode_call_errorhandler(
3876                errors, &errorHandler,
3877                "utf7", errmsg,
3878                &starts, &e, &startinpos, &endinpos, &exc, &s,
3879                &unicode, &outpos, &p))
3880            goto onError;
3881    }
3882
3883    /* end of string */
3884
3885    if (inShift && !consumed) { /* in shift sequence, no more to follow */
3886        /* if we're in an inconsistent state, that's an error */
3887        if (surrogate ||
3888                (base64bits >= 6) ||
3889                (base64bits > 0 && base64buffer != 0)) {
3890            outpos = p-PyUnicode_AS_UNICODE(unicode);
3891            endinpos = size;
3892            if (unicode_decode_call_errorhandler(
3893                    errors, &errorHandler,
3894                    "utf7", "unterminated shift sequence",
3895                    &starts, &e, &startinpos, &endinpos, &exc, &s,
3896                    &unicode, &outpos, &p))
3897                goto onError;
3898            if (s < e)
3899                goto restart;
3900        }
3901    }
3902
3903    /* return state */
3904    if (consumed) {
3905        if (inShift) {
3906            p = shiftOutStart; /* back off output */
3907            *consumed = startinpos;
3908        }
3909        else {
3910            *consumed = s-starts;
3911        }
3912    }
3913
3914    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
3915        goto onError;
3916
3917    Py_XDECREF(errorHandler);
3918    Py_XDECREF(exc);
3919#ifndef DONT_MAKE_RESULT_READY
3920    if (_PyUnicode_READY_REPLACE(&unicode)) {
3921        Py_DECREF(unicode);
3922        return NULL;
3923    }
3924#endif
3925    assert(_PyUnicode_CheckConsistency(unicode, 1));
3926    return (PyObject *)unicode;
3927
3928  onError:
3929    Py_XDECREF(errorHandler);
3930    Py_XDECREF(exc);
3931    Py_DECREF(unicode);
3932    return NULL;
3933}
3934
3935
3936PyObject *
3937PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3938                     Py_ssize_t size,
3939                     int base64SetO,
3940                     int base64WhiteSpace,
3941                     const char *errors)
3942{
3943    PyObject *v;
3944    /* It might be possible to tighten this worst case */
3945    Py_ssize_t allocated = 8 * size;
3946    int inShift = 0;
3947    Py_ssize_t i = 0;
3948    unsigned int base64bits = 0;
3949    unsigned long base64buffer = 0;
3950    char * out;
3951    char * start;
3952
3953    if (size == 0)
3954        return PyBytes_FromStringAndSize(NULL, 0);
3955
3956    if (allocated / 8 != size)
3957        return PyErr_NoMemory();
3958
3959    v = PyBytes_FromStringAndSize(NULL, allocated);
3960    if (v == NULL)
3961        return NULL;
3962
3963    start = out = PyBytes_AS_STRING(v);
3964    for (;i < size; ++i) {
3965        Py_UNICODE ch = s[i];
3966
3967        if (inShift) {
3968            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3969                /* shifting out */
3970                if (base64bits) { /* output remaining bits */
3971                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
3972                    base64buffer = 0;
3973                    base64bits = 0;
3974                }
3975                inShift = 0;
3976                /* Characters not in the BASE64 set implicitly unshift the sequence
3977                   so no '-' is required, except if the character is itself a '-' */
3978                if (IS_BASE64(ch) || ch == '-') {
3979                    *out++ = '-';
3980                }
3981                *out++ = (char) ch;
3982            }
3983            else {
3984                goto encode_char;
3985            }
3986        }
3987        else { /* not in a shift sequence */
3988            if (ch == '+') {
3989                *out++ = '+';
3990                        *out++ = '-';
3991            }
3992            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3993                *out++ = (char) ch;
3994            }
3995            else {
3996                *out++ = '+';
3997                inShift = 1;
3998                goto encode_char;
3999            }
4000        }
4001        continue;
4002encode_char:
4003#ifdef Py_UNICODE_WIDE
4004        if (ch >= 0x10000) {
4005            /* code first surrogate */
4006            base64bits += 16;
4007            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4008            while (base64bits >= 6) {
4009                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4010                base64bits -= 6;
4011            }
4012            /* prepare second surrogate */
4013            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
4014        }
4015#endif
4016        base64bits += 16;
4017        base64buffer = (base64buffer << 16) | ch;
4018        while (base64bits >= 6) {
4019            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4020            base64bits -= 6;
4021        }
4022    }
4023    if (base64bits)
4024        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4025    if (inShift)
4026        *out++ = '-';
4027    if (_PyBytes_Resize(&v, out - start) < 0)
4028        return NULL;
4029    return v;
4030}
4031
4032#undef IS_BASE64
4033#undef FROM_BASE64
4034#undef TO_BASE64
4035#undef DECODE_DIRECT
4036#undef ENCODE_DIRECT
4037
4038/* --- UTF-8 Codec -------------------------------------------------------- */
4039
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.  The table is only
       ever read, so it is const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4061
4062PyObject *
4063PyUnicode_DecodeUTF8(const char *s,
4064                     Py_ssize_t size,
4065                     const char *errors)
4066{
4067    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4068}
4069
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so that it behaves as a single
   operand in any expression. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4072
4073/* Mask to quickly check whether a C 'long' contains a
4074   non-ASCII, UTF8-encoded char. */
4075#if (SIZEOF_LONG == 8)
4076# define ASCII_CHAR_MASK 0x8080808080808080L
4077#elif (SIZEOF_LONG == 4)
4078# define ASCII_CHAR_MASK 0x80808080L
4079#else
4080# error C 'long' size should be either 4 or 8!
4081#endif
4082
4083/* Scans a UTF-8 string and returns the maximum character to be expected,
4084   the size of the decoded unicode string and if any major errors were
4085   encountered.
4086
4087   This function does check basic UTF-8 sanity, it does however NOT CHECK
4088   if the string contains surrogates, and if all continuation bytes are
4089   within the correct ranges, these checks are performed in
4090   PyUnicode_DecodeUTF8Stateful.
4091
4092   If it sets has_errors to 1, it means the value of unicode_size and max_char
4093   will be bogus and you should not rely on useful information in them.
4094   */
4095static Py_UCS4
4096utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4097                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4098                                  int *has_errors)
4099{
4100    Py_ssize_t n;
4101    Py_ssize_t char_count = 0;
4102    Py_UCS4 max_char = 127, new_max;
4103    Py_UCS4 upper_bound;
4104    const unsigned char *p = (const unsigned char *)s;
4105    const unsigned char *end = p + string_size;
4106    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4107    int err = 0;
4108
4109    for (; p < end && !err; ++p, ++char_count) {
4110        /* Only check value if it's not a ASCII char... */
4111        if (*p < 0x80) {
4112            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4113               an explanation. */
4114            if (!((size_t) p & LONG_PTR_MASK)) {
4115                /* Help register allocation */
4116                register const unsigned char *_p = p;
4117                while (_p < aligned_end) {
4118                    unsigned long value = *(unsigned long *) _p;
4119                    if (value & ASCII_CHAR_MASK)
4120                        break;
4121                    _p += SIZEOF_LONG;
4122                    char_count += SIZEOF_LONG;
4123                }
4124                p = _p;
4125                if (p == end)
4126                    break;
4127            }
4128        }
4129        if (*p >= 0x80) {
4130            n = utf8_code_length[*p];
4131            new_max = max_char;
4132            switch (n) {
4133            /* invalid start byte */
4134            case 0:
4135                err = 1;
4136                break;
4137            case 2:
4138                /* Code points between 0x00FF and 0x07FF inclusive.
4139                   Approximate the upper bound of the code point,
4140                   if this flips over 255 we can be sure it will be more
4141                   than 255 and the string will need 2 bytes per code coint,
4142                   if it stays under or equal to 255, we can be sure 1 byte
4143                   is enough.
4144                   ((*p & 0b00011111) << 6) | 0b00111111 */
4145                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4146                if (max_char < upper_bound)
4147                    new_max = upper_bound;
4148                /* Ensure we track at least that we left ASCII space. */
4149                if (new_max < 128)
4150                    new_max = 128;
4151                break;
4152            case 3:
4153                /* Between 0x0FFF and 0xFFFF inclusive, so values are
4154                   always > 255 and <= 65535 and will always need 2 bytes. */
4155                if (max_char < 65535)
4156                    new_max = 65535;
4157                break;
4158            case 4:
4159                /* Code point will be above 0xFFFF for sure in this case. */
4160                new_max = 65537;
4161                break;
4162            /* Internal error, this should be caught by the first if */
4163            case 1:
4164            default:
4165                assert(0 && "Impossible case in utf8_max_char_and_size");
4166                err = 1;
4167            }
4168            /* Instead of number of overall bytes for this code point,
4169               n contains the number of following bytes: */
4170            --n;
4171            /* Check if the follow up chars are all valid continuation bytes */
4172            if (n >= 1) {
4173                const unsigned char *cont;
4174                if ((p + n) >= end) {
4175                    if (consumed == 0)
4176                        /* incomplete data, non-incremental decoding */
4177                        err = 1;
4178                    break;
4179                }
4180                for (cont = p + 1; cont < (p + n); ++cont) {
4181                    if ((*cont & 0xc0) != 0x80) {
4182                        err = 1;
4183                        break;
4184                    }
4185                }
4186                p += n;
4187            }
4188            else
4189                err = 1;
4190            max_char = new_max;
4191        }
4192    }
4193
4194    if (unicode_size)
4195        *unicode_size = char_count;
4196    if (has_errors)
4197        *has_errors = err;
4198    return max_char;
4199}
4200
/* Store `value` at position `index` of the character buffer `buf`,
   converting it to the element type selected by `kind`:
   Py_UNICODE for PyUnicode_WCHAR_KIND (the wstr field of the legacy
   representation), unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2 for
   PyUnicode_2BYTE_KIND and Py_UCS4 for any other kind.  Exactly one
   store is executed, so `index` and `value` are evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        switch ((int)(kind)) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
4215
4216PyObject *
4217PyUnicode_DecodeUTF8Stateful(const char *s,
4218                             Py_ssize_t size,
4219                             const char *errors,
4220                             Py_ssize_t *consumed)
4221{
4222    const char *starts = s;
4223    int n;
4224    int k;
4225    Py_ssize_t startinpos;
4226    Py_ssize_t endinpos;
4227    const char *e, *aligned_end;
4228    PyUnicodeObject *unicode;
4229    const char *errmsg = "";
4230    PyObject *errorHandler = NULL;
4231    PyObject *exc = NULL;
4232    Py_UCS4 maxchar = 0;
4233    Py_ssize_t unicode_size;
4234    Py_ssize_t i;
4235    int kind;
4236    void *data;
4237    int has_errors;
4238    Py_UNICODE *error_outptr;
4239#if SIZEOF_WCHAR_T == 2
4240    Py_ssize_t wchar_offset = 0;
4241#endif
4242
4243    if (size == 0) {
4244        if (consumed)
4245            *consumed = 0;
4246        return (PyObject *)PyUnicode_New(0, 0);
4247    }
4248    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4249                                                consumed, &has_errors);
4250    if (has_errors) {
4251        unicode = _PyUnicode_New(size);
4252        if (!unicode)
4253            return NULL;
4254        kind = PyUnicode_WCHAR_KIND;
4255        data = PyUnicode_AS_UNICODE(unicode);
4256        assert(data != NULL);
4257    }
4258    else {
4259        unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4260        if (!unicode)
4261            return NULL;
4262        /* When the string is ASCII only, just use memcpy and return.
4263           unicode_size may be != size if there is an incomplete UTF-8
4264           sequence at the end of the ASCII block.  */
4265        if (maxchar < 128 && size == unicode_size) {
4266            Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4267            return (PyObject *)unicode;
4268        }
4269        kind = PyUnicode_KIND(unicode);
4270        data = PyUnicode_DATA(unicode);
4271    }
4272    /* Unpack UTF-8 encoded data */
4273    i = 0;
4274    e = s + size;
4275    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4276
4277    while (s < e) {
4278        Py_UCS4 ch = (unsigned char)*s;
4279
4280        if (ch < 0x80) {
4281            /* Fast path for runs of ASCII characters. Given that common UTF-8
4282               input will consist of an overwhelming majority of ASCII
4283               characters, we try to optimize for this case by checking
4284               as many characters as a C 'long' can contain.
4285               First, check if we can do an aligned read, as most CPUs have
4286               a penalty for unaligned reads.
4287            */
4288            if (!((size_t) s & LONG_PTR_MASK)) {
4289                /* Help register allocation */
4290                register const char *_s = s;
4291                register Py_ssize_t _i = i;
4292                while (_s < aligned_end) {
4293                    /* Read a whole long at a time (either 4 or 8 bytes),
4294                       and do a fast unrolled copy if it only contains ASCII
4295                       characters. */
4296                    unsigned long value = *(unsigned long *) _s;
4297                    if (value & ASCII_CHAR_MASK)
4298                        break;
4299                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4300                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4301                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4302                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
4303#if (SIZEOF_LONG == 8)
4304                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4305                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4306                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4307                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
4308#endif
4309                    _s += SIZEOF_LONG;
4310                    _i += SIZEOF_LONG;
4311                }
4312                s = _s;
4313                i = _i;
4314                if (s == e)
4315                    break;
4316                ch = (unsigned char)*s;
4317            }
4318        }
4319
4320        if (ch < 0x80) {
4321            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4322            s++;
4323            continue;
4324        }
4325
4326        n = utf8_code_length[ch];
4327
4328        if (s + n > e) {
4329            if (consumed)
4330                break;
4331            else {
4332                errmsg = "unexpected end of data";
4333                startinpos = s-starts;
4334                endinpos = startinpos+1;
4335                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4336                    endinpos++;
4337                goto utf8Error;
4338            }
4339        }
4340
4341        switch (n) {
4342
4343        case 0:
4344            errmsg = "invalid start byte";
4345            startinpos = s-starts;
4346            endinpos = startinpos+1;
4347            goto utf8Error;
4348
4349        case 1:
4350            errmsg = "internal error";
4351            startinpos = s-starts;
4352            endinpos = startinpos+1;
4353            goto utf8Error;
4354
4355        case 2:
4356            if ((s[1] & 0xc0) != 0x80) {
4357                errmsg = "invalid continuation byte";
4358                startinpos = s-starts;
4359                endinpos = startinpos + 1;
4360                goto utf8Error;
4361            }
4362            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4363            assert ((ch > 0x007F) && (ch <= 0x07FF));
4364            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4365            break;
4366
4367        case 3:
4368            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4369               will result in surrogates in range d800-dfff. Surrogates are
4370               not valid UTF-8 so they are rejected.
4371               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4372               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4373            if ((s[1] & 0xc0) != 0x80 ||
4374                (s[2] & 0xc0) != 0x80 ||
4375                ((unsigned char)s[0] == 0xE0 &&
4376                 (unsigned char)s[1] < 0xA0) ||
4377                ((unsigned char)s[0] == 0xED &&
4378                 (unsigned char)s[1] > 0x9F)) {
4379                errmsg = "invalid continuation byte";
4380                startinpos = s-starts;
4381                endinpos = startinpos + 1;
4382
4383                /* if s[1] first two bits are 1 and 0, then the invalid
4384                   continuation byte is s[2], so increment endinpos by 1,
4385                   if not, s[1] is invalid and endinpos doesn't need to
4386                   be incremented. */
4387                if ((s[1] & 0xC0) == 0x80)
4388                    endinpos++;
4389                goto utf8Error;
4390            }
4391            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4392            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4393            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4394            break;
4395
4396        case 4:
4397            if ((s[1] & 0xc0) != 0x80 ||
4398                (s[2] & 0xc0) != 0x80 ||
4399                (s[3] & 0xc0) != 0x80 ||
4400                ((unsigned char)s[0] == 0xF0 &&
4401                 (unsigned char)s[1] < 0x90) ||
4402                ((unsigned char)s[0] == 0xF4 &&
4403                 (unsigned char)s[1] > 0x8F)) {
4404                errmsg = "invalid continuation byte";
4405                startinpos = s-starts;
4406                endinpos = startinpos + 1;
4407                if ((s[1] & 0xC0) == 0x80) {
4408                    endinpos++;
4409                    if ((s[2] & 0xC0) == 0x80)
4410                        endinpos++;
4411                }
4412                goto utf8Error;
4413            }
4414            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4415                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4416            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4417
4418            /* If the string is flexible or we have native UCS-4, write
4419               directly.. */
4420            if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4421                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4422
4423            else {
4424                /* compute and append the two surrogates: */
4425
4426                /* translate from 10000..10FFFF to 0..FFFF */
4427                ch -= 0x10000;
4428
4429                /* high surrogate = top 10 bits added to D800 */
4430                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4431                                       (Py_UNICODE)(0xD800 + (ch >> 10)));
4432
4433                /* low surrogate = bottom 10 bits added to DC00 */
4434                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4435                                       (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4436            }
4437#if SIZEOF_WCHAR_T == 2
4438            wchar_offset++;
4439#endif
4440            break;
4441        }
4442        s += n;
4443        continue;
4444
4445      utf8Error:
4446        /* If this is not yet a resizable string, make it one.. */
4447        if (kind != PyUnicode_WCHAR_KIND) {
4448            const Py_UNICODE *u;
4449            PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4450            if (!new_unicode)
4451                goto onError;
4452            u = PyUnicode_AsUnicode((PyObject *)unicode);
4453            if (!u)
4454                goto onError;
4455#if SIZEOF_WCHAR_T == 2
4456            i += wchar_offset;
4457#endif
4458            Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4459            Py_DECREF(unicode);
4460            unicode = new_unicode;
4461            kind = 0;
4462            data = PyUnicode_AS_UNICODE(new_unicode);
4463            assert(data != NULL);
4464        }
4465        error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
4466        if (unicode_decode_call_errorhandler(
4467                errors, &errorHandler,
4468                "utf8", errmsg,
4469                &starts, &e, &startinpos, &endinpos, &exc, &s,
4470                &unicode, &i, &error_outptr))
4471            goto onError;
4472        /* Update data because unicode_decode_call_errorhandler might have
4473           re-created or resized the unicode object. */
4474        data = PyUnicode_AS_UNICODE(unicode);
4475        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4476    }
4477    /* Ensure the unicode_size calculation above was correct: */
4478    assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4479
4480    if (consumed)
4481        *consumed = s-starts;
4482
4483    /* Adjust length and ready string when it contained errors and
4484       is of the old resizable kind. */
4485    if (kind == PyUnicode_WCHAR_KIND) {
4486        if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
4487            goto onError;
4488    }
4489
4490    Py_XDECREF(errorHandler);
4491    Py_XDECREF(exc);
4492#ifndef DONT_MAKE_RESULT_READY
4493    if (_PyUnicode_READY_REPLACE(&unicode)) {
4494        Py_DECREF(unicode);
4495        return NULL;
4496    }
4497#endif
4498    assert(_PyUnicode_CheckConsistency(unicode, 1));
4499    return (PyObject *)unicode;
4500
4501  onError:
4502    Py_XDECREF(errorHandler);
4503    Py_XDECREF(exc);
4504    Py_DECREF(unicode);
4505    return NULL;
4506}
4507
4508#undef WRITE_FLEXIBLE_OR_WSTR
4509
4510#ifdef __APPLE__
4511
4512/* Simplified UTF-8 decoder using surrogateescape error handler,
4513   used to decode the command line arguments on Mac OS X. */
4514
4515wchar_t*
4516_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4517{
4518    int n;
4519    const char *e;
4520    wchar_t *unicode, *p;
4521
4522    /* Note: size will always be longer than the resulting Unicode
4523       character count */
4524    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4525        PyErr_NoMemory();
4526        return NULL;
4527    }
4528    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4529    if (!unicode)
4530        return NULL;
4531
4532    /* Unpack UTF-8 encoded data */
4533    p = unicode;
4534    e = s + size;
4535    while (s < e) {
4536        Py_UCS4 ch = (unsigned char)*s;
4537
4538        if (ch < 0x80) {
4539            *p++ = (wchar_t)ch;
4540            s++;
4541            continue;
4542        }
4543
4544        n = utf8_code_length[ch];
4545        if (s + n > e) {
4546            goto surrogateescape;
4547        }
4548
4549        switch (n) {
4550        case 0:
4551        case 1:
4552            goto surrogateescape;
4553
4554        case 2:
4555            if ((s[1] & 0xc0) != 0x80)
4556                goto surrogateescape;
4557            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4558            assert ((ch > 0x007F) && (ch <= 0x07FF));
4559            *p++ = (wchar_t)ch;
4560            break;
4561
4562        case 3:
4563            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4564               will result in surrogates in range d800-dfff. Surrogates are
4565               not valid UTF-8 so they are rejected.
4566               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4567               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4568            if ((s[1] & 0xc0) != 0x80 ||
4569                (s[2] & 0xc0) != 0x80 ||
4570                ((unsigned char)s[0] == 0xE0 &&
4571                 (unsigned char)s[1] < 0xA0) ||
4572                ((unsigned char)s[0] == 0xED &&
4573                 (unsigned char)s[1] > 0x9F)) {
4574
4575                goto surrogateescape;
4576            }
4577            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4578            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4579            *p++ = (wchar_t)ch;
4580            break;
4581
4582        case 4:
4583            if ((s[1] & 0xc0) != 0x80 ||
4584                (s[2] & 0xc0) != 0x80 ||
4585                (s[3] & 0xc0) != 0x80 ||
4586                ((unsigned char)s[0] == 0xF0 &&
4587                 (unsigned char)s[1] < 0x90) ||
4588                ((unsigned char)s[0] == 0xF4 &&
4589                 (unsigned char)s[1] > 0x8F)) {
4590                goto surrogateescape;
4591            }
4592            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4593                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4594            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4595
4596#if SIZEOF_WCHAR_T == 4
4597            *p++ = (wchar_t)ch;
4598#else
4599            /*  compute and append the two surrogates: */
4600
4601            /*  translate from 10000..10FFFF to 0..FFFF */
4602            ch -= 0x10000;
4603
4604            /*  high surrogate = top 10 bits added to D800 */
4605            *p++ = (wchar_t)(0xD800 + (ch >> 10));
4606
4607            /*  low surrogate = bottom 10 bits added to DC00 */
4608            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4609#endif
4610            break;
4611        }
4612        s += n;
4613        continue;
4614
4615      surrogateescape:
4616        *p++ = 0xDC00 + ch;
4617        s++;
4618    }
4619    *p = L'\0';
4620    return unicode;
4621}
4622
4623#endif /* __APPLE__ */
4624
4625/* Primary internal function which creates utf8 encoded bytes objects.
4626
4627   Allocation strategy:  if the string is short, convert into a stack buffer
4628   and allocate exactly as much space needed at the end.  Else allocate the
4629   maximum possible needed (4 result bytes per Unicode character), and return
4630   the excess memory at the end.
4631*/
4632PyObject *
4633_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
4634{
4635#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
4636
4637    Py_ssize_t i;                /* index into s of next input byte */
4638    PyObject *result;            /* result string object */
4639    char *p;                     /* next free byte in output buffer */
4640    Py_ssize_t nallocated;      /* number of result bytes allocated */
4641    Py_ssize_t nneeded;            /* number of result bytes needed */
4642    char stackbuf[MAX_SHORT_UNICHARS * 4];
4643    PyObject *errorHandler = NULL;
4644    PyObject *exc = NULL;
4645    int kind;
4646    void *data;
4647    Py_ssize_t size;
4648    PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4649#if SIZEOF_WCHAR_T == 2
4650    Py_ssize_t wchar_offset = 0;
4651#endif
4652
4653    if (!PyUnicode_Check(unicode)) {
4654        PyErr_BadArgument();
4655        return NULL;
4656    }
4657
4658    if (PyUnicode_READY(unicode) == -1)
4659        return NULL;
4660
4661    if (PyUnicode_UTF8(unicode))
4662        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4663                                         PyUnicode_UTF8_LENGTH(unicode));
4664
4665    kind = PyUnicode_KIND(unicode);
4666    data = PyUnicode_DATA(unicode);
4667    size = PyUnicode_GET_LENGTH(unicode);
4668
4669    assert(size >= 0);
4670
4671    if (size <= MAX_SHORT_UNICHARS) {
4672        /* Write into the stack buffer; nallocated can't overflow.
4673         * At the end, we'll allocate exactly as much heap space as it
4674         * turns out we need.
4675         */
4676        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4677        result = NULL;   /* will allocate after we're done */
4678        p = stackbuf;
4679    }
4680    else {
4681        /* Overallocate on the heap, and give the excess back at the end. */
4682        nallocated = size * 4;
4683        if (nallocated / 4 != size)  /* overflow! */
4684            return PyErr_NoMemory();
4685        result = PyBytes_FromStringAndSize(NULL, nallocated);
4686        if (result == NULL)
4687            return NULL;
4688        p = PyBytes_AS_STRING(result);
4689    }
4690
4691    for (i = 0; i < size;) {
4692        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4693
4694        if (ch < 0x80)
4695            /* Encode ASCII */
4696            *p++ = (char) ch;
4697
4698        else if (ch < 0x0800) {
4699            /* Encode Latin-1 */
4700            *p++ = (char)(0xc0 | (ch >> 6));
4701            *p++ = (char)(0x80 | (ch & 0x3f));
4702        } else if (0xD800 <= ch && ch <= 0xDFFF) {
4703            Py_ssize_t newpos;
4704            PyObject *rep;
4705            Py_ssize_t repsize, k, startpos;
4706            startpos = i-1;
4707#if SIZEOF_WCHAR_T == 2
4708            startpos += wchar_offset;
4709#endif
4710            rep = unicode_encode_call_errorhandler(
4711                  errors, &errorHandler, "utf-8", "surrogates not allowed",
4712                  PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4713                  &exc, startpos, startpos+1, &newpos);
4714            if (!rep)
4715                goto error;
4716
4717            if (PyBytes_Check(rep))
4718                repsize = PyBytes_GET_SIZE(rep);
4719            else
4720                repsize = PyUnicode_GET_SIZE(rep);
4721
4722            if (repsize > 4) {
4723                Py_ssize_t offset;
4724
4725                if (result == NULL)
4726                    offset = p - stackbuf;
4727                else
4728                    offset = p - PyBytes_AS_STRING(result);
4729
4730                if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4731                    /* integer overflow */
4732                    PyErr_NoMemory();
4733                    goto error;
4734                }
4735                nallocated += repsize - 4;
4736                if (result != NULL) {
4737                    if (_PyBytes_Resize(&result, nallocated) < 0)
4738                        goto error;
4739                } else {
4740                    result = PyBytes_FromStringAndSize(NULL, nallocated);
4741                    if (result == NULL)
4742                        goto error;
4743                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4744                }
4745                p = PyBytes_AS_STRING(result) + offset;
4746            }
4747
4748            if (PyBytes_Check(rep)) {
4749                char *prep = PyBytes_AS_STRING(rep);
4750                for(k = repsize; k > 0; k--)
4751                    *p++ = *prep++;
4752            } else /* rep is unicode */ {
4753                const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4754                Py_UNICODE c;
4755
4756                for(k=0; k<repsize; k++) {
4757                    c = prep[k];
4758                    if (0x80 <= c) {
4759                        raise_encode_exception(&exc, "utf-8",
4760                                               PyUnicode_AS_UNICODE(unicode),
4761                                               size, i-1, i,
4762                                               "surrogates not allowed");
4763                        goto error;
4764                    }
4765                    *p++ = (char)prep[k];
4766                }
4767            }
4768            Py_DECREF(rep);
4769        } else if (ch < 0x10000) {
4770            *p++ = (char)(0xe0 | (ch >> 12));
4771            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4772            *p++ = (char)(0x80 | (ch & 0x3f));
4773        } else /* ch >= 0x10000 */ {
4774            /* Encode UCS4 Unicode ordinals */
4775            *p++ = (char)(0xf0 | (ch >> 18));
4776            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4777            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4778            *p++ = (char)(0x80 | (ch & 0x3f));
4779#if SIZEOF_WCHAR_T == 2
4780            wchar_offset++;
4781#endif
4782        }
4783    }
4784
4785    if (result == NULL) {
4786        /* This was stack allocated. */
4787        nneeded = p - stackbuf;
4788        assert(nneeded <= nallocated);
4789        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4790    }
4791    else {
4792        /* Cut back to size actually needed. */
4793        nneeded = p - PyBytes_AS_STRING(result);
4794        assert(nneeded <= nallocated);
4795        _PyBytes_Resize(&result, nneeded);
4796    }
4797
4798    Py_XDECREF(errorHandler);
4799    Py_XDECREF(exc);
4800    return result;
4801 error:
4802    Py_XDECREF(errorHandler);
4803    Py_XDECREF(exc);
4804    Py_XDECREF(result);
4805    return NULL;
4806
4807#undef MAX_SHORT_UNICHARS
4808}
4809
4810PyObject *
4811PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4812                     Py_ssize_t size,
4813                     const char *errors)
4814{
4815    PyObject *v, *unicode;
4816
4817    unicode = PyUnicode_FromUnicode(s, size);
4818    if (unicode == NULL)
4819        return NULL;
4820    v = _PyUnicode_AsUTF8String(unicode, errors);
4821    Py_DECREF(unicode);
4822    return v;
4823}
4824
4825PyObject *
4826PyUnicode_AsUTF8String(PyObject *unicode)
4827{
4828    return _PyUnicode_AsUTF8String(unicode, NULL);
4829}
4830
4831/* --- UTF-32 Codec ------------------------------------------------------- */
4832
4833PyObject *
4834PyUnicode_DecodeUTF32(const char *s,
4835                      Py_ssize_t size,
4836                      const char *errors,
4837                      int *byteorder)
4838{
4839    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4840}
4841
4842PyObject *
4843PyUnicode_DecodeUTF32Stateful(const char *s,
4844                              Py_ssize_t size,
4845                              const char *errors,
4846                              int *byteorder,
4847                              Py_ssize_t *consumed)
4848{
4849    const char *starts = s;
4850    Py_ssize_t startinpos;
4851    Py_ssize_t endinpos;
4852    Py_ssize_t outpos;
4853    PyUnicodeObject *unicode;
4854    Py_UNICODE *p;
4855#ifndef Py_UNICODE_WIDE
4856    int pairs = 0;
4857    const unsigned char *qq;
4858#else
4859    const int pairs = 0;
4860#endif
4861    const unsigned char *q, *e;
4862    int bo = 0;       /* assume native ordering by default */
4863    const char *errmsg = "";
4864    /* Offsets from q for retrieving bytes in the right order. */
4865#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4866    int iorder[] = {0, 1, 2, 3};
4867#else
4868    int iorder[] = {3, 2, 1, 0};
4869#endif
4870    PyObject *errorHandler = NULL;
4871    PyObject *exc = NULL;
4872
4873    q = (unsigned char *)s;
4874    e = q + size;
4875
4876    if (byteorder)
4877        bo = *byteorder;
4878
4879    /* Check for BOM marks (U+FEFF) in the input and adjust current
4880       byte order setting accordingly. In native mode, the leading BOM
4881       mark is skipped, in all other modes, it is copied to the output
4882       stream as-is (giving a ZWNBSP character). */
4883    if (bo == 0) {
4884        if (size >= 4) {
4885            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4886                (q[iorder[1]] << 8) | q[iorder[0]];
4887#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4888            if (bom == 0x0000FEFF) {
4889                q += 4;
4890                bo = -1;
4891            }
4892            else if (bom == 0xFFFE0000) {
4893                q += 4;
4894                bo = 1;
4895            }
4896#else
4897            if (bom == 0x0000FEFF) {
4898                q += 4;
4899                bo = 1;
4900            }
4901            else if (bom == 0xFFFE0000) {
4902                q += 4;
4903                bo = -1;
4904            }
4905#endif
4906        }
4907    }
4908
4909    if (bo == -1) {
4910        /* force LE */
4911        iorder[0] = 0;
4912        iorder[1] = 1;
4913        iorder[2] = 2;
4914        iorder[3] = 3;
4915    }
4916    else if (bo == 1) {
4917        /* force BE */
4918        iorder[0] = 3;
4919        iorder[1] = 2;
4920        iorder[2] = 1;
4921        iorder[3] = 0;
4922    }
4923
4924    /* On narrow builds we split characters outside the BMP into two
4925       codepoints => count how much extra space we need. */
4926#ifndef Py_UNICODE_WIDE
4927    for (qq = q; qq < e; qq += 4)
4928        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4929            pairs++;
4930#endif
4931
4932    /* This might be one to much, because of a BOM */
4933    unicode = _PyUnicode_New((size+3)/4+pairs);
4934    if (!unicode)
4935        return NULL;
4936    if (size == 0)
4937        return (PyObject *)unicode;
4938
4939    /* Unpack UTF-32 encoded data */
4940    p = PyUnicode_AS_UNICODE(unicode);
4941
4942    while (q < e) {
4943        Py_UCS4 ch;
4944        /* remaining bytes at the end? (size should be divisible by 4) */
4945        if (e-q<4) {
4946            if (consumed)
4947                break;
4948            errmsg = "truncated data";
4949            startinpos = ((const char *)q)-starts;
4950            endinpos = ((const char *)e)-starts;
4951            goto utf32Error;
4952            /* The remaining input chars are ignored if the callback
4953               chooses to skip the input */
4954        }
4955        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4956            (q[iorder[1]] << 8) | q[iorder[0]];
4957
4958        if (ch >= 0x110000)
4959        {
4960            errmsg = "codepoint not in range(0x110000)";
4961            startinpos = ((const char *)q)-starts;
4962            endinpos = startinpos+4;
4963            goto utf32Error;
4964        }
4965#ifndef Py_UNICODE_WIDE
4966        if (ch >= 0x10000)
4967        {
4968            *p++ = 0xD800 | ((ch-0x10000) >> 10);
4969            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4970        }
4971        else
4972#endif
4973            *p++ = ch;
4974        q += 4;
4975        continue;
4976      utf32Error:
4977        outpos = p-PyUnicode_AS_UNICODE(unicode);
4978        if (unicode_decode_call_errorhandler(
4979                errors, &errorHandler,
4980                "utf32", errmsg,
4981                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4982                &unicode, &outpos, &p))
4983            goto onError;
4984    }
4985
4986    if (byteorder)
4987        *byteorder = bo;
4988
4989    if (consumed)
4990        *consumed = (const char *)q-starts;
4991
4992    /* Adjust length */
4993    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
4994        goto onError;
4995
4996    Py_XDECREF(errorHandler);
4997    Py_XDECREF(exc);
4998#ifndef DONT_MAKE_RESULT_READY
4999    if (_PyUnicode_READY_REPLACE(&unicode)) {
5000        Py_DECREF(unicode);
5001        return NULL;
5002    }
5003#endif
5004    assert(_PyUnicode_CheckConsistency(unicode, 1));
5005    return (PyObject *)unicode;
5006
5007  onError:
5008    Py_DECREF(unicode);
5009    Py_XDECREF(errorHandler);
5010    Py_XDECREF(exc);
5011    return NULL;
5012}
5013
5014PyObject *
5015PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5016                      Py_ssize_t size,
5017                      const char *errors,
5018                      int byteorder)
5019{
5020    PyObject *v;
5021    unsigned char *p;
5022    Py_ssize_t nsize, bytesize;
5023#ifndef Py_UNICODE_WIDE
5024    Py_ssize_t i, pairs;
5025#else
5026    const int pairs = 0;
5027#endif
5028    /* Offsets from p for storing byte pairs in the right order. */
5029#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5030    int iorder[] = {0, 1, 2, 3};
5031#else
5032    int iorder[] = {3, 2, 1, 0};
5033#endif
5034
5035#define STORECHAR(CH)                           \
5036    do {                                        \
5037        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5038        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5039        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5040        p[iorder[0]] = (CH) & 0xff;             \
5041        p += 4;                                 \
5042    } while(0)
5043
5044    /* In narrow builds we can output surrogate pairs as one codepoint,
5045       so we need less space. */
5046#ifndef Py_UNICODE_WIDE
5047    for (i = pairs = 0; i < size-1; i++)
5048        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5049            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5050            pairs++;
5051#endif
5052    nsize = (size - pairs + (byteorder == 0));
5053    bytesize = nsize * 4;
5054    if (bytesize / 4 != nsize)
5055        return PyErr_NoMemory();
5056    v = PyBytes_FromStringAndSize(NULL, bytesize);
5057    if (v == NULL)
5058        return NULL;
5059
5060    p = (unsigned char *)PyBytes_AS_STRING(v);
5061    if (byteorder == 0)
5062        STORECHAR(0xFEFF);
5063    if (size == 0)
5064        goto done;
5065
5066    if (byteorder == -1) {
5067        /* force LE */
5068        iorder[0] = 0;
5069        iorder[1] = 1;
5070        iorder[2] = 2;
5071        iorder[3] = 3;
5072    }
5073    else if (byteorder == 1) {
5074        /* force BE */
5075        iorder[0] = 3;
5076        iorder[1] = 2;
5077        iorder[2] = 1;
5078        iorder[3] = 0;
5079    }
5080
5081    while (size-- > 0) {
5082        Py_UCS4 ch = *s++;
5083#ifndef Py_UNICODE_WIDE
5084        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5085            Py_UCS4 ch2 = *s;
5086            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5087                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5088                s++;
5089                size--;
5090            }
5091        }
5092#endif
5093        STORECHAR(ch);
5094    }
5095
5096  done:
5097    return v;
5098#undef STORECHAR
5099}
5100
5101PyObject *
5102PyUnicode_AsUTF32String(PyObject *unicode)
5103{
5104    if (!PyUnicode_Check(unicode)) {
5105        PyErr_BadArgument();
5106        return NULL;
5107    }
5108    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
5109                                 PyUnicode_GET_SIZE(unicode),
5110                                 NULL,
5111                                 0);
5112}
5113
5114/* --- UTF-16 Codec ------------------------------------------------------- */
5115
5116PyObject *
5117PyUnicode_DecodeUTF16(const char *s,
5118                      Py_ssize_t size,
5119                      const char *errors,
5120                      int *byteorder)
5121{
5122    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5123}
5124
5125/* Two masks for fast checking of whether a C 'long' may contain
5126   UTF16-encoded surrogate characters. This is an efficient heuristic,
5127   assuming that non-surrogate characters with a code point >= 0x8000 are
5128   rare in most input.
5129   FAST_CHAR_MASK is used when the input is in native byte ordering,
5130   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
5131*/
5132#if (SIZEOF_LONG == 8)
5133# define FAST_CHAR_MASK         0x8000800080008000L
5134# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5135#elif (SIZEOF_LONG == 4)
5136# define FAST_CHAR_MASK         0x80008000L
5137# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5138#else
5139# error C 'long' size should be either 4 or 8!
5140#endif
5141
5142PyObject *
5143PyUnicode_DecodeUTF16Stateful(const char *s,
5144                              Py_ssize_t size,
5145                              const char *errors,
5146                              int *byteorder,
5147                              Py_ssize_t *consumed)
5148{
5149    const char *starts = s;
5150    Py_ssize_t startinpos;
5151    Py_ssize_t endinpos;
5152    Py_ssize_t outpos;
5153    PyUnicodeObject *unicode;
5154    Py_UNICODE *p;
5155    const unsigned char *q, *e, *aligned_end;
5156    int bo = 0;       /* assume native ordering by default */
5157    int native_ordering = 0;
5158    const char *errmsg = "";
5159    /* Offsets from q for retrieving byte pairs in the right order. */
5160#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5161    int ihi = 1, ilo = 0;
5162#else
5163    int ihi = 0, ilo = 1;
5164#endif
5165    PyObject *errorHandler = NULL;
5166    PyObject *exc = NULL;
5167
5168    /* Note: size will always be longer than the resulting Unicode
5169       character count */
5170    unicode = _PyUnicode_New(size);
5171    if (!unicode)
5172        return NULL;
5173    if (size == 0)
5174        return (PyObject *)unicode;
5175
5176    /* Unpack UTF-16 encoded data */
5177    p = PyUnicode_AS_UNICODE(unicode);
5178    q = (unsigned char *)s;
5179    e = q + size - 1;
5180
5181    if (byteorder)
5182        bo = *byteorder;
5183
5184    /* Check for BOM marks (U+FEFF) in the input and adjust current
5185       byte order setting accordingly. In native mode, the leading BOM
5186       mark is skipped, in all other modes, it is copied to the output
5187       stream as-is (giving a ZWNBSP character). */
5188    if (bo == 0) {
5189        if (size >= 2) {
5190            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
5191#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5192            if (bom == 0xFEFF) {
5193                q += 2;
5194                bo = -1;
5195            }
5196            else if (bom == 0xFFFE) {
5197                q += 2;
5198                bo = 1;
5199            }
5200#else
5201            if (bom == 0xFEFF) {
5202                q += 2;
5203                bo = 1;
5204            }
5205            else if (bom == 0xFFFE) {
5206                q += 2;
5207                bo = -1;
5208            }
5209#endif
5210        }
5211    }
5212
5213    if (bo == -1) {
5214        /* force LE */
5215        ihi = 1;
5216        ilo = 0;
5217    }
5218    else if (bo == 1) {
5219        /* force BE */
5220        ihi = 0;
5221        ilo = 1;
5222    }
5223#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5224    native_ordering = ilo < ihi;
5225#else
5226    native_ordering = ilo > ihi;
5227#endif
5228
5229    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
5230    while (q < e) {
5231        Py_UNICODE ch;
5232        /* First check for possible aligned read of a C 'long'. Unaligned
5233           reads are more expensive, better to defer to another iteration. */
5234        if (!((size_t) q & LONG_PTR_MASK)) {
5235            /* Fast path for runs of non-surrogate chars. */
5236            register const unsigned char *_q = q;
5237            Py_UNICODE *_p = p;
5238            if (native_ordering) {
5239                /* Native ordering is simple: as long as the input cannot
5240                   possibly contain a surrogate char, do an unrolled copy
5241                   of several 16-bit code points to the target object.
5242                   The non-surrogate check is done on several input bytes
5243                   at a time (as many as a C 'long' can contain). */
5244                while (_q < aligned_end) {
5245                    unsigned long data = * (unsigned long *) _q;
5246                    if (data & FAST_CHAR_MASK)
5247                        break;
5248                    _p[0] = ((unsigned short *) _q)[0];
5249                    _p[1] = ((unsigned short *) _q)[1];
5250#if (SIZEOF_LONG == 8)
5251                    _p[2] = ((unsigned short *) _q)[2];
5252                    _p[3] = ((unsigned short *) _q)[3];
5253#endif
5254                    _q += SIZEOF_LONG;
5255                    _p += SIZEOF_LONG / 2;
5256                }
5257            }
5258            else {
5259                /* Byteswapped ordering is similar, but we must decompose
5260                   the copy bytewise, and take care of zero'ing out the
5261                   upper bytes if the target object is in 32-bit units
5262                   (that is, in UCS-4 builds). */
5263                while (_q < aligned_end) {
5264                    unsigned long data = * (unsigned long *) _q;
5265                    if (data & SWAPPED_FAST_CHAR_MASK)
5266                        break;
5267                    /* Zero upper bytes in UCS-4 builds */
5268#if (Py_UNICODE_SIZE > 2)
5269                    _p[0] = 0;
5270                    _p[1] = 0;
5271#if (SIZEOF_LONG == 8)
5272                    _p[2] = 0;
5273                    _p[3] = 0;
5274#endif
5275#endif
5276                    /* Issue #4916; UCS-4 builds on big endian machines must
5277                       fill the two last bytes of each 4-byte unit. */
5278#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5279# define OFF 2
5280#else
5281# define OFF 0
5282#endif
5283                    ((unsigned char *) _p)[OFF + 1] = _q[0];
5284                    ((unsigned char *) _p)[OFF + 0] = _q[1];
5285                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5286                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5287#if (SIZEOF_LONG == 8)
5288                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5289                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5290                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5291                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5292#endif
5293#undef OFF
5294                    _q += SIZEOF_LONG;
5295                    _p += SIZEOF_LONG / 2;
5296                }
5297            }
5298            p = _p;
5299            q = _q;
5300            if (q >= e)
5301                break;
5302        }
5303        ch = (q[ihi] << 8) | q[ilo];
5304
5305        q += 2;
5306
5307        if (ch < 0xD800 || ch > 0xDFFF) {
5308            *p++ = ch;
5309            continue;
5310        }
5311
5312        /* UTF-16 code pair: */
5313        if (q > e) {
5314            errmsg = "unexpected end of data";
5315            startinpos = (((const char *)q) - 2) - starts;
5316            endinpos = ((const char *)e) + 1 - starts;
5317            goto utf16Error;
5318        }
5319        if (0xD800 <= ch && ch <= 0xDBFF) {
5320            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5321            q += 2;
5322            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5323#ifndef Py_UNICODE_WIDE
5324                *p++ = ch;
5325                *p++ = ch2;
5326#else
5327                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5328#endif
5329                continue;
5330            }
5331            else {
5332                errmsg = "illegal UTF-16 surrogate";
5333                startinpos = (((const char *)q)-4)-starts;
5334                endinpos = startinpos+2;
5335                goto utf16Error;
5336            }
5337
5338        }
5339        errmsg = "illegal encoding";
5340        startinpos = (((const char *)q)-2)-starts;
5341        endinpos = startinpos+2;
5342        /* Fall through to report the error */
5343
5344      utf16Error:
5345        outpos = p - PyUnicode_AS_UNICODE(unicode);
5346        if (unicode_decode_call_errorhandler(
5347                errors,
5348                &errorHandler,
5349                "utf16", errmsg,
5350                &starts,
5351                (const char **)&e,
5352                &startinpos,
5353                &endinpos,
5354                &exc,
5355                (const char **)&q,
5356                &unicode,
5357                &outpos,
5358                &p))
5359            goto onError;
5360    }
5361    /* remaining byte at the end? (size should be even) */
5362    if (e == q) {
5363        if (!consumed) {
5364            errmsg = "truncated data";
5365            startinpos = ((const char *)q) - starts;
5366            endinpos = ((const char *)e) + 1 - starts;
5367            outpos = p - PyUnicode_AS_UNICODE(unicode);
5368            if (unicode_decode_call_errorhandler(
5369                    errors,
5370                    &errorHandler,
5371                    "utf16", errmsg,
5372                    &starts,
5373                    (const char **)&e,
5374                    &startinpos,
5375                    &endinpos,
5376                    &exc,
5377                    (const char **)&q,
5378                    &unicode,
5379                    &outpos,
5380                    &p))
5381                goto onError;
5382            /* The remaining input chars are ignored if the callback
5383               chooses to skip the input */
5384        }
5385    }
5386
5387    if (byteorder)
5388        *byteorder = bo;
5389
5390    if (consumed)
5391        *consumed = (const char *)q-starts;
5392
5393    /* Adjust length */
5394    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
5395        goto onError;
5396
5397    Py_XDECREF(errorHandler);
5398    Py_XDECREF(exc);
5399#ifndef DONT_MAKE_RESULT_READY
5400    if (_PyUnicode_READY_REPLACE(&unicode)) {
5401        Py_DECREF(unicode);
5402        return NULL;
5403    }
5404#endif
5405    assert(_PyUnicode_CheckConsistency(unicode, 1));
5406    return (PyObject *)unicode;
5407
5408  onError:
5409    Py_DECREF(unicode);
5410    Py_XDECREF(errorHandler);
5411    Py_XDECREF(exc);
5412    return NULL;
5413}
5414
5415#undef FAST_CHAR_MASK
5416#undef SWAPPED_FAST_CHAR_MASK
5417
5418PyObject *
5419PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5420                      Py_ssize_t size,
5421                      const char *errors,
5422                      int byteorder)
5423{
5424    PyObject *v;
5425    unsigned char *p;
5426    Py_ssize_t nsize, bytesize;
5427#ifdef Py_UNICODE_WIDE
5428    Py_ssize_t i, pairs;
5429#else
5430    const int pairs = 0;
5431#endif
5432    /* Offsets from p for storing byte pairs in the right order. */
5433#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5434    int ihi = 1, ilo = 0;
5435#else
5436    int ihi = 0, ilo = 1;
5437#endif
5438
5439#define STORECHAR(CH)                           \
5440    do {                                        \
5441        p[ihi] = ((CH) >> 8) & 0xff;            \
5442        p[ilo] = (CH) & 0xff;                   \
5443        p += 2;                                 \
5444    } while(0)
5445
5446#ifdef Py_UNICODE_WIDE
5447    for (i = pairs = 0; i < size; i++)
5448        if (s[i] >= 0x10000)
5449            pairs++;
5450#endif
5451    /* 2 * (size + pairs + (byteorder == 0)) */
5452    if (size > PY_SSIZE_T_MAX ||
5453        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5454        return PyErr_NoMemory();
5455    nsize = size + pairs + (byteorder == 0);
5456    bytesize = nsize * 2;
5457    if (bytesize / 2 != nsize)
5458        return PyErr_NoMemory();
5459    v = PyBytes_FromStringAndSize(NULL, bytesize);
5460    if (v == NULL)
5461        return NULL;
5462
5463    p = (unsigned char *)PyBytes_AS_STRING(v);
5464    if (byteorder == 0)
5465        STORECHAR(0xFEFF);
5466    if (size == 0)
5467        goto done;
5468
5469    if (byteorder == -1) {
5470        /* force LE */
5471        ihi = 1;
5472        ilo = 0;
5473    }
5474    else if (byteorder == 1) {
5475        /* force BE */
5476        ihi = 0;
5477        ilo = 1;
5478    }
5479
5480    while (size-- > 0) {
5481        Py_UNICODE ch = *s++;
5482        Py_UNICODE ch2 = 0;
5483#ifdef Py_UNICODE_WIDE
5484        if (ch >= 0x10000) {
5485            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5486            ch  = 0xD800 | ((ch-0x10000) >> 10);
5487        }
5488#endif
5489        STORECHAR(ch);
5490        if (ch2)
5491            STORECHAR(ch2);
5492    }
5493
5494  done:
5495    return v;
5496#undef STORECHAR
5497}
5498
5499PyObject *
5500PyUnicode_AsUTF16String(PyObject *unicode)
5501{
5502    if (!PyUnicode_Check(unicode)) {
5503        PyErr_BadArgument();
5504        return NULL;
5505    }
5506    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
5507                                 PyUnicode_GET_SIZE(unicode),
5508                                 NULL,
5509                                 0);
5510}
5511
5512/* --- Unicode Escape Codec ----------------------------------------------- */
5513
5514/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5515   if all the escapes in the string make it still a valid ASCII string.
5516   Returns -1 if any escapes were found which cause the string to
5517   pop out of ASCII range.  Otherwise returns the length of the
5518   required buffer to hold the string.
5519   */
5520Py_ssize_t
5521length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5522{
5523    const unsigned char *p = (const unsigned char *)s;
5524    const unsigned char *end = p + size;
5525    Py_ssize_t length = 0;
5526
5527    if (size < 0)
5528        return -1;
5529
5530    for (; p < end; ++p) {
5531        if (*p > 127) {
5532            /* Non-ASCII */
5533            return -1;
5534        }
5535        else if (*p != '\\') {
5536            /* Normal character */
5537            ++length;
5538        }
5539        else {
5540            /* Backslash-escape, check next char */
5541            ++p;
5542            /* Escape sequence reaches till end of string or
5543               non-ASCII follow-up. */
5544            if (p >= end || *p > 127)
5545                return -1;
5546            switch (*p) {
5547            case '\n':
5548                /* backslash + \n result in zero characters */
5549                break;
5550            case '\\': case '\'': case '\"':
5551            case 'b': case 'f': case 't':
5552            case 'n': case 'r': case 'v': case 'a':
5553                ++length;
5554                break;
5555            case '0': case '1': case '2': case '3':
5556            case '4': case '5': case '6': case '7':
5557            case 'x': case 'u': case 'U': case 'N':
5558                /* these do not guarantee ASCII characters */
5559                return -1;
5560            default:
5561                /* count the backslash + the other character */
5562                length += 2;
5563            }
5564        }
5565    }
5566    return length;
5567}
5568
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   When `kind` is PyUnicode_WCHAR_KIND, `buf` is treated as a Py_UNICODE
   array; for any other kind it is treated as an array of bytes and
   `value` is truncated to unsigned char.  `index` and `value` are each
   evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` into the Py_UNICODE array `buf` at `index`.

   NOTE: this macro relies on a variable named `kind` being in scope at
   the point of use; it asserts that the buffer really is a wstr buffer.
   The expansion is a single parenthesized expression so that it behaves
   like one statement/expression wherever it is used. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5582
5583
5584static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5585
5586PyObject *
5587PyUnicode_DecodeUnicodeEscape(const char *s,
5588                              Py_ssize_t size,
5589                              const char *errors)
5590{
5591    const char *starts = s;
5592    Py_ssize_t startinpos;
5593    Py_ssize_t endinpos;
5594    int j;
5595    PyUnicodeObject *v;
5596    Py_UNICODE *p;
5597    const char *end;
5598    char* message;
5599    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5600    PyObject *errorHandler = NULL;
5601    PyObject *exc = NULL;
5602    Py_ssize_t ascii_length;
5603    Py_ssize_t i;
5604    int kind;
5605    void *data;
5606
5607    ascii_length = length_of_escaped_ascii_string(s, size);
5608
5609    /* After length_of_escaped_ascii_string() there are two alternatives,
5610       either the string is pure ASCII with named escapes like \n, etc.
5611       and we determined it's exact size (common case)
5612       or it contains \x, \u, ... escape sequences.  then we create a
5613       legacy wchar string and resize it at the end of this function. */
5614    if (ascii_length >= 0) {
5615        v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5616        if (!v)
5617            goto onError;
5618        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5619        kind = PyUnicode_1BYTE_KIND;
5620        data = PyUnicode_DATA(v);
5621    }
5622    else {
5623        /* Escaped strings will always be longer than the resulting
5624           Unicode string, so we start with size here and then reduce the
5625           length after conversion to the true value.
5626           (but if the error callback returns a long replacement string
5627           we'll have to allocate more space) */
5628        v = _PyUnicode_New(size);
5629        if (!v)
5630            goto onError;
5631        kind = PyUnicode_WCHAR_KIND;
5632        data = PyUnicode_AS_UNICODE(v);
5633    }
5634
5635    if (size == 0)
5636        return (PyObject *)v;
5637    i = 0;
5638    end = s + size;
5639
5640    while (s < end) {
5641        unsigned char c;
5642        Py_UNICODE x;
5643        int digits;
5644
5645        if (kind == PyUnicode_WCHAR_KIND) {
5646            assert(i < _PyUnicode_WSTR_LENGTH(v));
5647        }
5648        else {
5649            /* The only case in which i == ascii_length is a backslash
5650               followed by a newline. */
5651            assert(i <= ascii_length);
5652        }
5653
5654        /* Non-escape characters are interpreted as Unicode ordinals */
5655        if (*s != '\\') {
5656            WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
5657            continue;
5658        }
5659
5660        startinpos = s-starts;
5661        /* \ - Escapes */
5662        s++;
5663        c = *s++;
5664        if (s > end)
5665            c = '\0'; /* Invalid after \ */
5666
5667        if (kind == PyUnicode_WCHAR_KIND) {
5668            assert(i < _PyUnicode_WSTR_LENGTH(v));
5669        }
5670        else {
5671            /* The only case in which i == ascii_length is a backslash
5672               followed by a newline. */
5673            assert(i < ascii_length || (i == ascii_length && c == '\n'));
5674        }
5675
5676        switch (c) {
5677
5678            /* \x escapes */
5679        case '\n': break;
5680        case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5681        case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5682        case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5683        case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5684        /* FF */
5685        case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5686        case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5687        case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5688        case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5689        /* VT */
5690        case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5691        /* BEL, not classic C */
5692        case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
5693
5694            /* \OOO (octal) escapes */
5695        case '0': case '1': case '2': case '3':
5696        case '4': case '5': case '6': case '7':
5697            x = s[-1] - '0';
5698            if (s < end && '0' <= *s && *s <= '7') {
5699                x = (x<<3) + *s++ - '0';
5700                if (s < end && '0' <= *s && *s <= '7')
5701                    x = (x<<3) + *s++ - '0';
5702            }
5703            WRITE_WSTR(data, i++, x);
5704            break;
5705
5706            /* hex escapes */
5707            /* \xXX */
5708        case 'x':
5709            digits = 2;
5710            message = "truncated \\xXX escape";
5711            goto hexescape;
5712
5713            /* \uXXXX */
5714        case 'u':
5715            digits = 4;
5716            message = "truncated \\uXXXX escape";
5717            goto hexescape;
5718
5719            /* \UXXXXXXXX */
5720        case 'U':
5721            digits = 8;
5722            message = "truncated \\UXXXXXXXX escape";
5723        hexescape:
5724            chr = 0;
5725            p = PyUnicode_AS_UNICODE(v) + i;
5726            if (s+digits>end) {
5727                endinpos = size;
5728                if (unicode_decode_call_errorhandler(
5729                        errors, &errorHandler,
5730                        "unicodeescape", "end of string in escape sequence",
5731                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5732                        &v, &i, &p))
5733                    goto onError;
5734                data = PyUnicode_AS_UNICODE(v);
5735                goto nextByte;
5736            }
5737            for (j = 0; j < digits; ++j) {
5738                c = (unsigned char) s[j];
5739                if (!Py_ISXDIGIT(c)) {
5740                    endinpos = (s+j+1)-starts;
5741                    p = PyUnicode_AS_UNICODE(v) + i;
5742                    if (unicode_decode_call_errorhandler(
5743                            errors, &errorHandler,
5744                            "unicodeescape", message,
5745                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5746                            &v, &i, &p))
5747                        goto onError;
5748                    data = PyUnicode_AS_UNICODE(v);
5749                    goto nextByte;
5750                }
5751                chr = (chr<<4) & ~0xF;
5752                if (c >= '0' && c <= '9')
5753                    chr += c - '0';
5754                else if (c >= 'a' && c <= 'f')
5755                    chr += 10 + c - 'a';
5756                else
5757                    chr += 10 + c - 'A';
5758            }
5759            s += j;
5760            if (chr == 0xffffffff && PyErr_Occurred())
5761                /* _decoding_error will have already written into the
5762                   target buffer. */
5763                break;
5764        store:
5765            /* when we get here, chr is a 32-bit unicode character */
5766            if (chr <= 0xffff)
5767                /* UCS-2 character */
5768                WRITE_WSTR(data, i++, chr);
5769            else if (chr <= 0x10ffff) {
5770                /* UCS-4 character. Either store directly, or as
5771                   surrogate pair. */
5772#ifdef Py_UNICODE_WIDE
5773                WRITE_WSTR(data, i++, chr);
5774#else
5775                chr -= 0x10000L;
5776                WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5777                WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
5778#endif
5779            } else {
5780                endinpos = s-starts;
5781                p = PyUnicode_AS_UNICODE(v) + i;
5782                if (unicode_decode_call_errorhandler(
5783                        errors, &errorHandler,
5784                        "unicodeescape", "illegal Unicode character",
5785                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5786                        &v, &i, &p))
5787                    goto onError;
5788                data = PyUnicode_AS_UNICODE(v);
5789            }
5790            break;
5791
5792            /* \N{name} */
5793        case 'N':
5794            message = "malformed \\N character escape";
5795            if (ucnhash_CAPI == NULL) {
5796                /* load the unicode data module */
5797                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5798                                                PyUnicodeData_CAPSULE_NAME, 1);
5799                if (ucnhash_CAPI == NULL)
5800                    goto ucnhashError;
5801            }
5802            if (*s == '{') {
5803                const char *start = s+1;
5804                /* look for the closing brace */
5805                while (*s != '}' && s < end)
5806                    s++;
5807                if (s > start && s < end && *s == '}') {
5808                    /* found a name.  look it up in the unicode database */
5809                    message = "unknown Unicode character name";
5810                    s++;
5811                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5812                                              &chr))
5813                        goto store;
5814                }
5815            }
5816            endinpos = s-starts;
5817            p = PyUnicode_AS_UNICODE(v) + i;
5818            if (unicode_decode_call_errorhandler(
5819                    errors, &errorHandler,
5820                    "unicodeescape", message,
5821                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5822                    &v, &i, &p))
5823                goto onError;
5824            data = PyUnicode_AS_UNICODE(v);
5825            break;
5826
5827        default:
5828            if (s > end) {
5829                assert(kind == PyUnicode_WCHAR_KIND);
5830                message = "\\ at end of string";
5831                s--;
5832                endinpos = s-starts;
5833                p = PyUnicode_AS_UNICODE(v) + i;
5834                if (unicode_decode_call_errorhandler(
5835                        errors, &errorHandler,
5836                        "unicodeescape", message,
5837                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5838                        &v, &i, &p))
5839                    goto onError;
5840                data = PyUnicode_AS_UNICODE(v);
5841            }
5842            else {
5843                WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5844                WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
5845            }
5846            break;
5847        }
5848      nextByte:
5849        ;
5850    }
5851    /* Ensure the length prediction worked in case of ASCII strings */
5852    assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5853
5854    if (kind == PyUnicode_WCHAR_KIND)
5855    {
5856        if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5857            goto onError;
5858    }
5859    Py_XDECREF(errorHandler);
5860    Py_XDECREF(exc);
5861#ifndef DONT_MAKE_RESULT_READY
5862    if (_PyUnicode_READY_REPLACE(&v)) {
5863        Py_DECREF(v);
5864        return NULL;
5865    }
5866#endif
5867    assert(_PyUnicode_CheckConsistency(v, 1));
5868    return (PyObject *)v;
5869
5870  ucnhashError:
5871    PyErr_SetString(
5872        PyExc_UnicodeError,
5873        "\\N escapes not supported (can't load unicodedata module)"
5874        );
5875    Py_XDECREF(v);
5876    Py_XDECREF(errorHandler);
5877    Py_XDECREF(exc);
5878    return NULL;
5879
5880  onError:
5881    Py_XDECREF(v);
5882    Py_XDECREF(errorHandler);
5883    Py_XDECREF(exc);
5884    return NULL;
5885}
5886
5887#undef WRITE_ASCII_OR_WSTR
5888#undef WRITE_WSTR
5889
5890/* Return a Unicode-Escape string version of the Unicode object.
5891
5892   If quotes is true, the string is enclosed in u"" or u'' quotes as
5893   appropriate.
5894
5895*/
5896
5897static const char *hexdigits = "0123456789abcdef";
5898
5899PyObject *
5900PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5901                              Py_ssize_t size)
5902{
5903    PyObject *repr;
5904    char *p;
5905
5906#ifdef Py_UNICODE_WIDE
5907    const Py_ssize_t expandsize = 10;
5908#else
5909    const Py_ssize_t expandsize = 6;
5910#endif
5911
5912    /* XXX(nnorwitz): rather than over-allocating, it would be
5913       better to choose a different scheme.  Perhaps scan the
5914       first N-chars of the string and allocate based on that size.
5915    */
5916    /* Initial allocation is based on the longest-possible unichr
5917       escape.
5918
5919       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5920       unichr, so in this case it's the longest unichr escape. In
5921       narrow (UTF-16) builds this is five chars per source unichr
5922       since there are two unichrs in the surrogate pair, so in narrow
5923       (UTF-16) builds it's not the longest unichr escape.
5924
5925       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5926       so in the narrow (UTF-16) build case it's the longest unichr
5927       escape.
5928    */
5929
5930    if (size == 0)
5931        return PyBytes_FromStringAndSize(NULL, 0);
5932
5933    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5934        return PyErr_NoMemory();
5935
5936    repr = PyBytes_FromStringAndSize(NULL,
5937                                     2
5938                                     + expandsize*size
5939                                     + 1);
5940    if (repr == NULL)
5941        return NULL;
5942
5943    p = PyBytes_AS_STRING(repr);
5944
5945    while (size-- > 0) {
5946        Py_UNICODE ch = *s++;
5947
5948        /* Escape backslashes */
5949        if (ch == '\\') {
5950            *p++ = '\\';
5951            *p++ = (char) ch;
5952            continue;
5953        }
5954
5955#ifdef Py_UNICODE_WIDE
5956        /* Map 21-bit characters to '\U00xxxxxx' */
5957        else if (ch >= 0x10000) {
5958            *p++ = '\\';
5959            *p++ = 'U';
5960            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5961            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5962            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5963            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5964            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5965            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5966            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5967            *p++ = hexdigits[ch & 0x0000000F];
5968            continue;
5969        }
5970#else
5971        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5972        else if (ch >= 0xD800 && ch < 0xDC00) {
5973            Py_UNICODE ch2;
5974            Py_UCS4 ucs;
5975
5976            ch2 = *s++;
5977            size--;
5978            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5979                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5980                *p++ = '\\';
5981                *p++ = 'U';
5982                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5983                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5984                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5985                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5986                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5987                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5988                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5989                *p++ = hexdigits[ucs & 0x0000000F];
5990                continue;
5991            }
5992            /* Fall through: isolated surrogates are copied as-is */
5993            s--;
5994            size++;
5995        }
5996#endif
5997
5998        /* Map 16-bit characters to '\uxxxx' */
5999        if (ch >= 256) {
6000            *p++ = '\\';
6001            *p++ = 'u';
6002            *p++ = hexdigits[(ch >> 12) & 0x000F];
6003            *p++ = hexdigits[(ch >> 8) & 0x000F];
6004            *p++ = hexdigits[(ch >> 4) & 0x000F];
6005            *p++ = hexdigits[ch & 0x000F];
6006        }
6007
6008        /* Map special whitespace to '\t', \n', '\r' */
6009        else if (ch == '\t') {
6010            *p++ = '\\';
6011            *p++ = 't';
6012        }
6013        else if (ch == '\n') {
6014            *p++ = '\\';
6015            *p++ = 'n';
6016        }
6017        else if (ch == '\r') {
6018            *p++ = '\\';
6019            *p++ = 'r';
6020        }
6021
6022        /* Map non-printable US ASCII to '\xhh' */
6023        else if (ch < ' ' || ch >= 0x7F) {
6024            *p++ = '\\';
6025            *p++ = 'x';
6026            *p++ = hexdigits[(ch >> 4) & 0x000F];
6027            *p++ = hexdigits[ch & 0x000F];
6028        }
6029
6030        /* Copy everything else as-is */
6031        else
6032            *p++ = (char) ch;
6033    }
6034
6035    assert(p - PyBytes_AS_STRING(repr) > 0);
6036    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6037        return NULL;
6038    return repr;
6039}
6040
6041PyObject *
6042PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6043{
6044    PyObject *s;
6045    if (!PyUnicode_Check(unicode)) {
6046        PyErr_BadArgument();
6047        return NULL;
6048    }
6049    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6050                                      PyUnicode_GET_SIZE(unicode));
6051    return s;
6052}
6053
6054/* --- Raw Unicode Escape Codec ------------------------------------------- */
6055
6056PyObject *
6057PyUnicode_DecodeRawUnicodeEscape(const char *s,
6058                                 Py_ssize_t size,
6059                                 const char *errors)
6060{
6061    const char *starts = s;
6062    Py_ssize_t startinpos;
6063    Py_ssize_t endinpos;
6064    Py_ssize_t outpos;
6065    PyUnicodeObject *v;
6066    Py_UNICODE *p;
6067    const char *end;
6068    const char *bs;
6069    PyObject *errorHandler = NULL;
6070    PyObject *exc = NULL;
6071
6072    /* Escaped strings will always be longer than the resulting
6073       Unicode string, so we start with size here and then reduce the
6074       length after conversion to the true value. (But decoding error
6075       handler might have to resize the string) */
6076    v = _PyUnicode_New(size);
6077    if (v == NULL)
6078        goto onError;
6079    if (size == 0)
6080        return (PyObject *)v;
6081    p = PyUnicode_AS_UNICODE(v);
6082    end = s + size;
6083    while (s < end) {
6084        unsigned char c;
6085        Py_UCS4 x;
6086        int i;
6087        int count;
6088
6089        /* Non-escape characters are interpreted as Unicode ordinals */
6090        if (*s != '\\') {
6091            *p++ = (unsigned char)*s++;
6092            continue;
6093        }
6094        startinpos = s-starts;
6095
6096        /* \u-escapes are only interpreted iff the number of leading
6097           backslashes if odd */
6098        bs = s;
6099        for (;s < end;) {
6100            if (*s != '\\')
6101                break;
6102            *p++ = (unsigned char)*s++;
6103        }
6104        if (((s - bs) & 1) == 0 ||
6105            s >= end ||
6106            (*s != 'u' && *s != 'U')) {
6107            continue;
6108        }
6109        p--;
6110        count = *s=='u' ? 4 : 8;
6111        s++;
6112
6113        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6114        outpos = p-PyUnicode_AS_UNICODE(v);
6115        for (x = 0, i = 0; i < count; ++i, ++s) {
6116            c = (unsigned char)*s;
6117            if (!Py_ISXDIGIT(c)) {
6118                endinpos = s-starts;
6119                if (unicode_decode_call_errorhandler(
6120                        errors, &errorHandler,
6121                        "rawunicodeescape", "truncated \\uXXXX",
6122                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6123                        &v, &outpos, &p))
6124                    goto onError;
6125                goto nextByte;
6126            }
6127            x = (x<<4) & ~0xF;
6128            if (c >= '0' && c <= '9')
6129                x += c - '0';
6130            else if (c >= 'a' && c <= 'f')
6131                x += 10 + c - 'a';
6132            else
6133                x += 10 + c - 'A';
6134        }
6135        if (x <= 0xffff)
6136            /* UCS-2 character */
6137            *p++ = (Py_UNICODE) x;
6138        else if (x <= 0x10ffff) {
6139            /* UCS-4 character. Either store directly, or as
6140               surrogate pair. */
6141#ifdef Py_UNICODE_WIDE
6142            *p++ = (Py_UNICODE) x;
6143#else
6144            x -= 0x10000L;
6145            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6146            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
6147#endif
6148        } else {
6149            endinpos = s-starts;
6150            outpos = p-PyUnicode_AS_UNICODE(v);
6151            if (unicode_decode_call_errorhandler(
6152                    errors, &errorHandler,
6153                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6154                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6155                    &v, &outpos, &p))
6156                goto onError;
6157        }
6158      nextByte:
6159        ;
6160    }
6161    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6162        goto onError;
6163    Py_XDECREF(errorHandler);
6164    Py_XDECREF(exc);
6165#ifndef DONT_MAKE_RESULT_READY
6166    if (_PyUnicode_READY_REPLACE(&v)) {
6167        Py_DECREF(v);
6168        return NULL;
6169    }
6170#endif
6171    assert(_PyUnicode_CheckConsistency(v, 1));
6172    return (PyObject *)v;
6173
6174  onError:
6175    Py_XDECREF(v);
6176    Py_XDECREF(errorHandler);
6177    Py_XDECREF(exc);
6178    return NULL;
6179}
6180
6181PyObject *
6182PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6183                                 Py_ssize_t size)
6184{
6185    PyObject *repr;
6186    char *p;
6187    char *q;
6188
6189#ifdef Py_UNICODE_WIDE
6190    const Py_ssize_t expandsize = 10;
6191#else
6192    const Py_ssize_t expandsize = 6;
6193#endif
6194
6195    if (size > PY_SSIZE_T_MAX / expandsize)
6196        return PyErr_NoMemory();
6197
6198    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
6199    if (repr == NULL)
6200        return NULL;
6201    if (size == 0)
6202        return repr;
6203
6204    p = q = PyBytes_AS_STRING(repr);
6205    while (size-- > 0) {
6206        Py_UNICODE ch = *s++;
6207#ifdef Py_UNICODE_WIDE
6208        /* Map 32-bit characters to '\Uxxxxxxxx' */
6209        if (ch >= 0x10000) {
6210            *p++ = '\\';
6211            *p++ = 'U';
6212            *p++ = hexdigits[(ch >> 28) & 0xf];
6213            *p++ = hexdigits[(ch >> 24) & 0xf];
6214            *p++ = hexdigits[(ch >> 20) & 0xf];
6215            *p++ = hexdigits[(ch >> 16) & 0xf];
6216            *p++ = hexdigits[(ch >> 12) & 0xf];
6217            *p++ = hexdigits[(ch >> 8) & 0xf];
6218            *p++ = hexdigits[(ch >> 4) & 0xf];
6219            *p++ = hexdigits[ch & 15];
6220        }
6221        else
6222#else
6223            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6224            if (ch >= 0xD800 && ch < 0xDC00) {
6225                Py_UNICODE ch2;
6226                Py_UCS4 ucs;
6227
6228                ch2 = *s++;
6229                size--;
6230                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6231                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6232                    *p++ = '\\';
6233                    *p++ = 'U';
6234                    *p++ = hexdigits[(ucs >> 28) & 0xf];
6235                    *p++ = hexdigits[(ucs >> 24) & 0xf];
6236                    *p++ = hexdigits[(ucs >> 20) & 0xf];
6237                    *p++ = hexdigits[(ucs >> 16) & 0xf];
6238                    *p++ = hexdigits[(ucs >> 12) & 0xf];
6239                    *p++ = hexdigits[(ucs >> 8) & 0xf];
6240                    *p++ = hexdigits[(ucs >> 4) & 0xf];
6241                    *p++ = hexdigits[ucs & 0xf];
6242                    continue;
6243                }
6244                /* Fall through: isolated surrogates are copied as-is */
6245                s--;
6246                size++;
6247            }
6248#endif
6249        /* Map 16-bit characters to '\uxxxx' */
6250        if (ch >= 256) {
6251            *p++ = '\\';
6252            *p++ = 'u';
6253            *p++ = hexdigits[(ch >> 12) & 0xf];
6254            *p++ = hexdigits[(ch >> 8) & 0xf];
6255            *p++ = hexdigits[(ch >> 4) & 0xf];
6256            *p++ = hexdigits[ch & 15];
6257        }
6258        /* Copy everything else as-is */
6259        else
6260            *p++ = (char) ch;
6261    }
6262    size = p - q;
6263
6264    assert(size > 0);
6265    if (_PyBytes_Resize(&repr, size) < 0)
6266        return NULL;
6267    return repr;
6268}
6269
6270PyObject *
6271PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6272{
6273    PyObject *s;
6274    if (!PyUnicode_Check(unicode)) {
6275        PyErr_BadArgument();
6276        return NULL;
6277    }
6278    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6279                                         PyUnicode_GET_SIZE(unicode));
6280
6281    return s;
6282}
6283
6284/* --- Unicode Internal Codec ------------------------------------------- */
6285
6286PyObject *
6287_PyUnicode_DecodeUnicodeInternal(const char *s,
6288                                 Py_ssize_t size,
6289                                 const char *errors)
6290{
6291    const char *starts = s;
6292    Py_ssize_t startinpos;
6293    Py_ssize_t endinpos;
6294    Py_ssize_t outpos;
6295    PyUnicodeObject *v;
6296    Py_UNICODE *p;
6297    const char *end;
6298    const char *reason;
6299    PyObject *errorHandler = NULL;
6300    PyObject *exc = NULL;
6301
6302#ifdef Py_UNICODE_WIDE
6303    Py_UNICODE unimax = PyUnicode_GetMax();
6304#endif
6305
6306    /* XXX overflow detection missing */
6307    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6308    if (v == NULL)
6309        goto onError;
6310    /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6311       as string was created with the old API. */
6312    if (PyUnicode_GET_SIZE(v) == 0)
6313        return (PyObject *)v;
6314    p = PyUnicode_AS_UNICODE(v);
6315    end = s + size;
6316
6317    while (s < end) {
6318        memcpy(p, s, sizeof(Py_UNICODE));
6319        /* We have to sanity check the raw data, otherwise doom looms for
6320           some malformed UCS-4 data. */
6321        if (
6322#ifdef Py_UNICODE_WIDE
6323            *p > unimax || *p < 0 ||
6324#endif
6325            end-s < Py_UNICODE_SIZE
6326            )
6327        {
6328            startinpos = s - starts;
6329            if (end-s < Py_UNICODE_SIZE) {
6330                endinpos = end-starts;
6331                reason = "truncated input";
6332            }
6333            else {
6334                endinpos = s - starts + Py_UNICODE_SIZE;
6335                reason = "illegal code point (> 0x10FFFF)";
6336            }
6337            outpos = p - PyUnicode_AS_UNICODE(v);
6338            if (unicode_decode_call_errorhandler(
6339                    errors, &errorHandler,
6340                    "unicode_internal", reason,
6341                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6342                    &v, &outpos, &p)) {
6343                goto onError;
6344            }
6345        }
6346        else {
6347            p++;
6348            s += Py_UNICODE_SIZE;
6349        }
6350    }
6351
6352    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6353        goto onError;
6354    Py_XDECREF(errorHandler);
6355    Py_XDECREF(exc);
6356#ifndef DONT_MAKE_RESULT_READY
6357    if (_PyUnicode_READY_REPLACE(&v)) {
6358        Py_DECREF(v);
6359        return NULL;
6360    }
6361#endif
6362    assert(_PyUnicode_CheckConsistency(v, 1));
6363    return (PyObject *)v;
6364
6365  onError:
6366    Py_XDECREF(v);
6367    Py_XDECREF(errorHandler);
6368    Py_XDECREF(exc);
6369    return NULL;
6370}
6371
6372/* --- Latin-1 Codec ------------------------------------------------------ */
6373
6374PyObject *
6375PyUnicode_DecodeLatin1(const char *s,
6376                       Py_ssize_t size,
6377                       const char *errors)
6378{
6379    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6380    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6381}
6382
6383/* create or adjust a UnicodeEncodeError */
6384static void
6385make_encode_exception(PyObject **exceptionObject,
6386                      const char *encoding,
6387                      const Py_UNICODE *unicode, Py_ssize_t size,
6388                      Py_ssize_t startpos, Py_ssize_t endpos,
6389                      const char *reason)
6390{
6391    if (*exceptionObject == NULL) {
6392        *exceptionObject = PyUnicodeEncodeError_Create(
6393            encoding, unicode, size, startpos, endpos, reason);
6394    }
6395    else {
6396        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6397            goto onError;
6398        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6399            goto onError;
6400        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6401            goto onError;
6402        return;
6403      onError:
6404        Py_DECREF(*exceptionObject);
6405        *exceptionObject = NULL;
6406    }
6407}
6408
6409/* raises a UnicodeEncodeError */
6410static void
6411raise_encode_exception(PyObject **exceptionObject,
6412                       const char *encoding,
6413                       const Py_UNICODE *unicode, Py_ssize_t size,
6414                       Py_ssize_t startpos, Py_ssize_t endpos,
6415                       const char *reason)
6416{
6417    make_encode_exception(exceptionObject,
6418                          encoding, unicode, size, startpos, endpos, reason);
6419    if (*exceptionObject != NULL)
6420        PyCodec_StrictErrors(*exceptionObject);
6421}
6422
6423/* error handling callback helper:
6424   build arguments, call the callback and check the arguments,
6425   put the result into newpos and return the replacement string, which
6426   has to be freed by the caller */
6427static PyObject *
6428unicode_encode_call_errorhandler(const char *errors,
6429                                 PyObject **errorHandler,
6430                                 const char *encoding, const char *reason,
6431                                 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6432                                 Py_ssize_t startpos, Py_ssize_t endpos,
6433                                 Py_ssize_t *newpos)
6434{
6435    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6436
6437    PyObject *restuple;
6438    PyObject *resunicode;
6439
6440    if (*errorHandler == NULL) {
6441        *errorHandler = PyCodec_LookupError(errors);
6442        if (*errorHandler == NULL)
6443            return NULL;
6444    }
6445
6446    make_encode_exception(exceptionObject,
6447                          encoding, unicode, size, startpos, endpos, reason);
6448    if (*exceptionObject == NULL)
6449        return NULL;
6450
6451    restuple = PyObject_CallFunctionObjArgs(
6452        *errorHandler, *exceptionObject, NULL);
6453    if (restuple == NULL)
6454        return NULL;
6455    if (!PyTuple_Check(restuple)) {
6456        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6457        Py_DECREF(restuple);
6458        return NULL;
6459    }
6460    if (!PyArg_ParseTuple(restuple, argparse,
6461                          &resunicode, newpos)) {
6462        Py_DECREF(restuple);
6463        return NULL;
6464    }
6465    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6466        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6467        Py_DECREF(restuple);
6468        return NULL;
6469    }
6470    if (*newpos<0)
6471        *newpos = size+*newpos;
6472    if (*newpos<0 || *newpos>size) {
6473        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6474        Py_DECREF(restuple);
6475        return NULL;
6476    }
6477    Py_INCREF(resunicode);
6478    Py_DECREF(restuple);
6479    return resunicode;
6480}
6481
6482static PyObject *
6483unicode_encode_ucs1(const Py_UNICODE *p,
6484                    Py_ssize_t size,
6485                    const char *errors,
6486                    int limit)
6487{
6488    /* output object */
6489    PyObject *res;
6490    /* pointers to the beginning and end+1 of input */
6491    const Py_UNICODE *startp = p;
6492    const Py_UNICODE *endp = p + size;
6493    /* pointer to the beginning of the unencodable characters */
6494    /* const Py_UNICODE *badp = NULL; */
6495    /* pointer into the output */
6496    char *str;
6497    /* current output position */
6498    Py_ssize_t ressize;
6499    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6500    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6501    PyObject *errorHandler = NULL;
6502    PyObject *exc = NULL;
6503    /* the following variable is used for caching string comparisons
6504     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6505    int known_errorHandler = -1;
6506
6507    /* allocate enough for a simple encoding without
6508       replacements, if we need more, we'll resize */
6509    if (size == 0)
6510        return PyBytes_FromStringAndSize(NULL, 0);
6511    res = PyBytes_FromStringAndSize(NULL, size);
6512    if (res == NULL)
6513        return NULL;
6514    str = PyBytes_AS_STRING(res);
6515    ressize = size;
6516
6517    while (p<endp) {
6518        Py_UNICODE c = *p;
6519
6520        /* can we encode this? */
6521        if (c<limit) {
6522            /* no overflow check, because we know that the space is enough */
6523            *str++ = (char)c;
6524            ++p;
6525        }
6526        else {
6527            Py_ssize_t unicodepos = p-startp;
6528            Py_ssize_t requiredsize;
6529            PyObject *repunicode;
6530            Py_ssize_t repsize;
6531            Py_ssize_t newpos;
6532            Py_ssize_t respos;
6533            Py_UNICODE *uni2;
6534            /* startpos for collecting unencodable chars */
6535            const Py_UNICODE *collstart = p;
6536            const Py_UNICODE *collend = p;
6537            /* find all unecodable characters */
6538            while ((collend < endp) && ((*collend)>=limit))
6539                ++collend;
6540            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6541            if (known_errorHandler==-1) {
6542                if ((errors==NULL) || (!strcmp(errors, "strict")))
6543                    known_errorHandler = 1;
6544                else if (!strcmp(errors, "replace"))
6545                    known_errorHandler = 2;
6546                else if (!strcmp(errors, "ignore"))
6547                    known_errorHandler = 3;
6548                else if (!strcmp(errors, "xmlcharrefreplace"))
6549                    known_errorHandler = 4;
6550                else
6551                    known_errorHandler = 0;
6552            }
6553            switch (known_errorHandler) {
6554            case 1: /* strict */
6555                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6556                goto onError;
6557            case 2: /* replace */
6558                while (collstart++<collend)
6559                    *str++ = '?'; /* fall through */
6560            case 3: /* ignore */
6561                p = collend;
6562                break;
6563            case 4: /* xmlcharrefreplace */
6564                respos = str - PyBytes_AS_STRING(res);
6565                /* determine replacement size (temporarily (mis)uses p) */
6566                for (p = collstart, repsize = 0; p < collend; ++p) {
6567                    if (*p<10)
6568                        repsize += 2+1+1;
6569                    else if (*p<100)
6570                        repsize += 2+2+1;
6571                    else if (*p<1000)
6572                        repsize += 2+3+1;
6573                    else if (*p<10000)
6574                        repsize += 2+4+1;
6575#ifndef Py_UNICODE_WIDE
6576                    else
6577                        repsize += 2+5+1;
6578#else
6579                    else if (*p<100000)
6580                        repsize += 2+5+1;
6581                    else if (*p<1000000)
6582                        repsize += 2+6+1;
6583                    else
6584                        repsize += 2+7+1;
6585#endif
6586                }
6587                requiredsize = respos+repsize+(endp-collend);
6588                if (requiredsize > ressize) {
6589                    if (requiredsize<2*ressize)
6590                        requiredsize = 2*ressize;
6591                    if (_PyBytes_Resize(&res, requiredsize))
6592                        goto onError;
6593                    str = PyBytes_AS_STRING(res) + respos;
6594                    ressize = requiredsize;
6595                }
6596                /* generate replacement (temporarily (mis)uses p) */
6597                for (p = collstart; p < collend; ++p) {
6598                    str += sprintf(str, "&#%d;", (int)*p);
6599                }
6600                p = collend;
6601                break;
6602            default:
6603                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6604                                                              encoding, reason, startp, size, &exc,
6605                                                              collstart-startp, collend-startp, &newpos);
6606                if (repunicode == NULL)
6607                    goto onError;
6608                if (PyBytes_Check(repunicode)) {
6609                    /* Directly copy bytes result to output. */
6610                    repsize = PyBytes_Size(repunicode);
6611                    if (repsize > 1) {
6612                        /* Make room for all additional bytes. */
6613                        respos = str - PyBytes_AS_STRING(res);
6614                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6615                            Py_DECREF(repunicode);
6616                            goto onError;
6617                        }
6618                        str = PyBytes_AS_STRING(res) + respos;
6619                        ressize += repsize-1;
6620                    }
6621                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6622                    str += repsize;
6623                    p = startp + newpos;
6624                    Py_DECREF(repunicode);
6625                    break;
6626                }
6627                /* need more space? (at least enough for what we
6628                   have+the replacement+the rest of the string, so
6629                   we won't have to check space for encodable characters) */
6630                respos = str - PyBytes_AS_STRING(res);
6631                repsize = PyUnicode_GET_SIZE(repunicode);
6632                requiredsize = respos+repsize+(endp-collend);
6633                if (requiredsize > ressize) {
6634                    if (requiredsize<2*ressize)
6635                        requiredsize = 2*ressize;
6636                    if (_PyBytes_Resize(&res, requiredsize)) {
6637                        Py_DECREF(repunicode);
6638                        goto onError;
6639                    }
6640                    str = PyBytes_AS_STRING(res) + respos;
6641                    ressize = requiredsize;
6642                }
6643                /* check if there is anything unencodable in the replacement
6644                   and copy it to the output */
6645                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6646                    c = *uni2;
6647                    if (c >= limit) {
6648                        raise_encode_exception(&exc, encoding, startp, size,
6649                                               unicodepos, unicodepos+1, reason);
6650                        Py_DECREF(repunicode);
6651                        goto onError;
6652                    }
6653                    *str = (char)c;
6654                }
6655                p = startp + newpos;
6656                Py_DECREF(repunicode);
6657            }
6658        }
6659    }
6660    /* Resize if we allocated to much */
6661    size = str - PyBytes_AS_STRING(res);
6662    if (size < ressize) { /* If this falls res will be NULL */
6663        assert(size >= 0);
6664        if (_PyBytes_Resize(&res, size) < 0)
6665            goto onError;
6666    }
6667
6668    Py_XDECREF(errorHandler);
6669    Py_XDECREF(exc);
6670    return res;
6671
6672  onError:
6673    Py_XDECREF(res);
6674    Py_XDECREF(errorHandler);
6675    Py_XDECREF(exc);
6676    return NULL;
6677}
6678
6679PyObject *
6680PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6681                       Py_ssize_t size,
6682                       const char *errors)
6683{
6684    return unicode_encode_ucs1(p, size, errors, 256);
6685}
6686
6687PyObject *
6688_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6689{
6690    if (!PyUnicode_Check(unicode)) {
6691        PyErr_BadArgument();
6692        return NULL;
6693    }
6694    if (PyUnicode_READY(unicode) == -1)
6695        return NULL;
6696    /* Fast path: if it is a one-byte string, construct
6697       bytes object directly. */
6698    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6699        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6700                                         PyUnicode_GET_LENGTH(unicode));
6701    /* Non-Latin-1 characters present. Defer to above function to
6702       raise the exception. */
6703    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
6704                                  PyUnicode_GET_SIZE(unicode),
6705                                  errors);
6706}
6707
6708PyObject*
6709PyUnicode_AsLatin1String(PyObject *unicode)
6710{
6711    return _PyUnicode_AsLatin1String(unicode, NULL);
6712}
6713
6714/* --- 7-bit ASCII Codec -------------------------------------------------- */
6715
6716PyObject *
6717PyUnicode_DecodeASCII(const char *s,
6718                      Py_ssize_t size,
6719                      const char *errors)
6720{
6721    const char *starts = s;
6722    PyUnicodeObject *v;
6723    Py_UNICODE *u;
6724    Py_ssize_t startinpos;
6725    Py_ssize_t endinpos;
6726    Py_ssize_t outpos;
6727    const char *e;
6728    int has_error;
6729    const unsigned char *p = (const unsigned char *)s;
6730    const unsigned char *end = p + size;
6731    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
6732    PyObject *errorHandler = NULL;
6733    PyObject *exc = NULL;
6734
6735    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6736    if (size == 1 && (unsigned char)s[0] < 128)
6737        return get_latin1_char((unsigned char)s[0]);
6738
6739    has_error = 0;
6740    while (p < end && !has_error) {
6741        /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6742           an explanation. */
6743        if (!((size_t) p & LONG_PTR_MASK)) {
6744            /* Help register allocation */
6745            register const unsigned char *_p = p;
6746            while (_p < aligned_end) {
6747                unsigned long value = *(unsigned long *) _p;
6748                if (value & ASCII_CHAR_MASK) {
6749                    has_error = 1;
6750                    break;
6751                }
6752                _p += SIZEOF_LONG;
6753            }
6754            if (_p == end)
6755                break;
6756            if (has_error)
6757                break;
6758            p = _p;
6759        }
6760        if (*p & 0x80) {
6761            has_error = 1;
6762            break;
6763        }
6764        else {
6765            ++p;
6766        }
6767    }
6768    if (!has_error)
6769        return unicode_fromascii((const unsigned char *)s, size);
6770
6771    v = _PyUnicode_New(size);
6772    if (v == NULL)
6773        goto onError;
6774    if (size == 0)
6775        return (PyObject *)v;
6776    u = PyUnicode_AS_UNICODE(v);
6777    e = s + size;
6778    while (s < e) {
6779        register unsigned char c = (unsigned char)*s;
6780        if (c < 128) {
6781            *u++ = c;
6782            ++s;
6783        }
6784        else {
6785            startinpos = s-starts;
6786            endinpos = startinpos + 1;
6787            outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6788            if (unicode_decode_call_errorhandler(
6789                    errors, &errorHandler,
6790                    "ascii", "ordinal not in range(128)",
6791                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6792                    &v, &outpos, &u))
6793                goto onError;
6794        }
6795    }
6796    if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6797        if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
6798            goto onError;
6799    Py_XDECREF(errorHandler);
6800    Py_XDECREF(exc);
6801#ifndef DONT_MAKE_RESULT_READY
6802    if (_PyUnicode_READY_REPLACE(&v)) {
6803        Py_DECREF(v);
6804        return NULL;
6805    }
6806#endif
6807    assert(_PyUnicode_CheckConsistency(v, 1));
6808    return (PyObject *)v;
6809
6810  onError:
6811    Py_XDECREF(v);
6812    Py_XDECREF(errorHandler);
6813    Py_XDECREF(exc);
6814    return NULL;
6815}
6816
6817PyObject *
6818PyUnicode_EncodeASCII(const Py_UNICODE *p,
6819                      Py_ssize_t size,
6820                      const char *errors)
6821{
6822    return unicode_encode_ucs1(p, size, errors, 128);
6823}
6824
6825PyObject *
6826_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6827{
6828    if (!PyUnicode_Check(unicode)) {
6829        PyErr_BadArgument();
6830        return NULL;
6831    }
6832    if (PyUnicode_READY(unicode) == -1)
6833        return NULL;
6834    /* Fast path: if it is an ASCII-only string, construct bytes object
6835       directly. Else defer to above function to raise the exception. */
6836    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6837        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6838                                         PyUnicode_GET_LENGTH(unicode));
6839    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
6840                                 PyUnicode_GET_SIZE(unicode),
6841                                 errors);
6842}
6843
6844PyObject *
6845PyUnicode_AsASCIIString(PyObject *unicode)
6846{
6847    return _PyUnicode_AsASCIIString(unicode, NULL);
6848}
6849
6850#ifdef HAVE_MBCS
6851
6852/* --- MBCS codecs for Windows -------------------------------------------- */
6853
/* decode_mbcs() and encode_mbcs() take their length as an int.  When int
   is narrower than size_t, NEED_RETRY makes the public MBCS entry points
   feed inputs longer than INT_MAX to those helpers in INT_MAX-sized
   pieces. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif
6857
6858/* XXX This code is limited to "true" double-byte encodings, as
6859   a) it assumes an incomplete character consists of a single byte, and
6860   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
6861   encodings, see IsDBCSLeadByteEx documentation. */
6862
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.  Besides IsDBCSLeadByte() on the byte itself, the previous
   character (as located by CharPrev()) is inspected. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6874
6875/*
6876 * Decode MBCS string into unicode object. If 'final' is set, converts
6877 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6878 */
6879static int
6880decode_mbcs(PyUnicodeObject **v,
6881            const char *s, /* MBCS string */
6882            int size, /* sizeof MBCS string */
6883            int final,
6884            const char *errors)
6885{
6886    Py_UNICODE *p;
6887    Py_ssize_t n;
6888    DWORD usize;
6889    DWORD flags;
6890
6891    assert(size >= 0);
6892
6893    /* check and handle 'errors' arg */
6894    if (errors==NULL || strcmp(errors, "strict")==0)
6895        flags = MB_ERR_INVALID_CHARS;
6896    else if (strcmp(errors, "ignore")==0)
6897        flags = 0;
6898    else {
6899        PyErr_Format(PyExc_ValueError,
6900                     "mbcs encoding does not support errors='%s'",
6901                     errors);
6902        return -1;
6903    }
6904
6905    /* Skip trailing lead-byte unless 'final' is set */
6906    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
6907        --size;
6908
6909    /* First get the size of the result */
6910    if (size > 0) {
6911        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6912        if (usize==0)
6913            goto mbcs_decode_error;
6914    } else
6915        usize = 0;
6916
6917    if (*v == NULL) {
6918        /* Create unicode object */
6919        *v = _PyUnicode_New(usize);
6920        if (*v == NULL)
6921            return -1;
6922        n = 0;
6923    }
6924    else {
6925        /* Extend unicode object */
6926        n = PyUnicode_GET_SIZE(*v);
6927        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
6928            return -1;
6929    }
6930
6931    /* Do the conversion */
6932    if (usize > 0) {
6933        p = PyUnicode_AS_UNICODE(*v) + n;
6934        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6935            goto mbcs_decode_error;
6936        }
6937    }
6938    return size;
6939
6940mbcs_decode_error:
6941    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6942       we raise a UnicodeDecodeError - else it is a 'generic'
6943       windows error
6944     */
6945    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6946        /* Ideally, we should get reason from FormatMessage - this
6947           is the Windows 2000 English version of the message
6948        */
6949        PyObject *exc = NULL;
6950        const char *reason = "No mapping for the Unicode character exists "
6951                             "in the target multi-byte code page.";
6952        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6953        if (exc != NULL) {
6954            PyCodec_StrictErrors(exc);
6955            Py_DECREF(exc);
6956        }
6957    } else {
6958        PyErr_SetFromWindowsErrWithFilename(0, NULL);
6959    }
6960    return -1;
6961}
6962
/* Decode `size` bytes of MBCS data at `s` into a new unicode object.

   errors   -- forwarded to decode_mbcs()
   consumed -- if non-NULL, receives the total number of bytes reported
               as consumed by decode_mbcs(), and decode_mbcs() is called
               with final=0; if NULL, it is called with final=1.

   Returns the new object, or NULL if decode_mbcs() or making the result
   ready fails. */
PyObject *
PyUnicode_DecodeMBCSStateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    PyUnicodeObject *v = NULL;
    int done;

    if (consumed)
        *consumed = 0;

#ifdef NEED_RETRY
  retry:
    /* decode_mbcs() takes an int length: hand over at most INT_MAX bytes
       per call, never as the final piece while more input remains. */
    if (size > INT_MAX)
        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
    else
#endif
        done = decode_mbcs(&v, s, (int)size, !consumed, errors);

    if (done < 0) {
        /* v may hold a partial result from earlier pieces */
        Py_XDECREF(v);
        return NULL;
    }

    if (consumed)
        *consumed += done;

#ifdef NEED_RETRY
    /* More than one piece: skip what was consumed and decode the rest,
       appending to v. */
    if (size > INT_MAX) {
        s += done;
        size -= done;
        goto retry;
    }
#endif
#ifndef DONT_MAKE_RESULT_READY
    if (_PyUnicode_READY_REPLACE(&v)) {
        Py_DECREF(v);
        return NULL;
    }
#endif
    assert(_PyUnicode_CheckConsistency(v, 1));
    return (PyObject *)v;
}
7007
7008PyObject *
7009PyUnicode_DecodeMBCS(const char *s,
7010                     Py_ssize_t size,
7011                     const char *errors)
7012{
7013    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7014}
7015
7016/*
7017 * Convert unicode into string object (MBCS).
7018 * Returns 0 if succeed, -1 otherwise.
7019 */
7020static int
7021encode_mbcs(PyObject **repr,
7022            const Py_UNICODE *p, /* unicode */
7023            int size, /* size of unicode */
7024            const char* errors)
7025{
7026    BOOL usedDefaultChar = FALSE;
7027    BOOL *pusedDefaultChar;
7028    int mbcssize;
7029    Py_ssize_t n;
7030    PyObject *exc = NULL;
7031    DWORD flags;
7032
7033    assert(size >= 0);
7034
7035    /* check and handle 'errors' arg */
7036    if (errors==NULL || strcmp(errors, "strict")==0) {
7037        flags = WC_NO_BEST_FIT_CHARS;
7038        pusedDefaultChar = &usedDefaultChar;
7039    } else if (strcmp(errors, "replace")==0) {
7040        flags = 0;
7041        pusedDefaultChar = NULL;
7042    } else {
7043         PyErr_Format(PyExc_ValueError,
7044                      "mbcs encoding does not support errors='%s'",
7045                      errors);
7046         return -1;
7047    }
7048
7049    /* First get the size of the result */
7050    if (size > 0) {
7051        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7052                                       NULL, pusedDefaultChar);
7053        if (mbcssize == 0) {
7054            PyErr_SetFromWindowsErrWithFilename(0, NULL);
7055            return -1;
7056        }
7057        /* If we used a default char, then we failed! */
7058        if (pusedDefaultChar && *pusedDefaultChar)
7059            goto mbcs_encode_error;
7060    } else {
7061        mbcssize = 0;
7062    }
7063
7064    if (*repr == NULL) {
7065        /* Create string object */
7066        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7067        if (*repr == NULL)
7068            return -1;
7069        n = 0;
7070    }
7071    else {
7072        /* Extend string object */
7073        n = PyBytes_Size(*repr);
7074        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7075            return -1;
7076    }
7077
7078    /* Do the conversion */
7079    if (size > 0) {
7080        char *s = PyBytes_AS_STRING(*repr) + n;
7081        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7082                                     NULL, pusedDefaultChar)) {
7083            PyErr_SetFromWindowsErrWithFilename(0, NULL);
7084            return -1;
7085        }
7086        if (pusedDefaultChar && *pusedDefaultChar)
7087            goto mbcs_encode_error;
7088    }
7089    return 0;
7090
7091mbcs_encode_error:
7092    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7093    Py_XDECREF(exc);
7094    return -1;
7095}
7096
/* Encode `size` Py_UNICODE code units at `p` into a new MBCS bytes object.

   errors -- forwarded to encode_mbcs()

   Returns the new bytes object, or NULL if encode_mbcs() fails. */
PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                     Py_ssize_t size,
                     const char *errors)
{
    PyObject *repr = NULL;
    int ret;

#ifdef NEED_RETRY
  retry:
    /* encode_mbcs() takes an int length: hand over at most INT_MAX code
       units per call. */
    if (size > INT_MAX)
        ret = encode_mbcs(&repr, p, INT_MAX, errors);
    else
#endif
        ret = encode_mbcs(&repr, p, (int)size, errors);

    if (ret < 0) {
        /* repr may hold a partial result from earlier pieces */
        Py_XDECREF(repr);
        return NULL;
    }

#ifdef NEED_RETRY
    /* More than one piece: advance past the piece just encoded and encode
       the rest, appending to repr. */
    if (size > INT_MAX) {
        p += INT_MAX;
        size -= INT_MAX;
        goto retry;
    }
#endif

    return repr;
}
7128
7129PyObject *
7130PyUnicode_AsMBCSString(PyObject *unicode)
7131{
7132    if (!PyUnicode_Check(unicode)) {
7133        PyErr_BadArgument();
7134        return NULL;
7135    }
7136    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
7137                                PyUnicode_GET_SIZE(unicode),
7138                                NULL);
7139}
7140
7141#undef NEED_RETRY
7142
7143#endif /* HAVE_MBCS */
7144
7145/* --- Character Mapping Codec -------------------------------------------- */
7146
7147PyObject *
7148PyUnicode_DecodeCharmap(const char *s,
7149                        Py_ssize_t size,
7150                        PyObject *mapping,
7151                        const char *errors)
7152{
7153    const char *starts = s;
7154    Py_ssize_t startinpos;
7155    Py_ssize_t endinpos;
7156    Py_ssize_t outpos;
7157    const char *e;
7158    PyUnicodeObject *v;
7159    Py_UNICODE *p;
7160    Py_ssize_t extrachars = 0;
7161    PyObject *errorHandler = NULL;
7162    PyObject *exc = NULL;
7163    Py_UNICODE *mapstring = NULL;
7164    Py_ssize_t maplen = 0;
7165
7166    /* Default to Latin-1 */
7167    if (mapping == NULL)
7168        return PyUnicode_DecodeLatin1(s, size, errors);
7169
7170    v = _PyUnicode_New(size);
7171    if (v == NULL)
7172        goto onError;
7173    if (size == 0)
7174        return (PyObject *)v;
7175    p = PyUnicode_AS_UNICODE(v);
7176    e = s + size;
7177    if (PyUnicode_CheckExact(mapping)) {
7178        mapstring = PyUnicode_AS_UNICODE(mapping);
7179        maplen = PyUnicode_GET_SIZE(mapping);
7180        while (s < e) {
7181            unsigned char ch = *s;
7182            Py_UNICODE x = 0xfffe; /* illegal value */
7183
7184            if (ch < maplen)
7185                x = mapstring[ch];
7186
7187            if (x == 0xfffe) {
7188                /* undefined mapping */
7189                outpos = p-PyUnicode_AS_UNICODE(v);
7190                startinpos = s-starts;
7191                endinpos = startinpos+1;
7192                if (unicode_decode_call_errorhandler(
7193                        errors, &errorHandler,
7194                        "charmap", "character maps to <undefined>",
7195                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7196                        &v, &outpos, &p)) {
7197                    goto onError;
7198                }
7199                continue;
7200            }
7201            *p++ = x;
7202            ++s;
7203        }
7204    }
7205    else {
7206        while (s < e) {
7207            unsigned char ch = *s;
7208            PyObject *w, *x;
7209
7210            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7211            w = PyLong_FromLong((long)ch);
7212            if (w == NULL)
7213                goto onError;
7214            x = PyObject_GetItem(mapping, w);
7215            Py_DECREF(w);
7216            if (x == NULL) {
7217                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7218                    /* No mapping found means: mapping is undefined. */
7219                    PyErr_Clear();
7220                    x = Py_None;
7221                    Py_INCREF(x);
7222                } else
7223                    goto onError;
7224            }
7225
7226            /* Apply mapping */
7227            if (PyLong_Check(x)) {
7228                long value = PyLong_AS_LONG(x);
7229                if (value < 0 || value > 65535) {
7230                    PyErr_SetString(PyExc_TypeError,
7231                                    "character mapping must be in range(65536)");
7232                    Py_DECREF(x);
7233                    goto onError;
7234                }
7235                *p++ = (Py_UNICODE)value;
7236            }
7237            else if (x == Py_None) {
7238                /* undefined mapping */
7239                outpos = p-PyUnicode_AS_UNICODE(v);
7240                startinpos = s-starts;
7241                endinpos = startinpos+1;
7242                if (unicode_decode_call_errorhandler(
7243                        errors, &errorHandler,
7244                        "charmap", "character maps to <undefined>",
7245                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7246                        &v, &outpos, &p)) {
7247                    Py_DECREF(x);
7248                    goto onError;
7249                }
7250                Py_DECREF(x);
7251                continue;
7252            }
7253            else if (PyUnicode_Check(x)) {
7254                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
7255
7256                if (targetsize == 1)
7257                    /* 1-1 mapping */
7258                    *p++ = *PyUnicode_AS_UNICODE(x);
7259
7260                else if (targetsize > 1) {
7261                    /* 1-n mapping */
7262                    if (targetsize > extrachars) {
7263                        /* resize first */
7264                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7265                        Py_ssize_t needed = (targetsize - extrachars) + \
7266                            (targetsize << 2);
7267                        extrachars += needed;
7268                        /* XXX overflow detection missing */
7269                        if (PyUnicode_Resize((PyObject**)&v,
7270                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
7271                            Py_DECREF(x);
7272                            goto onError;
7273                        }
7274                        p = PyUnicode_AS_UNICODE(v) + oldpos;
7275                    }
7276                    Py_UNICODE_COPY(p,
7277                                    PyUnicode_AS_UNICODE(x),
7278                                    targetsize);
7279                    p += targetsize;
7280                    extrachars -= targetsize;
7281                }
7282                /* 1-0 mapping: skip the character */
7283            }
7284            else {
7285                /* wrong return value */
7286                PyErr_SetString(PyExc_TypeError,
7287                                "character mapping must return integer, None or str");
7288                Py_DECREF(x);
7289                goto onError;
7290            }
7291            Py_DECREF(x);
7292            ++s;
7293        }
7294    }
7295    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
7296        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
7297            goto onError;
7298    Py_XDECREF(errorHandler);
7299    Py_XDECREF(exc);
7300#ifndef DONT_MAKE_RESULT_READY
7301    if (_PyUnicode_READY_REPLACE(&v)) {
7302        Py_DECREF(v);
7303        return NULL;
7304    }
7305#endif
7306    assert(_PyUnicode_CheckConsistency(v, 1));
7307    return (PyObject *)v;
7308
7309  onError:
7310    Py_XDECREF(errorHandler);
7311    Py_XDECREF(exc);
7312    Py_XDECREF(v);
7313    return NULL;
7314}
7315
7316/* Charmap encoding: the lookup table */
7317
7318struct encoding_map {
7319    PyObject_HEAD
7320    unsigned char level1[32];
7321    int count2, count3;
7322    unsigned char level23[1];
7323};
7324
7325static PyObject*
7326encoding_map_size(PyObject *obj, PyObject* args)
7327{
7328    struct encoding_map *map = (struct encoding_map*)obj;
7329    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7330                           128*map->count3);
7331}
7332
7333static PyMethodDef encoding_map_methods[] = {
7334    {"size", encoding_map_size, METH_NOARGS,
7335     PyDoc_STR("Return the size (in bytes) of this object") },
7336    { 0 }
7337};
7338
7339static void
7340encoding_map_dealloc(PyObject* o)
7341{
7342    PyObject_FREE(o);
7343}
7344
7345static PyTypeObject EncodingMapType = {
7346    PyVarObject_HEAD_INIT(NULL, 0)
7347    "EncodingMap",          /*tp_name*/
7348    sizeof(struct encoding_map),   /*tp_basicsize*/
7349    0,                      /*tp_itemsize*/
7350    /* methods */
7351    encoding_map_dealloc,   /*tp_dealloc*/
7352    0,                      /*tp_print*/
7353    0,                      /*tp_getattr*/
7354    0,                      /*tp_setattr*/
7355    0,                      /*tp_reserved*/
7356    0,                      /*tp_repr*/
7357    0,                      /*tp_as_number*/
7358    0,                      /*tp_as_sequence*/
7359    0,                      /*tp_as_mapping*/
7360    0,                      /*tp_hash*/
7361    0,                      /*tp_call*/
7362    0,                      /*tp_str*/
7363    0,                      /*tp_getattro*/
7364    0,                      /*tp_setattro*/
7365    0,                      /*tp_as_buffer*/
7366    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7367    0,                      /*tp_doc*/
7368    0,                      /*tp_traverse*/
7369    0,                      /*tp_clear*/
7370    0,                      /*tp_richcompare*/
7371    0,                      /*tp_weaklistoffset*/
7372    0,                      /*tp_iter*/
7373    0,                      /*tp_iternext*/
7374    encoding_map_methods,   /*tp_methods*/
7375    0,                      /*tp_members*/
7376    0,                      /*tp_getset*/
7377    0,                      /*tp_base*/
7378    0,                      /*tp_dict*/
7379    0,                      /*tp_descr_get*/
7380    0,                      /*tp_descr_set*/
7381    0,                      /*tp_dictoffset*/
7382    0,                      /*tp_init*/
7383    0,                      /*tp_alloc*/
7384    0,                      /*tp_new*/
7385    0,                      /*tp_free*/
7386    0,                      /*tp_is_gc*/
7387};
7388
7389PyObject*
7390PyUnicode_BuildEncodingMap(PyObject* string)
7391{
7392    PyObject *result;
7393    struct encoding_map *mresult;
7394    int i;
7395    int need_dict = 0;
7396    unsigned char level1[32];
7397    unsigned char level2[512];
7398    unsigned char *mlevel1, *mlevel2, *mlevel3;
7399    int count2 = 0, count3 = 0;
7400    int kind;
7401    void *data;
7402    Py_UCS4 ch;
7403
7404    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7405        PyErr_BadArgument();
7406        return NULL;
7407    }
7408    kind = PyUnicode_KIND(string);
7409    data = PyUnicode_DATA(string);
7410    memset(level1, 0xFF, sizeof level1);
7411    memset(level2, 0xFF, sizeof level2);
7412
7413    /* If there isn't a one-to-one mapping of NULL to \0,
7414       or if there are non-BMP characters, we need to use
7415       a mapping dictionary. */
7416    if (PyUnicode_READ(kind, data, 0) != 0)
7417        need_dict = 1;
7418    for (i = 1; i < 256; i++) {
7419        int l1, l2;
7420        ch = PyUnicode_READ(kind, data, i);
7421        if (ch == 0 || ch > 0xFFFF) {
7422            need_dict = 1;
7423            break;
7424        }
7425        if (ch == 0xFFFE)
7426            /* unmapped character */
7427            continue;
7428        l1 = ch >> 11;
7429        l2 = ch >> 7;
7430        if (level1[l1] == 0xFF)
7431            level1[l1] = count2++;
7432        if (level2[l2] == 0xFF)
7433            level2[l2] = count3++;
7434    }
7435
7436    if (count2 >= 0xFF || count3 >= 0xFF)
7437        need_dict = 1;
7438
7439    if (need_dict) {
7440        PyObject *result = PyDict_New();
7441        PyObject *key, *value;
7442        if (!result)
7443            return NULL;
7444        for (i = 0; i < 256; i++) {
7445            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7446            value = PyLong_FromLong(i);
7447            if (!key || !value)
7448                goto failed1;
7449            if (PyDict_SetItem(result, key, value) == -1)
7450                goto failed1;
7451            Py_DECREF(key);
7452            Py_DECREF(value);
7453        }
7454        return result;
7455      failed1:
7456        Py_XDECREF(key);
7457        Py_XDECREF(value);
7458        Py_DECREF(result);
7459        return NULL;
7460    }
7461
7462    /* Create a three-level trie */
7463    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7464                             16*count2 + 128*count3 - 1);
7465    if (!result)
7466        return PyErr_NoMemory();
7467    PyObject_Init(result, &EncodingMapType);
7468    mresult = (struct encoding_map*)result;
7469    mresult->count2 = count2;
7470    mresult->count3 = count3;
7471    mlevel1 = mresult->level1;
7472    mlevel2 = mresult->level23;
7473    mlevel3 = mresult->level23 + 16*count2;
7474    memcpy(mlevel1, level1, 32);
7475    memset(mlevel2, 0xFF, 16*count2);
7476    memset(mlevel3, 0, 128*count3);
7477    count3 = 0;
7478    for (i = 1; i < 256; i++) {
7479        int o1, o2, o3, i2, i3;
7480        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7481            /* unmapped character */
7482            continue;
7483        o1 = PyUnicode_READ(kind, data, i)>>11;
7484        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7485        i2 = 16*mlevel1[o1] + o2;
7486        if (mlevel2[i2] == 0xFF)
7487            mlevel2[i2] = count3++;
7488        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7489        i3 = 128*mlevel2[i2] + o3;
7490        mlevel3[i3] = i;
7491    }
7492    return result;
7493}
7494
7495static int
7496encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7497{
7498    struct encoding_map *map = (struct encoding_map*)mapping;
7499    int l1 = c>>11;
7500    int l2 = (c>>7) & 0xF;
7501    int l3 = c & 0x7F;
7502    int i;
7503
7504#ifdef Py_UNICODE_WIDE
7505    if (c > 0xFFFF) {
7506        return -1;
7507    }
7508#endif
7509    if (c == 0)
7510        return 0;
7511    /* level 1*/
7512    i = map->level1[l1];
7513    if (i == 0xFF) {
7514        return -1;
7515    }
7516    /* level 2*/
7517    i = map->level23[16*i+l2];
7518    if (i == 0xFF) {
7519        return -1;
7520    }
7521    /* level 3 */
7522    i = map->level23[16*map->count2 + 128*i + l3];
7523    if (i == 0) {
7524        return -1;
7525    }
7526    return i;
7527}
7528
7529/* Lookup the character ch in the mapping. If the character
7530   can't be found, Py_None is returned (or NULL, if another
7531   error occurred). */
7532static PyObject *
7533charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
7534{
7535    PyObject *w = PyLong_FromLong((long)c);
7536    PyObject *x;
7537
7538    if (w == NULL)
7539        return NULL;
7540    x = PyObject_GetItem(mapping, w);
7541    Py_DECREF(w);
7542    if (x == NULL) {
7543        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7544            /* No mapping found means: mapping is undefined. */
7545            PyErr_Clear();
7546            x = Py_None;
7547            Py_INCREF(x);
7548            return x;
7549        } else
7550            return NULL;
7551    }
7552    else if (x == Py_None)
7553        return x;
7554    else if (PyLong_Check(x)) {
7555        long value = PyLong_AS_LONG(x);
7556        if (value < 0 || value > 255) {
7557            PyErr_SetString(PyExc_TypeError,
7558                            "character mapping must be in range(256)");
7559            Py_DECREF(x);
7560            return NULL;
7561        }
7562        return x;
7563    }
7564    else if (PyBytes_Check(x))
7565        return x;
7566    else {
7567        /* wrong return value */
7568        PyErr_Format(PyExc_TypeError,
7569                     "character mapping must return integer, bytes or None, not %.400s",
7570                     x->ob_type->tp_name);
7571        Py_DECREF(x);
7572        return NULL;
7573    }
7574}
7575
7576static int
7577charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7578{
7579    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7580    /* exponentially overallocate to minimize reallocations */
7581    if (requiredsize < 2*outsize)
7582        requiredsize = 2*outsize;
7583    if (_PyBytes_Resize(outobj, requiredsize))
7584        return -1;
7585    return 0;
7586}
7587
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
7591/* lookup the character, put the result in the output string and adjust
7592   various state variables. Resize the output bytes object if not enough
7593   space is available. Return a new reference to the object that
7594   was put in the output buffer, or Py_None, if the mapping was undefined
7595   (in which case no character was written) or NULL, if a
7596   reallocation error occurred. The caller must decref the result */
7597static charmapencode_result
7598charmapencode_output(Py_UNICODE c, PyObject *mapping,
7599                     PyObject **outobj, Py_ssize_t *outpos)
7600{
7601    PyObject *rep;
7602    char *outstart;
7603    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7604
7605    if (Py_TYPE(mapping) == &EncodingMapType) {
7606        int res = encoding_map_lookup(c, mapping);
7607        Py_ssize_t requiredsize = *outpos+1;
7608        if (res == -1)
7609            return enc_FAILED;
7610        if (outsize<requiredsize)
7611            if (charmapencode_resize(outobj, outpos, requiredsize))
7612                return enc_EXCEPTION;
7613        outstart = PyBytes_AS_STRING(*outobj);
7614        outstart[(*outpos)++] = (char)res;
7615        return enc_SUCCESS;
7616    }
7617
7618    rep = charmapencode_lookup(c, mapping);
7619    if (rep==NULL)
7620        return enc_EXCEPTION;
7621    else if (rep==Py_None) {
7622        Py_DECREF(rep);
7623        return enc_FAILED;
7624    } else {
7625        if (PyLong_Check(rep)) {
7626            Py_ssize_t requiredsize = *outpos+1;
7627            if (outsize<requiredsize)
7628                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7629                    Py_DECREF(rep);
7630                    return enc_EXCEPTION;
7631                }
7632            outstart = PyBytes_AS_STRING(*outobj);
7633            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7634        }
7635        else {
7636            const char *repchars = PyBytes_AS_STRING(rep);
7637            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7638            Py_ssize_t requiredsize = *outpos+repsize;
7639            if (outsize<requiredsize)
7640                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7641                    Py_DECREF(rep);
7642                    return enc_EXCEPTION;
7643                }
7644            outstart = PyBytes_AS_STRING(*outobj);
7645            memcpy(outstart + *outpos, repchars, repsize);
7646            *outpos += repsize;
7647        }
7648    }
7649    Py_DECREF(rep);
7650    return enc_SUCCESS;
7651}
7652
7653/* handle an error in PyUnicode_EncodeCharmap
7654   Return 0 on success, -1 on error */
7655static int
7656charmap_encoding_error(
7657    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
7658    PyObject **exceptionObject,
7659    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7660    PyObject **res, Py_ssize_t *respos)
7661{
7662    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7663    Py_ssize_t repsize;
7664    Py_ssize_t newpos;
7665    Py_UNICODE *uni2;
7666    /* startpos for collecting unencodable chars */
7667    Py_ssize_t collstartpos = *inpos;
7668    Py_ssize_t collendpos = *inpos+1;
7669    Py_ssize_t collpos;
7670    char *encoding = "charmap";
7671    char *reason = "character maps to <undefined>";
7672    charmapencode_result x;
7673
7674    /* find all unencodable characters */
7675    while (collendpos < size) {
7676        PyObject *rep;
7677        if (Py_TYPE(mapping) == &EncodingMapType) {
7678            int res = encoding_map_lookup(p[collendpos], mapping);
7679            if (res != -1)
7680                break;
7681            ++collendpos;
7682            continue;
7683        }
7684
7685        rep = charmapencode_lookup(p[collendpos], mapping);
7686        if (rep==NULL)
7687            return -1;
7688        else if (rep!=Py_None) {
7689            Py_DECREF(rep);
7690            break;
7691        }
7692        Py_DECREF(rep);
7693        ++collendpos;
7694    }
7695    /* cache callback name lookup
7696     * (if not done yet, i.e. it's the first error) */
7697    if (*known_errorHandler==-1) {
7698        if ((errors==NULL) || (!strcmp(errors, "strict")))
7699            *known_errorHandler = 1;
7700        else if (!strcmp(errors, "replace"))
7701            *known_errorHandler = 2;
7702        else if (!strcmp(errors, "ignore"))
7703            *known_errorHandler = 3;
7704        else if (!strcmp(errors, "xmlcharrefreplace"))
7705            *known_errorHandler = 4;
7706        else
7707            *known_errorHandler = 0;
7708    }
7709    switch (*known_errorHandler) {
7710    case 1: /* strict */
7711        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7712        return -1;
7713    case 2: /* replace */
7714        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
7715            x = charmapencode_output('?', mapping, res, respos);
7716            if (x==enc_EXCEPTION) {
7717                return -1;
7718            }
7719            else if (x==enc_FAILED) {
7720                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7721                return -1;
7722            }
7723        }
7724        /* fall through */
7725    case 3: /* ignore */
7726        *inpos = collendpos;
7727        break;
7728    case 4: /* xmlcharrefreplace */
7729        /* generate replacement (temporarily (mis)uses p) */
7730        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
7731            char buffer[2+29+1+1];
7732            char *cp;
7733            sprintf(buffer, "&#%d;", (int)p[collpos]);
7734            for (cp = buffer; *cp; ++cp) {
7735                x = charmapencode_output(*cp, mapping, res, respos);
7736                if (x==enc_EXCEPTION)
7737                    return -1;
7738                else if (x==enc_FAILED) {
7739                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7740                    return -1;
7741                }
7742            }
7743        }
7744        *inpos = collendpos;
7745        break;
7746    default:
7747        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
7748                                                      encoding, reason, p, size, exceptionObject,
7749                                                      collstartpos, collendpos, &newpos);
7750        if (repunicode == NULL)
7751            return -1;
7752        if (PyBytes_Check(repunicode)) {
7753            /* Directly copy bytes result to output. */
7754            Py_ssize_t outsize = PyBytes_Size(*res);
7755            Py_ssize_t requiredsize;
7756            repsize = PyBytes_Size(repunicode);
7757            requiredsize = *respos + repsize;
7758            if (requiredsize > outsize)
7759                /* Make room for all additional bytes. */
7760                if (charmapencode_resize(res, respos, requiredsize)) {
7761                    Py_DECREF(repunicode);
7762                    return -1;
7763                }
7764            memcpy(PyBytes_AsString(*res) + *respos,
7765                   PyBytes_AsString(repunicode),  repsize);
7766            *respos += repsize;
7767            *inpos = newpos;
7768            Py_DECREF(repunicode);
7769            break;
7770        }
7771        /* generate replacement  */
7772        repsize = PyUnicode_GET_SIZE(repunicode);
7773        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7774            x = charmapencode_output(*uni2, mapping, res, respos);
7775            if (x==enc_EXCEPTION) {
7776                return -1;
7777            }
7778            else if (x==enc_FAILED) {
7779                Py_DECREF(repunicode);
7780                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7781                return -1;
7782            }
7783        }
7784        *inpos = newpos;
7785        Py_DECREF(repunicode);
7786    }
7787    return 0;
7788}
7789
7790PyObject *
7791PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7792                        Py_ssize_t size,
7793                        PyObject *mapping,
7794                        const char *errors)
7795{
7796    /* output object */
7797    PyObject *res = NULL;
7798    /* current input position */
7799    Py_ssize_t inpos = 0;
7800    /* current output position */
7801    Py_ssize_t respos = 0;
7802    PyObject *errorHandler = NULL;
7803    PyObject *exc = NULL;
7804    /* the following variable is used for caching string comparisons
7805     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7806     * 3=ignore, 4=xmlcharrefreplace */
7807    int known_errorHandler = -1;
7808
7809    /* Default to Latin-1 */
7810    if (mapping == NULL)
7811        return PyUnicode_EncodeLatin1(p, size, errors);
7812
7813    /* allocate enough for a simple encoding without
7814       replacements, if we need more, we'll resize */
7815    res = PyBytes_FromStringAndSize(NULL, size);
7816    if (res == NULL)
7817        goto onError;
7818    if (size == 0)
7819        return res;
7820
7821    while (inpos<size) {
7822        /* try to encode it */
7823        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7824        if (x==enc_EXCEPTION) /* error */
7825            goto onError;
7826        if (x==enc_FAILED) { /* unencodable character */
7827            if (charmap_encoding_error(p, size, &inpos, mapping,
7828                                       &exc,
7829                                       &known_errorHandler, &errorHandler, errors,
7830                                       &res, &respos)) {
7831                goto onError;
7832            }
7833        }
7834        else
7835            /* done with this character => adjust input position */
7836            ++inpos;
7837    }
7838
7839    /* Resize if we allocated to much */
7840    if (respos<PyBytes_GET_SIZE(res))
7841        if (_PyBytes_Resize(&res, respos) < 0)
7842            goto onError;
7843
7844    Py_XDECREF(exc);
7845    Py_XDECREF(errorHandler);
7846    return res;
7847
7848  onError:
7849    Py_XDECREF(res);
7850    Py_XDECREF(exc);
7851    Py_XDECREF(errorHandler);
7852    return NULL;
7853}
7854
7855PyObject *
7856PyUnicode_AsCharmapString(PyObject *unicode,
7857                          PyObject *mapping)
7858{
7859    if (!PyUnicode_Check(unicode) || mapping == NULL) {
7860        PyErr_BadArgument();
7861        return NULL;
7862    }
7863    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
7864                                   PyUnicode_GET_SIZE(unicode),
7865                                   mapping,
7866                                   NULL);
7867}
7868
7869/* create or adjust a UnicodeTranslateError */
7870static void
7871make_translate_exception(PyObject **exceptionObject,
7872                         PyObject *unicode,
7873                         Py_ssize_t startpos, Py_ssize_t endpos,
7874                         const char *reason)
7875{
7876    if (*exceptionObject == NULL) {
7877        *exceptionObject = _PyUnicodeTranslateError_Create(
7878            unicode, startpos, endpos, reason);
7879    }
7880    else {
7881        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7882            goto onError;
7883        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7884            goto onError;
7885        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7886            goto onError;
7887        return;
7888      onError:
7889        Py_DECREF(*exceptionObject);
7890        *exceptionObject = NULL;
7891    }
7892}
7893
7894/* raises a UnicodeTranslateError */
7895static void
7896raise_translate_exception(PyObject **exceptionObject,
7897                          PyObject *unicode,
7898                          Py_ssize_t startpos, Py_ssize_t endpos,
7899                          const char *reason)
7900{
7901    make_translate_exception(exceptionObject,
7902                             unicode, startpos, endpos, reason);
7903    if (*exceptionObject != NULL)
7904        PyCodec_StrictErrors(*exceptionObject);
7905}
7906
7907/* error handling callback helper:
7908   build arguments, call the callback and check the arguments,
7909   put the result into newpos and return the replacement string, which
7910   has to be freed by the caller */
7911static PyObject *
7912unicode_translate_call_errorhandler(const char *errors,
7913                                    PyObject **errorHandler,
7914                                    const char *reason,
7915                                    PyObject *unicode, PyObject **exceptionObject,
7916                                    Py_ssize_t startpos, Py_ssize_t endpos,
7917                                    Py_ssize_t *newpos)
7918{
7919    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
7920
7921    Py_ssize_t i_newpos;
7922    PyObject *restuple;
7923    PyObject *resunicode;
7924
7925    if (*errorHandler == NULL) {
7926        *errorHandler = PyCodec_LookupError(errors);
7927        if (*errorHandler == NULL)
7928            return NULL;
7929    }
7930
7931    make_translate_exception(exceptionObject,
7932                             unicode, startpos, endpos, reason);
7933    if (*exceptionObject == NULL)
7934        return NULL;
7935
7936    restuple = PyObject_CallFunctionObjArgs(
7937        *errorHandler, *exceptionObject, NULL);
7938    if (restuple == NULL)
7939        return NULL;
7940    if (!PyTuple_Check(restuple)) {
7941        PyErr_SetString(PyExc_TypeError, &argparse[4]);
7942        Py_DECREF(restuple);
7943        return NULL;
7944    }
7945    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
7946                          &resunicode, &i_newpos)) {
7947        Py_DECREF(restuple);
7948        return NULL;
7949    }
7950    if (i_newpos<0)
7951        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
7952    else
7953        *newpos = i_newpos;
7954    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
7955        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7956        Py_DECREF(restuple);
7957        return NULL;
7958    }
7959    Py_INCREF(resunicode);
7960    Py_DECREF(restuple);
7961    return resunicode;
7962}
7963
7964/* Lookup the character ch in the mapping and put the result in result,
7965   which must be decrefed by the caller.
7966   Return 0 on success, -1 on error */
7967static int
7968charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
7969{
7970    PyObject *w = PyLong_FromLong((long)c);
7971    PyObject *x;
7972
7973    if (w == NULL)
7974        return -1;
7975    x = PyObject_GetItem(mapping, w);
7976    Py_DECREF(w);
7977    if (x == NULL) {
7978        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7979            /* No mapping found means: use 1:1 mapping. */
7980            PyErr_Clear();
7981            *result = NULL;
7982            return 0;
7983        } else
7984            return -1;
7985    }
7986    else if (x == Py_None) {
7987        *result = x;
7988        return 0;
7989    }
7990    else if (PyLong_Check(x)) {
7991        long value = PyLong_AS_LONG(x);
7992        long max = PyUnicode_GetMax();
7993        if (value < 0 || value > max) {
7994            PyErr_Format(PyExc_TypeError,
7995                         "character mapping must be in range(0x%x)", max+1);
7996            Py_DECREF(x);
7997            return -1;
7998        }
7999        *result = x;
8000        return 0;
8001    }
8002    else if (PyUnicode_Check(x)) {
8003        *result = x;
8004        return 0;
8005    }
8006    else {
8007        /* wrong return value */
8008        PyErr_SetString(PyExc_TypeError,
8009                        "character mapping must return integer, None or str");
8010        Py_DECREF(x);
8011        return -1;
8012    }
8013}
8014/* ensure that *outobj is at least requiredsize characters long,
8015   if not reallocate and adjust various state variables.
8016   Return 0 on success, -1 on error */
8017static int
8018charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8019                               Py_ssize_t requiredsize)
8020{
8021    Py_ssize_t oldsize = *psize;
8022    if (requiredsize > oldsize) {
8023        /* exponentially overallocate to minimize reallocations */
8024        if (requiredsize < 2 * oldsize)
8025            requiredsize = 2 * oldsize;
8026        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8027        if (*outobj == 0)
8028            return -1;
8029        *psize = requiredsize;
8030    }
8031    return 0;
8032}
8033/* lookup the character, put the result in the output string and adjust
8034   various state variables. Return a new reference to the object that
8035   was put in the output buffer in *result, or Py_None, if the mapping was
8036   undefined (in which case no character was written).
8037   The called must decref result.
8038   Return 0 on success, -1 on error. */
8039static int
8040charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8041                        PyObject *mapping, Py_UCS4 **output,
8042                        Py_ssize_t *osize, Py_ssize_t *opos,
8043                        PyObject **res)
8044{
8045    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8046    if (charmaptranslate_lookup(curinp, mapping, res))
8047        return -1;
8048    if (*res==NULL) {
8049        /* not found => default to 1:1 mapping */
8050        (*output)[(*opos)++] = curinp;
8051    }
8052    else if (*res==Py_None)
8053        ;
8054    else if (PyLong_Check(*res)) {
8055        /* no overflow check, because we know that the space is enough */
8056        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8057    }
8058    else if (PyUnicode_Check(*res)) {
8059        Py_ssize_t repsize;
8060        if (PyUnicode_READY(*res) == -1)
8061            return -1;
8062        repsize = PyUnicode_GET_LENGTH(*res);
8063        if (repsize==1) {
8064            /* no overflow check, because we know that the space is enough */
8065            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8066        }
8067        else if (repsize!=0) {
8068            /* more than one character */
8069            Py_ssize_t requiredsize = *opos +
8070                (PyUnicode_GET_LENGTH(input) - ipos) +
8071                repsize - 1;
8072            Py_ssize_t i;
8073            if (charmaptranslate_makespace(output, osize, requiredsize))
8074                return -1;
8075            for(i = 0; i < repsize; i++)
8076                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8077        }
8078    }
8079    else
8080        return -1;
8081    return 0;
8082}
8083
8084PyObject *
8085_PyUnicode_TranslateCharmap(PyObject *input,
8086                            PyObject *mapping,
8087                            const char *errors)
8088{
8089    /* input object */
8090    char *idata;
8091    Py_ssize_t size, i;
8092    int kind;
8093    /* output buffer */
8094    Py_UCS4 *output = NULL;
8095    Py_ssize_t osize;
8096    PyObject *res;
8097    /* current output position */
8098    Py_ssize_t opos;
8099    char *reason = "character maps to <undefined>";
8100    PyObject *errorHandler = NULL;
8101    PyObject *exc = NULL;
8102    /* the following variable is used for caching string comparisons
8103     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8104     * 3=ignore, 4=xmlcharrefreplace */
8105    int known_errorHandler = -1;
8106
8107    if (mapping == NULL) {
8108        PyErr_BadArgument();
8109        return NULL;
8110    }
8111
8112    if (PyUnicode_READY(input) == -1)
8113        return NULL;
8114    idata = (char*)PyUnicode_DATA(input);
8115    kind = PyUnicode_KIND(input);
8116    size = PyUnicode_GET_LENGTH(input);
8117    i = 0;
8118
8119    if (size == 0) {
8120        Py_INCREF(input);
8121        return input;
8122    }
8123
8124    /* allocate enough for a simple 1:1 translation without
8125       replacements, if we need more, we'll resize */
8126    osize = size;
8127    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8128    opos = 0;
8129    if (output == NULL) {
8130        PyErr_NoMemory();
8131        goto onError;
8132    }
8133
8134    while (i<size) {
8135        /* try to encode it */
8136        PyObject *x = NULL;
8137        if (charmaptranslate_output(input, i, mapping,
8138                                    &output, &osize, &opos, &x)) {
8139            Py_XDECREF(x);
8140            goto onError;
8141        }
8142        Py_XDECREF(x);
8143        if (x!=Py_None) /* it worked => adjust input pointer */
8144            ++i;
8145        else { /* untranslatable character */
8146            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8147            Py_ssize_t repsize;
8148            Py_ssize_t newpos;
8149            Py_ssize_t uni2;
8150            /* startpos for collecting untranslatable chars */
8151            Py_ssize_t collstart = i;
8152            Py_ssize_t collend = i+1;
8153            Py_ssize_t coll;
8154
8155            /* find all untranslatable characters */
8156            while (collend < size) {
8157                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8158                    goto onError;
8159                Py_XDECREF(x);
8160                if (x!=Py_None)
8161                    break;
8162                ++collend;
8163            }
8164            /* cache callback name lookup
8165             * (if not done yet, i.e. it's the first error) */
8166            if (known_errorHandler==-1) {
8167                if ((errors==NULL) || (!strcmp(errors, "strict")))
8168                    known_errorHandler = 1;
8169                else if (!strcmp(errors, "replace"))
8170                    known_errorHandler = 2;
8171                else if (!strcmp(errors, "ignore"))
8172                    known_errorHandler = 3;
8173                else if (!strcmp(errors, "xmlcharrefreplace"))
8174                    known_errorHandler = 4;
8175                else
8176                    known_errorHandler = 0;
8177            }
8178            switch (known_errorHandler) {
8179            case 1: /* strict */
8180                raise_translate_exception(&exc, input, collstart,
8181                                          collend, reason);
8182                goto onError;
8183            case 2: /* replace */
8184                /* No need to check for space, this is a 1:1 replacement */
8185                for (coll = collstart; coll<collend; coll++)
8186                    output[opos++] = '?';
8187                /* fall through */
8188            case 3: /* ignore */
8189                i = collend;
8190                break;
8191            case 4: /* xmlcharrefreplace */
8192                /* generate replacement (temporarily (mis)uses i) */
8193                for (i = collstart; i < collend; ++i) {
8194                    char buffer[2+29+1+1];
8195                    char *cp;
8196                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8197                    if (charmaptranslate_makespace(&output, &osize,
8198                                                   opos+strlen(buffer)+(size-collend)))
8199                        goto onError;
8200                    for (cp = buffer; *cp; ++cp)
8201                        output[opos++] = *cp;
8202                }
8203                i = collend;
8204                break;
8205            default:
8206                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8207                                                                 reason, input, &exc,
8208                                                                 collstart, collend, &newpos);
8209                if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
8210                    goto onError;
8211                /* generate replacement  */
8212                repsize = PyUnicode_GET_LENGTH(repunicode);
8213                if (charmaptranslate_makespace(&output, &osize,
8214                                               opos+repsize+(size-collend))) {
8215                    Py_DECREF(repunicode);
8216                    goto onError;
8217                }
8218                for (uni2 = 0; repsize-->0; ++uni2)
8219                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8220                i = newpos;
8221                Py_DECREF(repunicode);
8222            }
8223        }
8224    }
8225    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8226    if (!res)
8227        goto onError;
8228    PyMem_Free(output);
8229    Py_XDECREF(exc);
8230    Py_XDECREF(errorHandler);
8231    return res;
8232
8233  onError:
8234    PyMem_Free(output);
8235    Py_XDECREF(exc);
8236    Py_XDECREF(errorHandler);
8237    return NULL;
8238}
8239
8240/* Deprecated. Use PyUnicode_Translate instead. */
8241PyObject *
8242PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8243                           Py_ssize_t size,
8244                           PyObject *mapping,
8245                           const char *errors)
8246{
8247    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8248    if (!unicode)
8249        return NULL;
8250    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8251}
8252
8253PyObject *
8254PyUnicode_Translate(PyObject *str,
8255                    PyObject *mapping,
8256                    const char *errors)
8257{
8258    PyObject *result;
8259
8260    str = PyUnicode_FromObject(str);
8261    if (str == NULL)
8262        goto onError;
8263    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8264    Py_DECREF(str);
8265    return result;
8266
8267  onError:
8268    Py_XDECREF(str);
8269    return NULL;
8270}
8271
8272static Py_UCS4
8273fix_decimal_and_space_to_ascii(PyObject *self)
8274{
8275    /* No need to call PyUnicode_READY(self) because this function is only
8276       called as a callback from fixup() which does it already. */
8277    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8278    const int kind = PyUnicode_KIND(self);
8279    void *data = PyUnicode_DATA(self);
8280    Py_UCS4 maxchar = 0, ch, fixed;
8281    Py_ssize_t i;
8282
8283    for (i = 0; i < len; ++i) {
8284        ch = PyUnicode_READ(kind, data, i);
8285        fixed = 0;
8286        if (ch > 127) {
8287            if (Py_UNICODE_ISSPACE(ch))
8288                fixed = ' ';
8289            else {
8290                const int decimal = Py_UNICODE_TODECIMAL(ch);
8291                if (decimal >= 0)
8292                    fixed = '0' + decimal;
8293            }
8294            if (fixed != 0) {
8295                if (fixed > maxchar)
8296                    maxchar = fixed;
8297                PyUnicode_WRITE(kind, data, i, fixed);
8298            }
8299            else if (ch > maxchar)
8300                maxchar = ch;
8301        }
8302        else if (ch > maxchar)
8303            maxchar = ch;
8304    }
8305
8306    return maxchar;
8307}
8308
8309PyObject *
8310_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8311{
8312    if (!PyUnicode_Check(unicode)) {
8313        PyErr_BadInternalCall();
8314        return NULL;
8315    }
8316    if (PyUnicode_READY(unicode) == -1)
8317        return NULL;
8318    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8319        /* If the string is already ASCII, just return the same string */
8320        Py_INCREF(unicode);
8321        return unicode;
8322    }
8323    return fixup(unicode, fix_decimal_and_space_to_ascii);
8324}
8325
8326PyObject *
8327PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8328                                  Py_ssize_t length)
8329{
8330    PyObject *result;
8331    Py_UNICODE *p; /* write pointer into result */
8332    Py_ssize_t i;
8333    /* Copy to a new string */
8334    result = (PyObject *)_PyUnicode_New(length);
8335    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8336    if (result == NULL)
8337        return result;
8338    p = PyUnicode_AS_UNICODE(result);
8339    /* Iterate over code points */
8340    for (i = 0; i < length; i++) {
8341        Py_UNICODE ch =s[i];
8342        if (ch > 127) {
8343            int decimal = Py_UNICODE_TODECIMAL(ch);
8344            if (decimal >= 0)
8345                p[i] = '0' + decimal;
8346        }
8347    }
8348#ifndef DONT_MAKE_RESULT_READY
8349    if (_PyUnicode_READY_REPLACE(&result)) {
8350        Py_DECREF(result);
8351        return NULL;
8352    }
8353#endif
8354    assert(_PyUnicode_CheckConsistency(result, 1));
8355    return result;
8356}
8357/* --- Decimal Encoder ---------------------------------------------------- */
8358
8359int
8360PyUnicode_EncodeDecimal(Py_UNICODE *s,
8361                        Py_ssize_t length,
8362                        char *output,
8363                        const char *errors)
8364{
8365    Py_UNICODE *p, *end;
8366    PyObject *errorHandler = NULL;
8367    PyObject *exc = NULL;
8368    const char *encoding = "decimal";
8369    const char *reason = "invalid decimal Unicode string";
8370    /* the following variable is used for caching string comparisons
8371     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8372    int known_errorHandler = -1;
8373
8374    if (output == NULL) {
8375        PyErr_BadArgument();
8376        return -1;
8377    }
8378
8379    p = s;
8380    end = s + length;
8381    while (p < end) {
8382        register Py_UNICODE ch = *p;
8383        int decimal;
8384        PyObject *repunicode;
8385        Py_ssize_t repsize;
8386        Py_ssize_t newpos;
8387        Py_UNICODE *uni2;
8388        Py_UNICODE *collstart;
8389        Py_UNICODE *collend;
8390
8391        if (Py_UNICODE_ISSPACE(ch)) {
8392            *output++ = ' ';
8393            ++p;
8394            continue;
8395        }
8396        decimal = Py_UNICODE_TODECIMAL(ch);
8397        if (decimal >= 0) {
8398            *output++ = '0' + decimal;
8399            ++p;
8400            continue;
8401        }
8402        if (0 < ch && ch < 256) {
8403            *output++ = (char)ch;
8404            ++p;
8405            continue;
8406        }
8407        /* All other characters are considered unencodable */
8408        collstart = p;
8409        collend = p+1;
8410        while (collend < end) {
8411            if ((0 < *collend && *collend < 256) ||
8412                !Py_UNICODE_ISSPACE(*collend) ||
8413                Py_UNICODE_TODECIMAL(*collend))
8414                break;
8415        }
8416        /* cache callback name lookup
8417         * (if not done yet, i.e. it's the first error) */
8418        if (known_errorHandler==-1) {
8419            if ((errors==NULL) || (!strcmp(errors, "strict")))
8420                known_errorHandler = 1;
8421            else if (!strcmp(errors, "replace"))
8422                known_errorHandler = 2;
8423            else if (!strcmp(errors, "ignore"))
8424                known_errorHandler = 3;
8425            else if (!strcmp(errors, "xmlcharrefreplace"))
8426                known_errorHandler = 4;
8427            else
8428                known_errorHandler = 0;
8429        }
8430        switch (known_errorHandler) {
8431        case 1: /* strict */
8432            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8433            goto onError;
8434        case 2: /* replace */
8435            for (p = collstart; p < collend; ++p)
8436                *output++ = '?';
8437            /* fall through */
8438        case 3: /* ignore */
8439            p = collend;
8440            break;
8441        case 4: /* xmlcharrefreplace */
8442            /* generate replacement (temporarily (mis)uses p) */
8443            for (p = collstart; p < collend; ++p)
8444                output += sprintf(output, "&#%d;", (int)*p);
8445            p = collend;
8446            break;
8447        default:
8448            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8449                                                          encoding, reason, s, length, &exc,
8450                                                          collstart-s, collend-s, &newpos);
8451            if (repunicode == NULL)
8452                goto onError;
8453            if (!PyUnicode_Check(repunicode)) {
8454                /* Byte results not supported, since they have no decimal property. */
8455                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8456                Py_DECREF(repunicode);
8457                goto onError;
8458            }
8459            /* generate replacement  */
8460            repsize = PyUnicode_GET_SIZE(repunicode);
8461            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8462                Py_UNICODE ch = *uni2;
8463                if (Py_UNICODE_ISSPACE(ch))
8464                    *output++ = ' ';
8465                else {
8466                    decimal = Py_UNICODE_TODECIMAL(ch);
8467                    if (decimal >= 0)
8468                        *output++ = '0' + decimal;
8469                    else if (0 < ch && ch < 256)
8470                        *output++ = (char)ch;
8471                    else {
8472                        Py_DECREF(repunicode);
8473                        raise_encode_exception(&exc, encoding,
8474                                               s, length, collstart-s, collend-s, reason);
8475                        goto onError;
8476                    }
8477                }
8478            }
8479            p = s + newpos;
8480            Py_DECREF(repunicode);
8481        }
8482    }
8483    /* 0-terminate the output string */
8484    *output++ = '\0';
8485    Py_XDECREF(exc);
8486    Py_XDECREF(errorHandler);
8487    return 0;
8488
8489  onError:
8490    Py_XDECREF(exc);
8491    Py_XDECREF(errorHandler);
8492    return -1;
8493}
8494
8495/* --- Helpers ------------------------------------------------------------ */
8496
8497#include "stringlib/asciilib.h"
8498#include "stringlib/fastsearch.h"
8499#include "stringlib/partition.h"
8500#include "stringlib/split.h"
8501#include "stringlib/count.h"
8502#include "stringlib/find.h"
8503#include "stringlib/localeutil.h"
8504#include "stringlib/undef.h"
8505
8506#include "stringlib/ucs1lib.h"
8507#include "stringlib/fastsearch.h"
8508#include "stringlib/partition.h"
8509#include "stringlib/split.h"
8510#include "stringlib/count.h"
8511#include "stringlib/find.h"
8512#include "stringlib/localeutil.h"
8513#include "stringlib/undef.h"
8514
8515#include "stringlib/ucs2lib.h"
8516#include "stringlib/fastsearch.h"
8517#include "stringlib/partition.h"
8518#include "stringlib/split.h"
8519#include "stringlib/count.h"
8520#include "stringlib/find.h"
8521#include "stringlib/localeutil.h"
8522#include "stringlib/undef.h"
8523
8524#include "stringlib/ucs4lib.h"
8525#include "stringlib/fastsearch.h"
8526#include "stringlib/partition.h"
8527#include "stringlib/split.h"
8528#include "stringlib/count.h"
8529#include "stringlib/find.h"
8530#include "stringlib/localeutil.h"
8531#include "stringlib/undef.h"
8532
8533static Py_ssize_t
8534any_find_slice(int direction, PyObject* s1, PyObject* s2,
8535               Py_ssize_t start,
8536               Py_ssize_t end)
8537{
8538    int kind1, kind2, kind;
8539    void *buf1, *buf2;
8540    Py_ssize_t len1, len2, result;
8541
8542    kind1 = PyUnicode_KIND(s1);
8543    kind2 = PyUnicode_KIND(s2);
8544    kind = kind1 > kind2 ? kind1 : kind2;
8545    buf1 = PyUnicode_DATA(s1);
8546    buf2 = PyUnicode_DATA(s2);
8547    if (kind1 != kind)
8548        buf1 = _PyUnicode_AsKind(s1, kind);
8549    if (!buf1)
8550        return -2;
8551    if (kind2 != kind)
8552        buf2 = _PyUnicode_AsKind(s2, kind);
8553    if (!buf2) {
8554        if (kind1 != kind) PyMem_Free(buf1);
8555        return -2;
8556    }
8557    len1 = PyUnicode_GET_LENGTH(s1);
8558    len2 = PyUnicode_GET_LENGTH(s2);
8559
8560    if (direction > 0) {
8561        switch(kind) {
8562        case PyUnicode_1BYTE_KIND:
8563            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8564                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8565            else
8566                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8567            break;
8568        case PyUnicode_2BYTE_KIND:
8569            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8570            break;
8571        case PyUnicode_4BYTE_KIND:
8572            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8573            break;
8574        default:
8575            assert(0); result = -2;
8576        }
8577    }
8578    else {
8579        switch(kind) {
8580        case PyUnicode_1BYTE_KIND:
8581            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8582                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8583            else
8584                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8585            break;
8586        case PyUnicode_2BYTE_KIND:
8587            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8588            break;
8589        case PyUnicode_4BYTE_KIND:
8590            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8591            break;
8592        default:
8593            assert(0); result = -2;
8594        }
8595    }
8596
8597    if (kind1 != kind)
8598        PyMem_Free(buf1);
8599    if (kind2 != kind)
8600        PyMem_Free(buf2);
8601
8602    return result;
8603}
8604
8605Py_ssize_t
8606_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
8607                                   Py_ssize_t n_buffer,
8608                                   void *digits, Py_ssize_t n_digits,
8609                                   Py_ssize_t min_width,
8610                                   const char *grouping,
8611                                   const char *thousands_sep)
8612{
8613    switch(kind) {
8614    case PyUnicode_1BYTE_KIND:
8615        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8616            return _PyUnicode_ascii_InsertThousandsGrouping(
8617                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8618                min_width, grouping, thousands_sep);
8619        else
8620            return _PyUnicode_ucs1_InsertThousandsGrouping(
8621                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8622                min_width, grouping, thousands_sep);
8623    case PyUnicode_2BYTE_KIND:
8624        return _PyUnicode_ucs2_InsertThousandsGrouping(
8625            (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8626            min_width, grouping, thousands_sep);
8627    case PyUnicode_4BYTE_KIND:
8628        return _PyUnicode_ucs4_InsertThousandsGrouping(
8629            (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8630            min_width, grouping, thousands_sep);
8631    }
8632    assert(0);
8633    return -1;
8634}
8635
8636
8637#include "stringlib/unicodedefs.h"
8638#include "stringlib/fastsearch.h"
8639
8640#include "stringlib/count.h"
8641#include "stringlib/find.h"
8642
/* Helper macro to fix up start/end slice values in place.

   - end greater than len is clamped to len;
   - a negative end or start has len added to it and, if still negative,
     is clamped to 0;
   - start is never clamped against len, so it may remain larger than len
     (or larger than end) afterwards.

   start and end are assigned to, so they must be modifiable lvalues; all
   three arguments can be evaluated more than once.  The expansion is a
   sequence of plain `if` statements, not a single statement wrapped in
   do { } while (0), so it must not be used as the unbraced body of another
   `if`/`else`. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
8657
8658Py_ssize_t
8659PyUnicode_Count(PyObject *str,
8660                PyObject *substr,
8661                Py_ssize_t start,
8662                Py_ssize_t end)
8663{
8664    Py_ssize_t result;
8665    PyUnicodeObject* str_obj;
8666    PyUnicodeObject* sub_obj;
8667    int kind1, kind2, kind;
8668    void *buf1 = NULL, *buf2 = NULL;
8669    Py_ssize_t len1, len2;
8670
8671    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
8672    if (!str_obj || PyUnicode_READY(str_obj) == -1)
8673        return -1;
8674    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
8675    if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
8676        Py_DECREF(str_obj);
8677        return -1;
8678    }
8679
8680    kind1 = PyUnicode_KIND(str_obj);
8681    kind2 = PyUnicode_KIND(sub_obj);
8682    kind = kind1 > kind2 ? kind1 : kind2;
8683    buf1 = PyUnicode_DATA(str_obj);
8684    if (kind1 != kind)
8685        buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8686    if (!buf1)
8687        goto onError;
8688    buf2 = PyUnicode_DATA(sub_obj);
8689    if (kind2 != kind)
8690        buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8691    if (!buf2)
8692        goto onError;
8693    len1 = PyUnicode_GET_LENGTH(str_obj);
8694    len2 = PyUnicode_GET_LENGTH(sub_obj);
8695
8696    ADJUST_INDICES(start, end, len1);
8697    switch(kind) {
8698    case PyUnicode_1BYTE_KIND:
8699        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8700            result = asciilib_count(
8701                ((Py_UCS1*)buf1) + start, end - start,
8702                buf2, len2, PY_SSIZE_T_MAX
8703                );
8704        else
8705            result = ucs1lib_count(
8706                ((Py_UCS1*)buf1) + start, end - start,
8707                buf2, len2, PY_SSIZE_T_MAX
8708                );
8709        break;
8710    case PyUnicode_2BYTE_KIND:
8711        result = ucs2lib_count(
8712            ((Py_UCS2*)buf1) + start, end - start,
8713            buf2, len2, PY_SSIZE_T_MAX
8714            );
8715        break;
8716    case PyUnicode_4BYTE_KIND:
8717        result = ucs4lib_count(
8718            ((Py_UCS4*)buf1) + start, end - start,
8719            buf2, len2, PY_SSIZE_T_MAX
8720            );
8721        break;
8722    default:
8723        assert(0); result = 0;
8724    }
8725
8726    Py_DECREF(sub_obj);
8727    Py_DECREF(str_obj);
8728
8729    if (kind1 != kind)
8730        PyMem_Free(buf1);
8731    if (kind2 != kind)
8732        PyMem_Free(buf2);
8733
8734    return result;
8735  onError:
8736    Py_DECREF(sub_obj);
8737    Py_DECREF(str_obj);
8738    if (kind1 != kind && buf1)
8739        PyMem_Free(buf1);
8740    if (kind2 != kind && buf2)
8741        PyMem_Free(buf2);
8742    return -1;
8743}
8744
8745Py_ssize_t
8746PyUnicode_Find(PyObject *str,
8747               PyObject *sub,
8748               Py_ssize_t start,
8749               Py_ssize_t end,
8750               int direction)
8751{
8752    Py_ssize_t result;
8753
8754    str = PyUnicode_FromObject(str);
8755    if (!str || PyUnicode_READY(str) == -1)
8756        return -2;
8757    sub = PyUnicode_FromObject(sub);
8758    if (!sub || PyUnicode_READY(sub) == -1) {
8759        Py_DECREF(str);
8760        return -2;
8761    }
8762
8763    result = any_find_slice(direction,
8764        str, sub, start, end
8765        );
8766
8767    Py_DECREF(str);
8768    Py_DECREF(sub);
8769
8770    return result;
8771}
8772
8773Py_ssize_t
8774PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8775                   Py_ssize_t start, Py_ssize_t end,
8776                   int direction)
8777{
8778    char *result;
8779    int kind;
8780    if (PyUnicode_READY(str) == -1)
8781        return -2;
8782    if (start < 0 || end < 0) {
8783        PyErr_SetString(PyExc_IndexError, "string index out of range");
8784        return -2;
8785    }
8786    if (end > PyUnicode_GET_LENGTH(str))
8787        end = PyUnicode_GET_LENGTH(str);
8788    kind = PyUnicode_KIND(str);
8789    result = findchar(PyUnicode_1BYTE_DATA(str)
8790                      + kind*start,
8791                      kind,
8792                      end-start, ch, direction);
8793    if (!result)
8794        return -1;
8795    return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8796}
8797
8798static int
8799tailmatch(PyUnicodeObject *self,
8800          PyUnicodeObject *substring,
8801          Py_ssize_t start,
8802          Py_ssize_t end,
8803          int direction)
8804{
8805    int kind_self;
8806    int kind_sub;
8807    void *data_self;
8808    void *data_sub;
8809    Py_ssize_t offset;
8810    Py_ssize_t i;
8811    Py_ssize_t end_sub;
8812
8813    if (PyUnicode_READY(self) == -1 ||
8814        PyUnicode_READY(substring) == -1)
8815        return 0;
8816
8817    if (PyUnicode_GET_LENGTH(substring) == 0)
8818        return 1;
8819
8820    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8821    end -= PyUnicode_GET_LENGTH(substring);
8822    if (end < start)
8823        return 0;
8824
8825    kind_self = PyUnicode_KIND(self);
8826    data_self = PyUnicode_DATA(self);
8827    kind_sub = PyUnicode_KIND(substring);
8828    data_sub = PyUnicode_DATA(substring);
8829    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8830
8831    if (direction > 0)
8832        offset = end;
8833    else
8834        offset = start;
8835
8836    if (PyUnicode_READ(kind_self, data_self, offset) ==
8837        PyUnicode_READ(kind_sub, data_sub, 0) &&
8838        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8839        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8840        /* If both are of the same kind, memcmp is sufficient */
8841        if (kind_self == kind_sub) {
8842            return ! memcmp((char *)data_self +
8843                                (offset * PyUnicode_KIND(substring)),
8844                            data_sub,
8845                            PyUnicode_GET_LENGTH(substring) *
8846                                PyUnicode_KIND(substring));
8847        }
8848        /* otherwise we have to compare each character by first accesing it */
8849        else {
8850            /* We do not need to compare 0 and len(substring)-1 because
8851               the if statement above ensured already that they are equal
8852               when we end up here. */
8853            // TODO: honor direction and do a forward or backwards search
8854            for (i = 1; i < end_sub; ++i) {
8855                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8856                    PyUnicode_READ(kind_sub, data_sub, i))
8857                    return 0;
8858            }
8859            return 1;
8860        }
8861    }
8862
8863    return 0;
8864}
8865
8866Py_ssize_t
8867PyUnicode_Tailmatch(PyObject *str,
8868                    PyObject *substr,
8869                    Py_ssize_t start,
8870                    Py_ssize_t end,
8871                    int direction)
8872{
8873    Py_ssize_t result;
8874
8875    str = PyUnicode_FromObject(str);
8876    if (str == NULL)
8877        return -1;
8878    substr = PyUnicode_FromObject(substr);
8879    if (substr == NULL) {
8880        Py_DECREF(str);
8881        return -1;
8882    }
8883
8884    result = tailmatch((PyUnicodeObject *)str,
8885                       (PyUnicodeObject *)substr,
8886                       start, end, direction);
8887    Py_DECREF(str);
8888    Py_DECREF(substr);
8889    return result;
8890}
8891
8892/* Apply fixfct filter to the Unicode object self and return a
8893   reference to the modified object */
8894
8895static PyObject *
8896fixup(PyObject *self,
8897      Py_UCS4 (*fixfct)(PyObject *s))
8898{
8899    PyObject *u;
8900    Py_UCS4 maxchar_old, maxchar_new = 0;
8901
8902    if (PyUnicode_READY(self) == -1)
8903        return NULL;
8904    maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8905    u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8906                      maxchar_old);
8907    if (u == NULL)
8908        return NULL;
8909
8910    Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8911              PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
8912
8913    /* fix functions return the new maximum character in a string,
8914       if the kind of the resulting unicode object does not change,
8915       everything is fine.  Otherwise we need to change the string kind
8916       and re-run the fix function. */
8917    maxchar_new = fixfct(u);
8918    if (maxchar_new == 0)
8919        /* do nothing, keep maxchar_new at 0 which means no changes. */;
8920    else if (maxchar_new <= 127)
8921        maxchar_new = 127;
8922    else if (maxchar_new <= 255)
8923        maxchar_new = 255;
8924    else if (maxchar_new <= 65535)
8925        maxchar_new = 65535;
8926    else
8927        maxchar_new = 1114111; /* 0x10ffff */
8928
8929    if (!maxchar_new && PyUnicode_CheckExact(self)) {
8930        /* fixfct should return TRUE if it modified the buffer. If
8931           FALSE, return a reference to the original buffer instead
8932           (to save space, not time) */
8933        Py_INCREF(self);
8934        Py_DECREF(u);
8935        return (PyObject*) self;
8936    }
8937    else if (maxchar_new == maxchar_old) {
8938        return u;
8939    }
8940    else {
8941        /* In case the maximum character changed, we need to
8942           convert the string to the new category. */
8943        PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
8944        if (v == NULL) {
8945            Py_DECREF(u);
8946            return NULL;
8947        }
8948        if (maxchar_new > maxchar_old) {
8949            /* If the maxchar increased so that the kind changed, not all
8950               characters are representable anymore and we need to fix the
8951               string again. This only happens in very few cases. */
8952            copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
8953            maxchar_old = fixfct(v);
8954            assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8955        }
8956        else {
8957            copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
8958        }
8959
8960        Py_DECREF(u);
8961        assert(_PyUnicode_CheckConsistency(v, 1));
8962        return v;
8963    }
8964}
8965
static Py_UCS4
fixupper(PyObject *self)
{
    /* Upper-case every character of self in place.

       Returns the largest code point present after the conversion, or 0
       when no character had to be changed.

       PyUnicode_READY(self) is not called here: this function is only
       used as a callback of fixup(), which has already done so. */
    void *buffer = PyUnicode_DATA(self);
    const int width = PyUnicode_KIND(self);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    Py_UCS4 highest = 0;
    int modified = 0;
    Py_ssize_t pos = 0;

    while (pos < length) {
        const Py_UCS4 before = PyUnicode_READ(width, buffer, pos);
        const Py_UCS4 after = Py_UNICODE_TOUPPER(before);

        if (after != before) {
            PyUnicode_WRITE(width, buffer, pos, after);
            modified = 1;
        }
        /* 'after' is the character now stored at pos, whether or not it
           was rewritten. */
        if (after > highest)
            highest = after;
        pos++;
    }

    return modified ? highest : 0;
}
8996
static Py_UCS4
fixlower(PyObject *self)
{
    /* Lower-case every character of self in place.

       Returns the largest code point present after the conversion, or 0
       when no character had to be changed.

       No PyUnicode_READY(self) call is needed because fixup(), which
       invokes this callback, already does it. */
    const int width = PyUnicode_KIND(self);
    void *buffer = PyUnicode_DATA(self);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    Py_UCS4 highest = 0;
    int modified = 0;
    Py_ssize_t pos;

    for (pos = 0; pos < length; pos++) {
        const Py_UCS4 before = PyUnicode_READ(width, buffer, pos);
        const Py_UCS4 after = Py_UNICODE_TOLOWER(before);

        if (after != before) {
            PyUnicode_WRITE(width, buffer, pos, after);
            modified = 1;
        }
        /* Track the character that ends up stored at this position. */
        if (after > highest)
            highest = after;
    }

    if (!modified)
        return 0;
    return highest;
}
9026
static Py_UCS4
fixswapcase(PyObject *self)
{
    /* Swap the case of every cased character of self in place:
       upper-case characters become lower case and vice versa.

       Returns the largest code point present after the conversion, or 0
       when no character was rewritten.

       No PyUnicode_READY(self) call is needed because fixup(), which
       invokes this callback, already does it. */
    void *buffer = PyUnicode_DATA(self);
    const int width = PyUnicode_KIND(self);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    Py_UCS4 highest = 0;
    int modified = 0;
    Py_ssize_t pos;

    for (pos = 0; pos < length; pos++) {
        const Py_UCS4 current = PyUnicode_READ(width, buffer, pos);
        Py_UCS4 swapped;

        /* 0 doubles as the "leave this character alone" marker. */
        if (Py_UNICODE_ISUPPER(current))
            swapped = Py_UNICODE_TOLOWER(current);
        else if (Py_UNICODE_ISLOWER(current))
            swapped = Py_UNICODE_TOUPPER(current);
        else
            swapped = 0;

        if (swapped == 0) {
            if (current > highest)
                highest = current;
            continue;
        }

        if (swapped > highest)
            highest = swapped;
        PyUnicode_WRITE(width, buffer, pos, swapped);
        modified = 1;
    }

    return modified ? highest : 0;
}
9062
static Py_UCS4
fixcapitalize(PyObject *self)
{
    /* Capitalize self in place: the first character is upper-cased and
       every following character is lower-cased.

       Returns the largest code point present after the conversion, or 0
       when nothing was written (this includes the empty string).

       No PyUnicode_READY(self) call is needed because fixup(), which
       invokes this callback, already does it. */
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    void *data = PyUnicode_DATA(self);
    int touched = 0;
    Py_UCS4 maxchar = 0;
    Py_ssize_t i = 0;
    Py_UCS4 ch;

    if (len == 0)
        return 0;

    ch = PyUnicode_READ(kind, data, i);
    if (!Py_UNICODE_ISUPPER(ch)) {
        maxchar = Py_UNICODE_TOUPPER(ch);
        PyUnicode_WRITE(kind, data, i, maxchar);
        touched = 1;
    }
    else {
        /* The first character stays as it is, but it still has to be
           counted: otherwise the returned maximum can be smaller than a
           character actually present, and fixup() would copy the string
           into a representation too narrow to hold it. */
        maxchar = ch;
    }
    ++i;
    for(; i < len; ++i) {
        ch = PyUnicode_READ(kind, data, i);
        if (!Py_UNICODE_ISLOWER(ch)) {
            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
            if (lo > maxchar)
                maxchar = lo;
            PyUnicode_WRITE(kind, data, i, lo);
            touched = 1;
        }
        else if (ch > maxchar)
            maxchar = ch;
    }

    if (touched)
        return maxchar;
    else
        return 0;
}
9103
static Py_UCS4
fixtitle(PyObject *self)
{
    /* Title-case self in place: a character that follows a cased
       character is lower-cased, any other character is title-cased.

       Returns the largest code point written.  For a one-character
       string, 0 is returned when the character is already in title case.

       No PyUnicode_READY(self) call is needed because fixup(), which
       invokes this callback, already does it. */
    void *buffer = PyUnicode_DATA(self);
    const int width = PyUnicode_KIND(self);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    Py_UCS4 highest = 0;
    int after_cased = 0;
    Py_ssize_t pos;

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 single = PyUnicode_READ(width, buffer, 0);
        const Py_UCS4 titled = Py_UNICODE_TOTITLE(single);

        if (titled == single)
            return 0;
        PyUnicode_WRITE(width, buffer, 0, titled);
        return titled;
    }

    for (pos = 0; pos < length; pos++) {
        const Py_UCS4 current = PyUnicode_READ(width, buffer, pos);
        const Py_UCS4 mapped = after_cased ? Py_UNICODE_TOLOWER(current)
                                           : Py_UNICODE_TOTITLE(current);

        PyUnicode_WRITE(width, buffer, pos, mapped);
        if (mapped > highest)
            highest = mapped;

        /* The decision for the next character depends on the original
           character, not on its mapped form. */
        after_cased = (Py_UNICODE_ISLOWER(current) ||
                       Py_UNICODE_ISUPPER(current) ||
                       Py_UNICODE_ISTITLE(current));
    }
    return highest;
}
9149
/* Concatenate the items of seq, inserting separator between consecutive
   items, and return the result as a new reference.

   separator: a str instance, or NULL to use a single space.
   seq:       any object accepted by PySequence_Fast(); every item must
              be a str instance.

   Returns NULL on failure.  A TypeError is set when the separator or an
   item is not a str, an OverflowError when the total length would exceed
   PY_SSIZE_T_MAX.

   An empty sequence yields the shared empty string; a one-item sequence
   holding an exact str yields that very item. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *res = NULL; /* the result */
    PyObject *fseq;          /* PySequence_Fast(seq) */
    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
    PyObject **items;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    fseq = PySequence_Fast(seq, "");
    if (fseq == NULL) {
        return NULL;
    }

    /* NOTE: the following code can't call back into Python code,
     * so we are sure that fseq won't be mutated.
     */

    seqlen = PySequence_Fast_GET_SIZE(fseq);
    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        Py_DECREF(fseq);
        Py_INCREF(unicode_empty);
        res = unicode_empty;
        return res;
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    items = PySequence_Fast_ITEMS(fseq);
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            Py_DECREF(fseq);
            return res;
        }
        /* A single str-subclass item: no separator is ever emitted. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all argument are strings.
     */
    sz = 0;
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        Py_ssize_t add_sz;   /* characters contributed by this iteration */
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every overflow test is made BEFORE the addition it guards:
           signed overflow is undefined behaviour, so it cannot be
           detected reliably after the fact. */
        if (i != 0) {
            if (add_sz > PY_SSIZE_T_MAX - seplen)
                goto overflow;
            add_sz += seplen;
        }
        if (sz > PY_SSIZE_T_MAX - add_sz)
            goto overflow;
        sz += add_sz;
        /* The raw memcpy path below is only valid when the separator and
           all items share one character width. */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    for (i = 0, res_offset = 0; i < seqlen; ++i) {
        Py_ssize_t itemlen;
        item = items[i];
        /* Copy item, and maybe the separator. */
        if (i && seplen != 0) {
            if (use_memcpy) {
                Py_MEMCPY(res_data,
                          sep_data,
                          kind * seplen);
                res_data += kind * seplen;
            }
            else {
                copy_characters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }
        }
        itemlen = PyUnicode_GET_LENGTH(item);
        if (itemlen != 0) {
            if (use_memcpy) {
                Py_MEMCPY(res_data,
                          PyUnicode_DATA(item),
                          kind * itemlen);
                res_data += kind * itemlen;
            }
            else {
                copy_characters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
    }
    /* Sanity check: the write cursor must sit exactly at the end. */
    if (use_memcpy)
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    else
        assert(res_offset == PyUnicode_GET_LENGTH(res));

    Py_DECREF(fseq);
    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "join() result is too long for a Python string");
    /* fall through to the common cleanup */
  onError:
    Py_DECREF(fseq);
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
9331
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, starting at character
   index `start`.  `kind` selects the character width of the buffer and
   must not be PyUnicode_WCHAR_KIND.  Helper of pad() only. */
#define FILL(kind, data, value, start, length) \
    do { \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        if ((kind) == PyUnicode_1BYTE_KIND) { \
            unsigned char *cur_ = (unsigned char *)(data) + (start); \
            memset(cur_, (unsigned char)(value), (length)); \
        } \
        else if ((kind) == PyUnicode_2BYTE_KIND) { \
            Py_UCS2 *cur_ = (Py_UCS2 *)(data) + (start); \
            Py_ssize_t left_ = (length); \
            for (; left_ > 0; --left_) \
                *cur_++ = (value); \
        } \
        else { \
            Py_UCS4 *cur_ = (Py_UCS4 *)(data) + (start); \
            Py_ssize_t left_ = (length); \
            for (; left_ > 0; --left_) \
                *cur_++ = (value); \
        } \
    } while (0)

/* Return a new string made of `left` copies of `fill`, the contents of
   self, and `right` copies of `fill`.  Negative counts are treated as 0.

   When no padding is requested and self is an exact str, self itself is
   returned (with a new reference).  Returns NULL with OverflowError set
   when the padded length would exceed PY_SSIZE_T_MAX, or NULL when the
   result cannot be allocated. */
static PyObject *
pad(PyObject *self,
    Py_ssize_t left,
    Py_ssize_t right,
    Py_UCS4 fill)
{
    PyObject *padded;
    Py_UCS4 widest;
    Py_ssize_t selflen;
    int kind;
    void *data;

    left = (left < 0) ? 0 : left;
    right = (right < 0) ? 0 : right;

    if (PyUnicode_CheckExact(self) && left == 0 && right == 0) {
        Py_INCREF(self);
        return self;
    }

    selflen = _PyUnicode_LENGTH(self);
    if (left > PY_SSIZE_T_MAX - selflen ||
        right > PY_SSIZE_T_MAX - (left + selflen)) {
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
        return NULL;
    }

    /* The result must be wide enough for both the text and the fill
       character. */
    widest = PyUnicode_MAX_CHAR_VALUE(self);
    if (widest < fill)
        widest = fill;

    padded = PyUnicode_New(left + selflen + right, widest);
    if (padded == NULL)
        return NULL;

    kind = PyUnicode_KIND(padded);
    data = PyUnicode_DATA(padded);
    if (left != 0)
        FILL(kind, data, fill, 0, left);
    if (right != 0)
        FILL(kind, data, fill, left + selflen, right);
    copy_characters(padded, left, self, 0, selflen);
    assert(_PyUnicode_CheckConsistency(padded, 1));
    return padded;
}
#undef FILL
9399
/* Split string at line boundaries and return the result of the matching
   *_splitlines() helper (a list object), or NULL on failure.

   string:   any object accepted by PyUnicode_FromObject().
   keepends: passed through to the helper unchanged. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *list;

    /* From here on `string` is a reference owned by this function; it is
       released before every return below. */
    string = PyUnicode_FromObject(string);
    if (string == NULL)
        return NULL;
    if (PyUnicode_READY(string) == -1) {
        /* Do not leak the reference obtained above. */
        Py_DECREF(string);
        return NULL;
    }

    /* Dispatch on the character width of the string. */
    switch(PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string))
            list = asciilib_splitlines(
                (PyObject*) string, PyUnicode_1BYTE_DATA(string),
                PyUnicode_GET_LENGTH(string), keepends);
        else
            list = ucs1lib_splitlines(
                (PyObject*) string, PyUnicode_1BYTE_DATA(string),
                PyUnicode_GET_LENGTH(string), keepends);
        break;
    case PyUnicode_2BYTE_KIND:
        list = ucs2lib_splitlines(
            (PyObject*) string, PyUnicode_2BYTE_DATA(string),
            PyUnicode_GET_LENGTH(string), keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        list = ucs4lib_splitlines(
            (PyObject*) string, PyUnicode_4BYTE_DATA(string),
            PyUnicode_GET_LENGTH(string), keepends);
        break;
    default:
        assert(0);
        list = 0;
    }
    Py_DECREF(string);
    return list;
}
9437
/* Split self from the left and return the value produced by the matching
   *_split() helper.

   substring: the separator, or NULL to split on whitespace.
   maxcount:  maximum number of splits; a negative value means no limit.

   Returns NULL on failure. */
static PyObject *
split(PyObject *self,
      PyObject *substring,
      Py_ssize_t maxcount)
{
    int self_kind, sep_kind, common_kind;
    void *self_buf, *sep_buf;
    Py_ssize_t self_len, sep_len;
    PyObject *result;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;

    if (PyUnicode_READY(self) == -1)
        return NULL;

    if (substring == NULL) {
        /* Whitespace splitting only needs self: dispatch on its width. */
        const int width = PyUnicode_KIND(self);
        const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

        if (width == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(self))
                return asciilib_split_whitespace(
                    self, PyUnicode_1BYTE_DATA(self), length, maxcount);
            return ucs1lib_split_whitespace(
                self, PyUnicode_1BYTE_DATA(self), length, maxcount);
        }
        if (width == PyUnicode_2BYTE_KIND)
            return ucs2lib_split_whitespace(
                self, PyUnicode_2BYTE_DATA(self), length, maxcount);
        if (width == PyUnicode_4BYTE_KIND)
            return ucs4lib_split_whitespace(
                self, PyUnicode_4BYTE_DATA(self), length, maxcount);
        assert(0);
        return NULL;
    }

    if (PyUnicode_READY(substring) == -1)
        return NULL;

    /* Both operands must use the same character width; the narrower one
       is converted into a temporary buffer of the wider kind. */
    self_kind = PyUnicode_KIND(self);
    sep_kind = PyUnicode_KIND(substring);
    common_kind = (self_kind < sep_kind) ? sep_kind : self_kind;

    if (self_kind == common_kind)
        self_buf = PyUnicode_DATA(self);
    else
        self_buf = _PyUnicode_AsKind(self, common_kind);
    if (self_buf == NULL)
        return NULL;

    if (sep_kind == common_kind)
        sep_buf = PyUnicode_DATA(substring);
    else
        sep_buf = _PyUnicode_AsKind(substring, common_kind);
    if (sep_buf == NULL) {
        if (self_kind != common_kind)
            PyMem_Free(self_buf);
        return NULL;
    }

    self_len = PyUnicode_GET_LENGTH(self);
    sep_len = PyUnicode_GET_LENGTH(substring);

    if (common_kind == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
            result = asciilib_split(
                self, self_buf, self_len, sep_buf, sep_len, maxcount);
        else
            result = ucs1lib_split(
                self, self_buf, self_len, sep_buf, sep_len, maxcount);
    }
    else if (common_kind == PyUnicode_2BYTE_KIND) {
        result = ucs2lib_split(
            self, self_buf, self_len, sep_buf, sep_len, maxcount);
    }
    else if (common_kind == PyUnicode_4BYTE_KIND) {
        result = ucs4lib_split(
            self, self_buf, self_len, sep_buf, sep_len, maxcount);
    }
    else {
        result = NULL;
    }

    /* Only converted buffers are owned by this function. */
    if (self_kind != common_kind)
        PyMem_Free(self_buf);
    if (sep_kind != common_kind)
        PyMem_Free(sep_buf);
    return result;
}
9529
/* Split self from the right and return the value produced by the
   matching *_rsplit() helper.

   substring: the separator, or NULL to split on whitespace.
   maxcount:  maximum number of splits; a negative value means no limit.

   Returns NULL on failure. */
static PyObject *
rsplit(PyObject *self,
       PyObject *substring,
       Py_ssize_t maxcount)
{
    int self_kind, sep_kind, common_kind;
    void *self_buf, *sep_buf;
    Py_ssize_t self_len, sep_len;
    PyObject *result;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;

    if (PyUnicode_READY(self) == -1)
        return NULL;

    if (substring == NULL) {
        /* Whitespace splitting only needs self: dispatch on its width. */
        const int width = PyUnicode_KIND(self);
        const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

        if (width == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(self))
                return asciilib_rsplit_whitespace(
                    self, PyUnicode_1BYTE_DATA(self), length, maxcount);
            return ucs1lib_rsplit_whitespace(
                self, PyUnicode_1BYTE_DATA(self), length, maxcount);
        }
        if (width == PyUnicode_2BYTE_KIND)
            return ucs2lib_rsplit_whitespace(
                self, PyUnicode_2BYTE_DATA(self), length, maxcount);
        if (width == PyUnicode_4BYTE_KIND)
            return ucs4lib_rsplit_whitespace(
                self, PyUnicode_4BYTE_DATA(self), length, maxcount);
        assert(0);
        return NULL;
    }

    if (PyUnicode_READY(substring) == -1)
        return NULL;

    /* Both operands must use the same character width; the narrower one
       is converted into a temporary buffer of the wider kind. */
    self_kind = PyUnicode_KIND(self);
    sep_kind = PyUnicode_KIND(substring);
    common_kind = (self_kind < sep_kind) ? sep_kind : self_kind;

    if (self_kind == common_kind)
        self_buf = PyUnicode_DATA(self);
    else
        self_buf = _PyUnicode_AsKind(self, common_kind);
    if (self_buf == NULL)
        return NULL;

    if (sep_kind == common_kind)
        sep_buf = PyUnicode_DATA(substring);
    else
        sep_buf = _PyUnicode_AsKind(substring, common_kind);
    if (sep_buf == NULL) {
        if (self_kind != common_kind)
            PyMem_Free(self_buf);
        return NULL;
    }

    self_len = PyUnicode_GET_LENGTH(self);
    sep_len = PyUnicode_GET_LENGTH(substring);

    if (common_kind == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
            result = asciilib_rsplit(
                self, self_buf, self_len, sep_buf, sep_len, maxcount);
        else
            result = ucs1lib_rsplit(
                self, self_buf, self_len, sep_buf, sep_len, maxcount);
    }
    else if (common_kind == PyUnicode_2BYTE_KIND) {
        result = ucs2lib_rsplit(
            self, self_buf, self_len, sep_buf, sep_len, maxcount);
    }
    else if (common_kind == PyUnicode_4BYTE_KIND) {
        result = ucs4lib_rsplit(
            self, self_buf, self_len, sep_buf, sep_len, maxcount);
    }
    else {
        result = NULL;
    }

    /* Only converted buffers are owned by this function. */
    if (self_kind != common_kind)
        PyMem_Free(self_buf);
    if (sep_kind != common_kind)
        PyMem_Free(sep_buf);
    return result;
}
9621
/* Forward a search to the *_find() helper that matches `kind`.

   buf1/len1 and buf2/len2 are the two buffers handed to the helper, in
   that order, followed by `offset`.  str1 and str2 are the objects the
   buffers belong to; they are consulted only to choose the ASCII variant
   for 1-byte data.  Returns whatever the helper returns. */
static Py_ssize_t
anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
{
    if (kind == PyUnicode_1BYTE_KIND) {
        const int both_ascii = PyUnicode_IS_ASCII(str1)
                               && PyUnicode_IS_ASCII(str2);
        if (both_ascii)
            return asciilib_find(buf1, len1, buf2, len2, offset);
        return ucs1lib_find(buf1, len1, buf2, len2, offset);
    }
    if (kind == PyUnicode_2BYTE_KIND)
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
    if (kind == PyUnicode_4BYTE_KIND)
        return ucs4lib_find(buf1, len1, buf2, len2, offset);

    /* Any other kind is a programming error. */
    assert(0);
    return -1;
}
9640
/* Forward a count to the *_count() helper that matches `kind`.

   sbuf/slen and buf1/len1 are the two buffers handed to the helper, in
   that order, followed by `maxcount`.  sstr and str1 are the objects the
   buffers belong to; they are consulted only to choose the ASCII variant
   for 1-byte data.  Returns whatever the helper returns. */
static Py_ssize_t
anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
{
    if (kind == PyUnicode_1BYTE_KIND) {
        const int both_ascii = PyUnicode_IS_ASCII(sstr)
                               && PyUnicode_IS_ASCII(str1);
        if (both_ascii)
            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
    }
    if (kind == PyUnicode_2BYTE_KIND)
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
    if (kind == PyUnicode_4BYTE_KIND)
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);

    /* Any other kind is a programming error. */
    assert(0);
    return 0;
}
9659
/* Return a string in which occurrences of str1 inside self are replaced
   by str2.

   maxcount: maximum number of replacements; a negative value means no
             limit.

   When nothing is replaced, self itself is returned if it is an exact
   str, otherwise a copy of it.  Returns NULL on failure.

   Three strategies are used:
     - str1 and str2 are both one character long: copy self and rewrite
       the matching characters;
     - str1 and str2 have the same length (> 1): copy self and overwrite
       each match in place;
     - the lengths differ: count the matches, allocate the result with
       its final size and assemble it piece by piece.

   sbuf, buf1 and buf2 start out pointing at the data of self, str1 and
   str2.  Whenever one of them is replaced by a temporary buffer from
   _PyUnicode_AsKind(), the matching flag (srelease, release1, release2)
   is set so that the buffer is freed on every exit path. */
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    char *sbuf = PyUnicode_DATA(self);
    char *buf1 = PyUnicode_DATA(str1);
    char *buf2 = PyUnicode_DATA(str2);
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || slen == 0)
        goto nothing;

    if (str1 == str2)
        goto nothing;
    if (skind < kind1)
        /* substring too wide to be present */
        goto nothing;

    if (len1 == len2) {
        Py_ssize_t i;
        /* same length */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2, maxchar;
            int mayshrink, rkind;
            u1 = PyUnicode_READ_CHAR(str1, 0);
            if (!findchar(sbuf, PyUnicode_KIND(self),
                          slen, u1, 1))
                goto nothing;
            u2 = PyUnicode_READ_CHAR(str2, 0);
            maxchar = PyUnicode_MAX_CHAR_VALUE(self);
            /* Replacing u1 with u2 may cause a maxchar reduction in the
               result string. */
            if (u2 > maxchar) {
                maxchar = u2;
                mayshrink = 0;
            }
            else
                mayshrink = maxchar > 127;
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            copy_characters(u, 0, self, 0, slen);
            rkind = PyUnicode_KIND(u);
            for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
                if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
                    if (--maxcount < 0)
                        break;
                    PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
                }
            if (mayshrink) {
                unicode_adjust_maxchar(&u);
                if (u == NULL)
                    goto error;
            }
        } else {
            int rkind = skind;
            char *res;
            PyObject *rstr;
            Py_UCS4 maxchar;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = _PyUnicode_AsKind(str2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                if (release1) {
                    PyMem_Free(buf1);
                    /* Forget the freed buffer right away: if widening
                       self fails below, the error path must not free
                       buf1 a second time. */
                    buf1 = NULL;
                    release1 = 0;
                }
                sbuf = _PyUnicode_AsKind(self, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            maxchar = PyUnicode_MAX_CHAR_VALUE(self);
            maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
            rstr = PyUnicode_New(slen, maxchar);
            if (!rstr)
                goto error;
            res = PyUnicode_DATA(rstr);

            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            while ( --maxcount > 0) {
                /* Search the remainder of self; passing i as the offset
                   makes the returned index relative to the start. */
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }

            u = rstr;
            unicode_adjust_maxchar(&u);
            if (!u)
                goto error;
        }
    } else {

        Py_ssize_t n, i, j, ires;
        Py_ssize_t product, new_size;
        int rkind = skind;
        PyObject *rstr;
        char *res;
        Py_UCS4 maxchar;

        if (kind1 < rkind) {
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            buf2 = _PyUnicode_AsKind(str2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            rkind = kind2;
            sbuf = _PyUnicode_AsKind(self, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            /* buf1 is reassigned immediately after being freed, so the
               error path never sees the stale pointer. */
            if (release1) PyMem_Free(buf1);
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
           PyUnicode_GET_LENGTH(str1))); */
        product = n * (len2-len1);
        if ((product / (len2-len1)) != n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                goto error;
        }
        new_size = slen + product;
        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        maxchar = PyUnicode_MAX_CHAR_VALUE(self);
        maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2));
        rstr = PyUnicode_New(new_size, maxchar);
        if (!rstr)
            goto error;
        res = PyUnicode_DATA(rstr);
        /* i indexes the source (sbuf), ires the result (res). */
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        } else {
            /* interleave */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
        u = rstr;
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    if (PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject *) self;
    }
    return PyUnicode_Copy(self);
  error:
    /* A buffer pointer may be NULL here when the allocation that was
       meant to fill it is the one that failed. */
    if (srelease && sbuf)
        PyMem_FREE(sbuf);
    if (release1 && buf1)
        PyMem_FREE(buf1);
    if (release2 && buf2)
        PyMem_FREE(buf2);
    return NULL;
}
9926
9927/* --- Unicode Object Methods --------------------------------------------- */
9928
9929PyDoc_STRVAR(title__doc__,
9930             "S.title() -> str\n\
9931\n\
9932Return a titlecased version of S, i.e. words start with title case\n\
9933characters, all remaining cased characters have lower case.");
9934
9935static PyObject*
9936unicode_title(PyObject *self)
9937{
9938    return fixup(self, fixtitle);
9939}
9940
9941PyDoc_STRVAR(capitalize__doc__,
9942             "S.capitalize() -> str\n\
9943\n\
9944Return a capitalized version of S, i.e. make the first character\n\
9945have upper case and the rest lower case.");
9946
9947static PyObject*
9948unicode_capitalize(PyObject *self)
9949{
9950    return fixup(self, fixcapitalize);
9951}
9952
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords().

   Splits self with split(self, NULL, -1), replaces every list item by the
   result of fixup(item, fixcapitalize) and joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL as soon as any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *plain = PyList_GET_ITEM(words, pos);
        PyObject *fixed = fixup((PyUnicodeObject *)plain, fixcapitalize);

        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(plain);
        PyList_SET_ITEM(words, pos, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9990
9991/* Argument converter.  Coerces to a single unicode character */
9992
9993static int
9994convert_uc(PyObject *obj, void *addr)
9995{
9996    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
9997    PyObject *uniobj;
9998
9999    uniobj = PyUnicode_FromObject(obj);
10000    if (uniobj == NULL) {
10001        PyErr_SetString(PyExc_TypeError,
10002                        "The fill character cannot be converted to Unicode");
10003        return 0;
10004    }
10005    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10006        PyErr_SetString(PyExc_TypeError,
10007                        "The fill character must be exactly one character long");
10008        Py_DECREF(uniobj);
10009        return 0;
10010    }
10011    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10012    Py_DECREF(uniobj);
10013    return 1;
10014}
10015
10016PyDoc_STRVAR(center__doc__,
10017             "S.center(width[, fillchar]) -> str\n\
10018\n\
10019Return S centered in a string of length width. Padding is\n\
10020done using the specified fill character (default is a space)");
10021
10022static PyObject *
10023unicode_center(PyObject *self, PyObject *args)
10024{
10025    Py_ssize_t marg, left;
10026    Py_ssize_t width;
10027    Py_UCS4 fillchar = ' ';
10028
10029    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10030        return NULL;
10031
10032    if (PyUnicode_READY(self) == -1)
10033        return NULL;
10034
10035    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
10036        Py_INCREF(self);
10037        return (PyObject*) self;
10038    }
10039
10040    marg = width - _PyUnicode_LENGTH(self);
10041    left = marg / 2 + (marg & width & 1);
10042
10043    return pad(self, left, marg - left, fillchar);
10044}
10045
10046#if 0
10047
10048/* This code should go into some future Unicode collation support
10049   module. The basic comparison should compare ordinals on a naive
10050   basis (this is what Java does and thus Jython too). */
10051
10052/* speedy UTF-16 code point order comparison */
10053/* gleaned from: */
10054/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
10055
10056static short utf16Fixup[32] =
10057{
10058    0, 0, 0, 0, 0, 0, 0, 0,
10059    0, 0, 0, 0, 0, 0, 0, 0,
10060    0, 0, 0, 0, 0, 0, 0, 0,
10061    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
10062};
10063
10064static int
10065unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10066{
10067    Py_ssize_t len1, len2;
10068
10069    Py_UNICODE *s1 = str1->str;
10070    Py_UNICODE *s2 = str2->str;
10071
10072    len1 = str1->_base._base.length;
10073    len2 = str2->_base._base.length;
10074
10075    while (len1 > 0 && len2 > 0) {
10076        Py_UNICODE c1, c2;
10077
10078        c1 = *s1++;
10079        c2 = *s2++;
10080
10081        if (c1 > (1<<11) * 26)
10082            c1 += utf16Fixup[c1>>11];
10083        if (c2 > (1<<11) * 26)
10084            c2 += utf16Fixup[c2>>11];
10085        /* now c1 and c2 are in UTF-32-compatible order */
10086
10087        if (c1 != c2)
10088            return (c1 < c2) ? -1 : 1;
10089
10090        len1--; len2--;
10091    }
10092
10093    return (len1 < len2) ? -1 : (len1 != len2);
10094}
10095
10096#else
10097
10098/* This function assumes that str1 and str2 are readied by the caller. */
10099
10100static int
10101unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10102{
10103    int kind1, kind2;
10104    void *data1, *data2;
10105    Py_ssize_t len1, len2, i;
10106
10107    kind1 = PyUnicode_KIND(str1);
10108    kind2 = PyUnicode_KIND(str2);
10109    data1 = PyUnicode_DATA(str1);
10110    data2 = PyUnicode_DATA(str2);
10111    len1 = PyUnicode_GET_LENGTH(str1);
10112    len2 = PyUnicode_GET_LENGTH(str2);
10113
10114    for (i = 0; i < len1 && i < len2; ++i) {
10115        Py_UCS4 c1, c2;
10116        c1 = PyUnicode_READ(kind1, data1, i);
10117        c2 = PyUnicode_READ(kind2, data2, i);
10118
10119        if (c1 != c2)
10120            return (c1 < c2) ? -1 : 1;
10121    }
10122
10123    return (len1 < len2) ? -1 : (len1 != len2);
10124}
10125
10126#endif
10127
10128int
10129PyUnicode_Compare(PyObject *left, PyObject *right)
10130{
10131    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10132        if (PyUnicode_READY(left) == -1 ||
10133            PyUnicode_READY(right) == -1)
10134            return -1;
10135        return unicode_compare((PyUnicodeObject *)left,
10136                               (PyUnicodeObject *)right);
10137    }
10138    PyErr_Format(PyExc_TypeError,
10139                 "Can't compare %.100s and %.100s",
10140                 left->ob_type->tp_name,
10141                 right->ob_type->tp_name);
10142    return -1;
10143}
10144
10145int
10146PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10147{
10148    Py_ssize_t i;
10149    int kind;
10150    void *data;
10151    Py_UCS4 chr;
10152
10153    assert(_PyUnicode_CHECK(uni));
10154    if (PyUnicode_READY(uni) == -1)
10155        return -1;
10156    kind = PyUnicode_KIND(uni);
10157    data = PyUnicode_DATA(uni);
10158    /* Compare Unicode string and source character set string */
10159    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10160        if (chr != str[i])
10161            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10162    /* This check keeps Python strings that end in '\0' from comparing equal
10163     to C strings identical up to that point. */
10164    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10165        return 1; /* uni is longer */
10166    if (str[i])
10167        return -1; /* str is longer */
10168    return 0;
10169}
10170
10171
10172#define TEST_COND(cond)                         \
10173    ((cond) ? Py_True : Py_False)
10174
10175PyObject *
10176PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10177{
10178    int result;
10179
10180    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10181        PyObject *v;
10182        if (PyUnicode_READY(left) == -1 ||
10183            PyUnicode_READY(right) == -1)
10184            return NULL;
10185        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10186            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
10187            if (op == Py_EQ) {
10188                Py_INCREF(Py_False);
10189                return Py_False;
10190            }
10191            if (op == Py_NE) {
10192                Py_INCREF(Py_True);
10193                return Py_True;
10194            }
10195        }
10196        if (left == right)
10197            result = 0;
10198        else
10199            result = unicode_compare((PyUnicodeObject *)left,
10200                                     (PyUnicodeObject *)right);
10201
10202        /* Convert the return value to a Boolean */
10203        switch (op) {
10204        case Py_EQ:
10205            v = TEST_COND(result == 0);
10206            break;
10207        case Py_NE:
10208            v = TEST_COND(result != 0);
10209            break;
10210        case Py_LE:
10211            v = TEST_COND(result <= 0);
10212            break;
10213        case Py_GE:
10214            v = TEST_COND(result >= 0);
10215            break;
10216        case Py_LT:
10217            v = TEST_COND(result == -1);
10218            break;
10219        case Py_GT:
10220            v = TEST_COND(result == 1);
10221            break;
10222        default:
10223            PyErr_BadArgument();
10224            return NULL;
10225        }
10226        Py_INCREF(v);
10227        return v;
10228    }
10229
10230    Py_RETURN_NOTIMPLEMENTED;
10231}
10232
10233int
10234PyUnicode_Contains(PyObject *container, PyObject *element)
10235{
10236    PyObject *str, *sub;
10237    int kind1, kind2, kind;
10238    void *buf1, *buf2;
10239    Py_ssize_t len1, len2;
10240    int result;
10241
10242    /* Coerce the two arguments */
10243    sub = PyUnicode_FromObject(element);
10244    if (!sub) {
10245        PyErr_Format(PyExc_TypeError,
10246                     "'in <string>' requires string as left operand, not %s",
10247                     element->ob_type->tp_name);
10248        return -1;
10249    }
10250    if (PyUnicode_READY(sub) == -1)
10251        return -1;
10252
10253    str = PyUnicode_FromObject(container);
10254    if (!str || PyUnicode_READY(str) == -1) {
10255        Py_DECREF(sub);
10256        return -1;
10257    }
10258
10259    kind1 = PyUnicode_KIND(str);
10260    kind2 = PyUnicode_KIND(sub);
10261    kind = kind1 > kind2 ? kind1 : kind2;
10262    buf1 = PyUnicode_DATA(str);
10263    buf2 = PyUnicode_DATA(sub);
10264    if (kind1 != kind)
10265        buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10266    if (!buf1) {
10267        Py_DECREF(sub);
10268        return -1;
10269    }
10270    if (kind2 != kind)
10271        buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10272    if (!buf2) {
10273        Py_DECREF(sub);
10274        if (kind1 != kind) PyMem_Free(buf1);
10275        return -1;
10276    }
10277    len1 = PyUnicode_GET_LENGTH(str);
10278    len2 = PyUnicode_GET_LENGTH(sub);
10279
10280    switch(kind) {
10281    case PyUnicode_1BYTE_KIND:
10282        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10283        break;
10284    case PyUnicode_2BYTE_KIND:
10285        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10286        break;
10287    case PyUnicode_4BYTE_KIND:
10288        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10289        break;
10290    default:
10291        result = -1;
10292        assert(0);
10293    }
10294
10295    Py_DECREF(str);
10296    Py_DECREF(sub);
10297
10298    if (kind1 != kind)
10299        PyMem_Free(buf1);
10300    if (kind2 != kind)
10301        PyMem_Free(buf2);
10302
10303    return result;
10304}
10305
10306/* Concat to string or Unicode object giving a new Unicode object. */
10307
10308PyObject *
10309PyUnicode_Concat(PyObject *left, PyObject *right)
10310{
10311    PyObject *u = NULL, *v = NULL, *w;
10312    Py_UCS4 maxchar;
10313
10314    /* Coerce the two arguments */
10315    u = PyUnicode_FromObject(left);
10316    if (u == NULL)
10317        goto onError;
10318    v = PyUnicode_FromObject(right);
10319    if (v == NULL)
10320        goto onError;
10321
10322    /* Shortcuts */
10323    if (v == unicode_empty) {
10324        Py_DECREF(v);
10325        return u;
10326    }
10327    if (u == unicode_empty) {
10328        Py_DECREF(u);
10329        return v;
10330    }
10331
10332    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10333    maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
10334
10335    /* Concat the two Unicode strings */
10336    w = PyUnicode_New(
10337        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10338        maxchar);
10339    if (w == NULL)
10340        goto onError;
10341    copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10342    copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
10343    Py_DECREF(u);
10344    Py_DECREF(v);
10345    assert(_PyUnicode_CheckConsistency(w, 1));
10346    return w;
10347
10348  onError:
10349    Py_XDECREF(u);
10350    Py_XDECREF(v);
10351    return NULL;
10352}
10353
10354static void
10355unicode_append_inplace(PyObject **p_left, PyObject *right)
10356{
10357    Py_ssize_t left_len, right_len, new_len;
10358
10359    assert(PyUnicode_IS_READY(*p_left));
10360    assert(PyUnicode_IS_READY(right));
10361
10362    left_len = PyUnicode_GET_LENGTH(*p_left);
10363    right_len = PyUnicode_GET_LENGTH(right);
10364    if (left_len > PY_SSIZE_T_MAX - right_len) {
10365        PyErr_SetString(PyExc_OverflowError,
10366                        "strings are too large to concat");
10367        goto error;
10368    }
10369    new_len = left_len + right_len;
10370
10371    /* Now we own the last reference to 'left', so we can resize it
10372     * in-place.
10373     */
10374    if (unicode_resize(p_left, new_len) != 0) {
10375        /* XXX if _PyUnicode_Resize() fails, 'left' has been
10376         * deallocated so it cannot be put back into
10377         * 'variable'.  The MemoryError is raised when there
10378         * is no value in 'variable', which might (very
10379         * remotely) be a cause of incompatibilities.
10380         */
10381        goto error;
10382    }
10383    /* copy 'right' into the newly allocated area of 'left' */
10384    copy_characters(*p_left, left_len, right, 0, right_len);
10385    _PyUnicode_DIRTY(*p_left);
10386    return;
10387
10388error:
10389    Py_DECREF(*p_left);
10390    *p_left = NULL;
10391}
10392
10393void
10394PyUnicode_Append(PyObject **p_left, PyObject *right)
10395{
10396    PyObject *left, *res;
10397
10398    if (p_left == NULL) {
10399        if (!PyErr_Occurred())
10400            PyErr_BadInternalCall();
10401        return;
10402    }
10403    left = *p_left;
10404    if (right == NULL || !PyUnicode_Check(left)) {
10405        if (!PyErr_Occurred())
10406            PyErr_BadInternalCall();
10407        goto error;
10408    }
10409
10410    if (PyUnicode_READY(left))
10411        goto error;
10412    if (PyUnicode_READY(right))
10413        goto error;
10414
10415    if (PyUnicode_CheckExact(left) && left != unicode_empty
10416        && PyUnicode_CheckExact(right) && right != unicode_empty
10417        && unicode_resizable(left)
10418        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10419            || _PyUnicode_WSTR(left) != NULL))
10420    {
10421        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10422           to change the structure size, but characters are stored just after
10423           the structure, and so it requires to move all characters which is
10424           not so different than duplicating the string. */
10425        if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10426        {
10427            unicode_append_inplace(p_left, right);
10428            if (p_left != NULL)
10429                assert(_PyUnicode_CheckConsistency(*p_left, 1));
10430            return;
10431        }
10432    }
10433
10434    res = PyUnicode_Concat(left, right);
10435    if (res == NULL)
10436        goto error;
10437    Py_DECREF(left);
10438    *p_left = res;
10439    return;
10440
10441error:
10442    Py_DECREF(*p_left);
10443    *p_left = NULL;
10444}
10445
10446void
10447PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10448{
10449    PyUnicode_Append(pleft, right);
10450    Py_XDECREF(right);
10451}
10452
10453PyDoc_STRVAR(count__doc__,
10454             "S.count(sub[, start[, end]]) -> int\n\
10455\n\
10456Return the number of non-overlapping occurrences of substring sub in\n\
10457string S[start:end].  Optional arguments start and end are\n\
10458interpreted as in slice notation.");
10459
10460static PyObject *
10461unicode_count(PyUnicodeObject *self, PyObject *args)
10462{
10463    PyUnicodeObject *substring;
10464    Py_ssize_t start = 0;
10465    Py_ssize_t end = PY_SSIZE_T_MAX;
10466    PyObject *result;
10467    int kind1, kind2, kind;
10468    void *buf1, *buf2;
10469    Py_ssize_t len1, len2, iresult;
10470
10471    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10472                                            &start, &end))
10473        return NULL;
10474
10475    kind1 = PyUnicode_KIND(self);
10476    kind2 = PyUnicode_KIND(substring);
10477    kind = kind1 > kind2 ? kind1 : kind2;
10478    buf1 = PyUnicode_DATA(self);
10479    buf2 = PyUnicode_DATA(substring);
10480    if (kind1 != kind)
10481        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10482    if (!buf1) {
10483        Py_DECREF(substring);
10484        return NULL;
10485    }
10486    if (kind2 != kind)
10487        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10488    if (!buf2) {
10489        Py_DECREF(substring);
10490        if (kind1 != kind) PyMem_Free(buf1);
10491        return NULL;
10492    }
10493    len1 = PyUnicode_GET_LENGTH(self);
10494    len2 = PyUnicode_GET_LENGTH(substring);
10495
10496    ADJUST_INDICES(start, end, len1);
10497    switch(kind) {
10498    case PyUnicode_1BYTE_KIND:
10499        iresult = ucs1lib_count(
10500            ((Py_UCS1*)buf1) + start, end - start,
10501            buf2, len2, PY_SSIZE_T_MAX
10502            );
10503        break;
10504    case PyUnicode_2BYTE_KIND:
10505        iresult = ucs2lib_count(
10506            ((Py_UCS2*)buf1) + start, end - start,
10507            buf2, len2, PY_SSIZE_T_MAX
10508            );
10509        break;
10510    case PyUnicode_4BYTE_KIND:
10511        iresult = ucs4lib_count(
10512            ((Py_UCS4*)buf1) + start, end - start,
10513            buf2, len2, PY_SSIZE_T_MAX
10514            );
10515        break;
10516    default:
10517        assert(0); iresult = 0;
10518    }
10519
10520    result = PyLong_FromSsize_t(iresult);
10521
10522    if (kind1 != kind)
10523        PyMem_Free(buf1);
10524    if (kind2 != kind)
10525        PyMem_Free(buf2);
10526
10527    Py_DECREF(substring);
10528
10529    return result;
10530}
10531
10532PyDoc_STRVAR(encode__doc__,
10533             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10534\n\
10535Encode S using the codec registered for encoding. Default encoding\n\
10536is 'utf-8'. errors may be given to set a different error\n\
10537handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10538a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10539'xmlcharrefreplace' as well as any other name registered with\n\
10540codecs.register_error that can handle UnicodeEncodeErrors.");
10541
10542static PyObject *
10543unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
10544{
10545    static char *kwlist[] = {"encoding", "errors", 0};
10546    char *encoding = NULL;
10547    char *errors = NULL;
10548
10549    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10550                                     kwlist, &encoding, &errors))
10551        return NULL;
10552    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
10553}
10554
10555PyDoc_STRVAR(expandtabs__doc__,
10556             "S.expandtabs([tabsize]) -> str\n\
10557\n\
10558Return a copy of S where all tab characters are expanded using spaces.\n\
10559If tabsize is not given, a tab size of 8 characters is assumed.");
10560
10561static PyObject*
10562unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10563{
10564    Py_ssize_t i, j, line_pos, src_len, incr;
10565    Py_UCS4 ch;
10566    PyObject *u;
10567    void *src_data, *dest_data;
10568    int tabsize = 8;
10569    int kind;
10570    int found;
10571
10572    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10573        return NULL;
10574
10575    if (PyUnicode_READY(self) == -1)
10576        return NULL;
10577
10578    /* First pass: determine size of output string */
10579    src_len = PyUnicode_GET_LENGTH(self);
10580    i = j = line_pos = 0;
10581    kind = PyUnicode_KIND(self);
10582    src_data = PyUnicode_DATA(self);
10583    found = 0;
10584    for (; i < src_len; i++) {
10585        ch = PyUnicode_READ(kind, src_data, i);
10586        if (ch == '\t') {
10587            found = 1;
10588            if (tabsize > 0) {
10589                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10590                if (j > PY_SSIZE_T_MAX - incr)
10591                    goto overflow;
10592                line_pos += incr;
10593                j += incr;
10594            }
10595        }
10596        else {
10597            if (j > PY_SSIZE_T_MAX - 1)
10598                goto overflow;
10599            line_pos++;
10600            j++;
10601            if (ch == '\n' || ch == '\r')
10602                line_pos = 0;
10603        }
10604    }
10605    if (!found && PyUnicode_CheckExact(self)) {
10606        Py_INCREF((PyObject *) self);
10607        return (PyObject *) self;
10608    }
10609
10610    /* Second pass: create output string and fill it */
10611    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10612    if (!u)
10613        return NULL;
10614    dest_data = PyUnicode_DATA(u);
10615
10616    i = j = line_pos = 0;
10617
10618    for (; i < src_len; i++) {
10619        ch = PyUnicode_READ(kind, src_data, i);
10620        if (ch == '\t') {
10621            if (tabsize > 0) {
10622                incr = tabsize - (line_pos % tabsize);
10623                line_pos += incr;
10624                while (incr--) {
10625                    PyUnicode_WRITE(kind, dest_data, j, ' ');
10626                    j++;
10627                }
10628            }
10629        }
10630        else {
10631            line_pos++;
10632            PyUnicode_WRITE(kind, dest_data, j, ch);
10633            j++;
10634            if (ch == '\n' || ch == '\r')
10635                line_pos = 0;
10636        }
10637    }
10638    assert (j == PyUnicode_GET_LENGTH(u));
10639#ifndef DONT_MAKE_RESULT_READY
10640    if (_PyUnicode_READY_REPLACE(&u)) {
10641        Py_DECREF(u);
10642        return NULL;
10643    }
10644#endif
10645    assert(_PyUnicode_CheckConsistency(u, 1));
10646    return (PyObject*) u;
10647
10648  overflow:
10649    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10650    return NULL;
10651}
10652
10653PyDoc_STRVAR(find__doc__,
10654             "S.find(sub[, start[, end]]) -> int\n\
10655\n\
10656Return the lowest index in S where substring sub is found,\n\
10657such that sub is contained within S[start:end].  Optional\n\
10658arguments start and end are interpreted as in slice notation.\n\
10659\n\
10660Return -1 on failure.");
10661
10662static PyObject *
10663unicode_find(PyObject *self, PyObject *args)
10664{
10665    PyUnicodeObject *substring;
10666    Py_ssize_t start;
10667    Py_ssize_t end;
10668    Py_ssize_t result;
10669
10670    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10671                                            &start, &end))
10672        return NULL;
10673
10674    if (PyUnicode_READY(self) == -1)
10675        return NULL;
10676    if (PyUnicode_READY(substring) == -1)
10677        return NULL;
10678
10679    result = any_find_slice(1,
10680        self, (PyObject*)substring, start, end
10681        );
10682
10683    Py_DECREF(substring);
10684
10685    if (result == -2)
10686        return NULL;
10687
10688    return PyLong_FromSsize_t(result);
10689}
10690
10691static PyObject *
10692unicode_getitem(PyObject *self, Py_ssize_t index)
10693{
10694    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10695    if (ch == (Py_UCS4)-1)
10696        return NULL;
10697    return PyUnicode_FromOrdinal(ch);
10698}
10699
10700/* Believe it or not, this produces the same value for ASCII strings
10701   as bytes_hash(). */
10702static Py_hash_t
10703unicode_hash(PyUnicodeObject *self)
10704{
10705    Py_ssize_t len;
10706    Py_uhash_t x;
10707
10708    if (_PyUnicode_HASH(self) != -1)
10709        return _PyUnicode_HASH(self);
10710    if (PyUnicode_READY(self) == -1)
10711        return -1;
10712    len = PyUnicode_GET_LENGTH(self);
10713
10714    /* The hash function as a macro, gets expanded three times below. */
10715#define HASH(P) \
10716    x = (Py_uhash_t)*P << 7; \
10717    while (--len >= 0) \
10718        x = (1000003*x) ^ (Py_uhash_t)*P++;
10719
10720    switch (PyUnicode_KIND(self)) {
10721    case PyUnicode_1BYTE_KIND: {
10722        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10723        HASH(c);
10724        break;
10725    }
10726    case PyUnicode_2BYTE_KIND: {
10727        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10728        HASH(s);
10729        break;
10730    }
10731    default: {
10732        Py_UCS4 *l;
10733        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10734               "Impossible switch case in unicode_hash");
10735        l = PyUnicode_4BYTE_DATA(self);
10736        HASH(l);
10737        break;
10738    }
10739    }
10740    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10741
10742    if (x == -1)
10743        x = -2;
10744    _PyUnicode_HASH(self) = x;
10745    return x;
10746}
10747#undef HASH
10748
10749PyDoc_STRVAR(index__doc__,
10750             "S.index(sub[, start[, end]]) -> int\n\
10751\n\
10752Like S.find() but raise ValueError when the substring is not found.");
10753
10754static PyObject *
10755unicode_index(PyObject *self, PyObject *args)
10756{
10757    Py_ssize_t result;
10758    PyUnicodeObject *substring;
10759    Py_ssize_t start;
10760    Py_ssize_t end;
10761
10762    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10763                                            &start, &end))
10764        return NULL;
10765
10766    if (PyUnicode_READY(self) == -1)
10767        return NULL;
10768    if (PyUnicode_READY(substring) == -1)
10769        return NULL;
10770
10771    result = any_find_slice(1,
10772        self, (PyObject*)substring, start, end
10773        );
10774
10775    Py_DECREF(substring);
10776
10777    if (result == -2)
10778        return NULL;
10779
10780    if (result < 0) {
10781        PyErr_SetString(PyExc_ValueError, "substring not found");
10782        return NULL;
10783    }
10784
10785    return PyLong_FromSsize_t(result);
10786}
10787
10788PyDoc_STRVAR(islower__doc__,
10789             "S.islower() -> bool\n\
10790\n\
10791Return True if all cased characters in S are lowercase and there is\n\
10792at least one cased character in S, False otherwise.");
10793
10794static PyObject*
10795unicode_islower(PyUnicodeObject *self)
10796{
10797    Py_ssize_t i, length;
10798    int kind;
10799    void *data;
10800    int cased;
10801
10802    if (PyUnicode_READY(self) == -1)
10803        return NULL;
10804    length = PyUnicode_GET_LENGTH(self);
10805    kind = PyUnicode_KIND(self);
10806    data = PyUnicode_DATA(self);
10807
10808    /* Shortcut for single character strings */
10809    if (length == 1)
10810        return PyBool_FromLong(
10811            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
10812
10813    /* Special case for empty strings */
10814    if (length == 0)
10815        return PyBool_FromLong(0);
10816
10817    cased = 0;
10818    for (i = 0; i < length; i++) {
10819        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10820
10821        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10822            return PyBool_FromLong(0);
10823        else if (!cased && Py_UNICODE_ISLOWER(ch))
10824            cased = 1;
10825    }
10826    return PyBool_FromLong(cased);
10827}
10828
10829PyDoc_STRVAR(isupper__doc__,
10830             "S.isupper() -> bool\n\
10831\n\
10832Return True if all cased characters in S are uppercase and there is\n\
10833at least one cased character in S, False otherwise.");
10834
10835static PyObject*
10836unicode_isupper(PyUnicodeObject *self)
10837{
10838    Py_ssize_t i, length;
10839    int kind;
10840    void *data;
10841    int cased;
10842
10843    if (PyUnicode_READY(self) == -1)
10844        return NULL;
10845    length = PyUnicode_GET_LENGTH(self);
10846    kind = PyUnicode_KIND(self);
10847    data = PyUnicode_DATA(self);
10848
10849    /* Shortcut for single character strings */
10850    if (length == 1)
10851        return PyBool_FromLong(
10852            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
10853
10854    /* Special case for empty strings */
10855    if (length == 0)
10856        return PyBool_FromLong(0);
10857
10858    cased = 0;
10859    for (i = 0; i < length; i++) {
10860        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10861
10862        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10863            return PyBool_FromLong(0);
10864        else if (!cased && Py_UNICODE_ISUPPER(ch))
10865            cased = 1;
10866    }
10867    return PyBool_FromLong(cased);
10868}
10869
10870PyDoc_STRVAR(istitle__doc__,
10871             "S.istitle() -> bool\n\
10872\n\
10873Return True if S is a titlecased string and there is at least one\n\
10874character in S, i.e. upper- and titlecase characters may only\n\
10875follow uncased characters and lowercase characters only cased ones.\n\
10876Return False otherwise.");
10877
10878static PyObject*
10879unicode_istitle(PyUnicodeObject *self)
10880{
10881    Py_ssize_t i, length;
10882    int kind;
10883    void *data;
10884    int cased, previous_is_cased;
10885
10886    if (PyUnicode_READY(self) == -1)
10887        return NULL;
10888    length = PyUnicode_GET_LENGTH(self);
10889    kind = PyUnicode_KIND(self);
10890    data = PyUnicode_DATA(self);
10891
10892    /* Shortcut for single character strings */
10893    if (length == 1) {
10894        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10895        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10896                               (Py_UNICODE_ISUPPER(ch) != 0));
10897    }
10898
10899    /* Special case for empty strings */
10900    if (length == 0)
10901        return PyBool_FromLong(0);
10902
10903    cased = 0;
10904    previous_is_cased = 0;
10905    for (i = 0; i < length; i++) {
10906        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10907
10908        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10909            if (previous_is_cased)
10910                return PyBool_FromLong(0);
10911            previous_is_cased = 1;
10912            cased = 1;
10913        }
10914        else if (Py_UNICODE_ISLOWER(ch)) {
10915            if (!previous_is_cased)
10916                return PyBool_FromLong(0);
10917            previous_is_cased = 1;
10918            cased = 1;
10919        }
10920        else
10921            previous_is_cased = 0;
10922    }
10923    return PyBool_FromLong(cased);
10924}
10925
10926PyDoc_STRVAR(isspace__doc__,
10927             "S.isspace() -> bool\n\
10928\n\
10929Return True if all characters in S are whitespace\n\
10930and there is at least one character in S, False otherwise.");
10931
10932static PyObject*
10933unicode_isspace(PyUnicodeObject *self)
10934{
10935    Py_ssize_t i, length;
10936    int kind;
10937    void *data;
10938
10939    if (PyUnicode_READY(self) == -1)
10940        return NULL;
10941    length = PyUnicode_GET_LENGTH(self);
10942    kind = PyUnicode_KIND(self);
10943    data = PyUnicode_DATA(self);
10944
10945    /* Shortcut for single character strings */
10946    if (length == 1)
10947        return PyBool_FromLong(
10948            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
10949
10950    /* Special case for empty strings */
10951    if (length == 0)
10952        return PyBool_FromLong(0);
10953
10954    for (i = 0; i < length; i++) {
10955        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10956        if (!Py_UNICODE_ISSPACE(ch))
10957            return PyBool_FromLong(0);
10958    }
10959    return PyBool_FromLong(1);
10960}
10961
10962PyDoc_STRVAR(isalpha__doc__,
10963             "S.isalpha() -> bool\n\
10964\n\
10965Return True if all characters in S are alphabetic\n\
10966and there is at least one character in S, False otherwise.");
10967
10968static PyObject*
10969unicode_isalpha(PyUnicodeObject *self)
10970{
10971    Py_ssize_t i, length;
10972    int kind;
10973    void *data;
10974
10975    if (PyUnicode_READY(self) == -1)
10976        return NULL;
10977    length = PyUnicode_GET_LENGTH(self);
10978    kind = PyUnicode_KIND(self);
10979    data = PyUnicode_DATA(self);
10980
10981    /* Shortcut for single character strings */
10982    if (length == 1)
10983        return PyBool_FromLong(
10984            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
10985
10986    /* Special case for empty strings */
10987    if (length == 0)
10988        return PyBool_FromLong(0);
10989
10990    for (i = 0; i < length; i++) {
10991        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
10992            return PyBool_FromLong(0);
10993    }
10994    return PyBool_FromLong(1);
10995}
10996
10997PyDoc_STRVAR(isalnum__doc__,
10998             "S.isalnum() -> bool\n\
10999\n\
11000Return True if all characters in S are alphanumeric\n\
11001and there is at least one character in S, False otherwise.");
11002
11003static PyObject*
11004unicode_isalnum(PyUnicodeObject *self)
11005{
11006    int kind;
11007    void *data;
11008    Py_ssize_t len, i;
11009
11010    if (PyUnicode_READY(self) == -1)
11011        return NULL;
11012
11013    kind = PyUnicode_KIND(self);
11014    data = PyUnicode_DATA(self);
11015    len = PyUnicode_GET_LENGTH(self);
11016
11017    /* Shortcut for single character strings */
11018    if (len == 1) {
11019        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11020        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11021    }
11022
11023    /* Special case for empty strings */
11024    if (len == 0)
11025        return PyBool_FromLong(0);
11026
11027    for (i = 0; i < len; i++) {
11028        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11029        if (!Py_UNICODE_ISALNUM(ch))
11030            return PyBool_FromLong(0);
11031    }
11032    return PyBool_FromLong(1);
11033}
11034
11035PyDoc_STRVAR(isdecimal__doc__,
11036             "S.isdecimal() -> bool\n\
11037\n\
11038Return True if there are only decimal characters in S,\n\
11039False otherwise.");
11040
11041static PyObject*
11042unicode_isdecimal(PyUnicodeObject *self)
11043{
11044    Py_ssize_t i, length;
11045    int kind;
11046    void *data;
11047
11048    if (PyUnicode_READY(self) == -1)
11049        return NULL;
11050    length = PyUnicode_GET_LENGTH(self);
11051    kind = PyUnicode_KIND(self);
11052    data = PyUnicode_DATA(self);
11053
11054    /* Shortcut for single character strings */
11055    if (length == 1)
11056        return PyBool_FromLong(
11057            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11058
11059    /* Special case for empty strings */
11060    if (length == 0)
11061        return PyBool_FromLong(0);
11062
11063    for (i = 0; i < length; i++) {
11064        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11065            return PyBool_FromLong(0);
11066    }
11067    return PyBool_FromLong(1);
11068}
11069
11070PyDoc_STRVAR(isdigit__doc__,
11071             "S.isdigit() -> bool\n\
11072\n\
11073Return True if all characters in S are digits\n\
11074and there is at least one character in S, False otherwise.");
11075
11076static PyObject*
11077unicode_isdigit(PyUnicodeObject *self)
11078{
11079    Py_ssize_t i, length;
11080    int kind;
11081    void *data;
11082
11083    if (PyUnicode_READY(self) == -1)
11084        return NULL;
11085    length = PyUnicode_GET_LENGTH(self);
11086    kind = PyUnicode_KIND(self);
11087    data = PyUnicode_DATA(self);
11088
11089    /* Shortcut for single character strings */
11090    if (length == 1) {
11091        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11092        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11093    }
11094
11095    /* Special case for empty strings */
11096    if (length == 0)
11097        return PyBool_FromLong(0);
11098
11099    for (i = 0; i < length; i++) {
11100        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11101            return PyBool_FromLong(0);
11102    }
11103    return PyBool_FromLong(1);
11104}
11105
11106PyDoc_STRVAR(isnumeric__doc__,
11107             "S.isnumeric() -> bool\n\
11108\n\
11109Return True if there are only numeric characters in S,\n\
11110False otherwise.");
11111
11112static PyObject*
11113unicode_isnumeric(PyUnicodeObject *self)
11114{
11115    Py_ssize_t i, length;
11116    int kind;
11117    void *data;
11118
11119    if (PyUnicode_READY(self) == -1)
11120        return NULL;
11121    length = PyUnicode_GET_LENGTH(self);
11122    kind = PyUnicode_KIND(self);
11123    data = PyUnicode_DATA(self);
11124
11125    /* Shortcut for single character strings */
11126    if (length == 1)
11127        return PyBool_FromLong(
11128            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11129
11130    /* Special case for empty strings */
11131    if (length == 0)
11132        return PyBool_FromLong(0);
11133
11134    for (i = 0; i < length; i++) {
11135        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11136            return PyBool_FromLong(0);
11137    }
11138    return PyBool_FromLong(1);
11139}
11140
11141int
11142PyUnicode_IsIdentifier(PyObject *self)
11143{
11144    int kind;
11145    void *data;
11146    Py_ssize_t i;
11147    Py_UCS4 first;
11148
11149    if (PyUnicode_READY(self) == -1) {
11150        Py_FatalError("identifier not ready");
11151        return 0;
11152    }
11153
11154    /* Special case for empty strings */
11155    if (PyUnicode_GET_LENGTH(self) == 0)
11156        return 0;
11157    kind = PyUnicode_KIND(self);
11158    data = PyUnicode_DATA(self);
11159
11160    /* PEP 3131 says that the first character must be in
11161       XID_Start and subsequent characters in XID_Continue,
11162       and for the ASCII range, the 2.x rules apply (i.e
11163       start with letters and underscore, continue with
11164       letters, digits, underscore). However, given the current
11165       definition of XID_Start and XID_Continue, it is sufficient
11166       to check just for these, except that _ must be allowed
11167       as starting an identifier.  */
11168    first = PyUnicode_READ(kind, data, 0);
11169    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11170        return 0;
11171
11172    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11173        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11174            return 0;
11175    return 1;
11176}
11177
11178PyDoc_STRVAR(isidentifier__doc__,
11179             "S.isidentifier() -> bool\n\
11180\n\
11181Return True if S is a valid identifier according\n\
11182to the language definition.");
11183
11184static PyObject*
11185unicode_isidentifier(PyObject *self)
11186{
11187    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11188}
11189
11190PyDoc_STRVAR(isprintable__doc__,
11191             "S.isprintable() -> bool\n\
11192\n\
11193Return True if all characters in S are considered\n\
11194printable in repr() or S is empty, False otherwise.");
11195
11196static PyObject*
11197unicode_isprintable(PyObject *self)
11198{
11199    Py_ssize_t i, length;
11200    int kind;
11201    void *data;
11202
11203    if (PyUnicode_READY(self) == -1)
11204        return NULL;
11205    length = PyUnicode_GET_LENGTH(self);
11206    kind = PyUnicode_KIND(self);
11207    data = PyUnicode_DATA(self);
11208
11209    /* Shortcut for single character strings */
11210    if (length == 1)
11211        return PyBool_FromLong(
11212            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11213
11214    for (i = 0; i < length; i++) {
11215        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11216            Py_RETURN_FALSE;
11217        }
11218    }
11219    Py_RETURN_TRUE;
11220}
11221
11222PyDoc_STRVAR(join__doc__,
11223             "S.join(iterable) -> str\n\
11224\n\
11225Return a string which is the concatenation of the strings in the\n\
11226iterable.  The separator between elements is S.");
11227
11228static PyObject*
11229unicode_join(PyObject *self, PyObject *data)
11230{
11231    return PyUnicode_Join(self, data);
11232}
11233
11234static Py_ssize_t
11235unicode_length(PyUnicodeObject *self)
11236{
11237    if (PyUnicode_READY(self) == -1)
11238        return -1;
11239    return PyUnicode_GET_LENGTH(self);
11240}
11241
11242PyDoc_STRVAR(ljust__doc__,
11243             "S.ljust(width[, fillchar]) -> str\n\
11244\n\
11245Return S left-justified in a Unicode string of length width. Padding is\n\
11246done using the specified fill character (default is a space).");
11247
11248static PyObject *
11249unicode_ljust(PyObject *self, PyObject *args)
11250{
11251    Py_ssize_t width;
11252    Py_UCS4 fillchar = ' ';
11253
11254    if (PyUnicode_READY(self) == -1)
11255        return NULL;
11256
11257    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11258        return NULL;
11259
11260    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11261        Py_INCREF(self);
11262        return (PyObject*) self;
11263    }
11264
11265    return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
11266}
11267
11268PyDoc_STRVAR(lower__doc__,
11269             "S.lower() -> str\n\
11270\n\
11271Return a copy of the string S converted to lowercase.");
11272
11273static PyObject*
11274unicode_lower(PyObject *self)
11275{
11276    return fixup(self, fixlower);
11277}
11278
11279#define LEFTSTRIP 0
11280#define RIGHTSTRIP 1
11281#define BOTHSTRIP 2
11282
11283/* Arrays indexed by above */
11284static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
11285
11286#define STRIPNAME(i) (stripformat[i]+3)
11287
11288/* externally visible for str.strip(unicode) */
11289PyObject *
11290_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11291{
11292    void *data;
11293    int kind;
11294    Py_ssize_t i, j, len;
11295    BLOOM_MASK sepmask;
11296
11297    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11298        return NULL;
11299
11300    kind = PyUnicode_KIND(self);
11301    data = PyUnicode_DATA(self);
11302    len = PyUnicode_GET_LENGTH(self);
11303    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11304                              PyUnicode_DATA(sepobj),
11305                              PyUnicode_GET_LENGTH(sepobj));
11306
11307    i = 0;
11308    if (striptype != RIGHTSTRIP) {
11309        while (i < len &&
11310               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11311            i++;
11312        }
11313    }
11314
11315    j = len;
11316    if (striptype != LEFTSTRIP) {
11317        do {
11318            j--;
11319        } while (j >= i &&
11320                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11321        j++;
11322    }
11323
11324    return PyUnicode_Substring((PyObject*)self, i, j);
11325}
11326
11327PyObject*
11328PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11329{
11330    unsigned char *data;
11331    int kind;
11332    Py_ssize_t length;
11333
11334    if (PyUnicode_READY(self) == -1)
11335        return NULL;
11336
11337    end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11338
11339    if (start == 0 && end == PyUnicode_GET_LENGTH(self))
11340    {
11341        if (PyUnicode_CheckExact(self)) {
11342            Py_INCREF(self);
11343            return self;
11344        }
11345        else
11346            return PyUnicode_Copy(self);
11347    }
11348
11349    length = end - start;
11350    if (length == 1)
11351        return unicode_getitem(self, start);
11352
11353    if (start < 0 || end < 0) {
11354        PyErr_SetString(PyExc_IndexError, "string index out of range");
11355        return NULL;
11356    }
11357
11358    if (PyUnicode_IS_ASCII(self)) {
11359        kind = PyUnicode_KIND(self);
11360        data = PyUnicode_1BYTE_DATA(self);
11361        return unicode_fromascii(data + start, length);
11362    }
11363    else {
11364        kind = PyUnicode_KIND(self);
11365        data = PyUnicode_1BYTE_DATA(self);
11366        return PyUnicode_FromKindAndData(kind,
11367                                         data + kind * start,
11368                                         length);
11369    }
11370}
11371
11372static PyObject *
11373do_strip(PyUnicodeObject *self, int striptype)
11374{
11375    int kind;
11376    void *data;
11377    Py_ssize_t len, i, j;
11378
11379    if (PyUnicode_READY(self) == -1)
11380        return NULL;
11381
11382    kind = PyUnicode_KIND(self);
11383    data = PyUnicode_DATA(self);
11384    len = PyUnicode_GET_LENGTH(self);
11385
11386    i = 0;
11387    if (striptype != RIGHTSTRIP) {
11388        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11389            i++;
11390        }
11391    }
11392
11393    j = len;
11394    if (striptype != LEFTSTRIP) {
11395        do {
11396            j--;
11397        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11398        j++;
11399    }
11400
11401    return PyUnicode_Substring((PyObject*)self, i, j);
11402}
11403
11404
11405static PyObject *
11406do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11407{
11408    PyObject *sep = NULL;
11409
11410    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11411        return NULL;
11412
11413    if (sep != NULL && sep != Py_None) {
11414        if (PyUnicode_Check(sep))
11415            return _PyUnicode_XStrip(self, striptype, sep);
11416        else {
11417            PyErr_Format(PyExc_TypeError,
11418                         "%s arg must be None or str",
11419                         STRIPNAME(striptype));
11420            return NULL;
11421        }
11422    }
11423
11424    return do_strip(self, striptype);
11425}
11426
11427
11428PyDoc_STRVAR(strip__doc__,
11429             "S.strip([chars]) -> str\n\
11430\n\
11431Return a copy of the string S with leading and trailing\n\
11432whitespace removed.\n\
11433If chars is given and not None, remove characters in chars instead.");
11434
11435static PyObject *
11436unicode_strip(PyUnicodeObject *self, PyObject *args)
11437{
11438    if (PyTuple_GET_SIZE(args) == 0)
11439        return do_strip(self, BOTHSTRIP); /* Common case */
11440    else
11441        return do_argstrip(self, BOTHSTRIP, args);
11442}
11443
11444
11445PyDoc_STRVAR(lstrip__doc__,
11446             "S.lstrip([chars]) -> str\n\
11447\n\
11448Return a copy of the string S with leading whitespace removed.\n\
11449If chars is given and not None, remove characters in chars instead.");
11450
11451static PyObject *
11452unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11453{
11454    if (PyTuple_GET_SIZE(args) == 0)
11455        return do_strip(self, LEFTSTRIP); /* Common case */
11456    else
11457        return do_argstrip(self, LEFTSTRIP, args);
11458}
11459
11460
11461PyDoc_STRVAR(rstrip__doc__,
11462             "S.rstrip([chars]) -> str\n\
11463\n\
11464Return a copy of the string S with trailing whitespace removed.\n\
11465If chars is given and not None, remove characters in chars instead.");
11466
11467static PyObject *
11468unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11469{
11470    if (PyTuple_GET_SIZE(args) == 0)
11471        return do_strip(self, RIGHTSTRIP); /* Common case */
11472    else
11473        return do_argstrip(self, RIGHTSTRIP, args);
11474}
11475
11476
11477static PyObject*
11478unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
11479{
11480    PyUnicodeObject *u;
11481    Py_ssize_t nchars, n;
11482
11483    if (len < 1) {
11484        Py_INCREF(unicode_empty);
11485        return unicode_empty;
11486    }
11487
11488    if (len == 1 && PyUnicode_CheckExact(str)) {
11489        /* no repeat, return original string */
11490        Py_INCREF(str);
11491        return (PyObject*) str;
11492    }
11493
11494    if (PyUnicode_READY(str) == -1)
11495        return NULL;
11496
11497    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11498        PyErr_SetString(PyExc_OverflowError,
11499                        "repeated string is too long");
11500        return NULL;
11501    }
11502    nchars = len * PyUnicode_GET_LENGTH(str);
11503
11504    u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11505    if (!u)
11506        return NULL;
11507    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11508
11509    if (PyUnicode_GET_LENGTH(str) == 1) {
11510        const int kind = PyUnicode_KIND(str);
11511        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11512        void *to = PyUnicode_DATA(u);
11513        if (kind == PyUnicode_1BYTE_KIND)
11514            memset(to, (unsigned char)fill_char, len);
11515        else {
11516            for (n = 0; n < len; ++n)
11517                PyUnicode_WRITE(kind, to, n, fill_char);
11518        }
11519    }
11520    else {
11521        /* number of characters copied this far */
11522        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11523        const Py_ssize_t char_size = PyUnicode_KIND(str);
11524        char *to = (char *) PyUnicode_DATA(u);
11525        Py_MEMCPY(to, PyUnicode_DATA(str),
11526                  PyUnicode_GET_LENGTH(str) * char_size);
11527        while (done < nchars) {
11528            n = (done <= nchars-done) ? done : nchars-done;
11529            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11530            done += n;
11531        }
11532    }
11533
11534    assert(_PyUnicode_CheckConsistency(u, 1));
11535    return (PyObject*) u;
11536}
11537
11538PyObject *
11539PyUnicode_Replace(PyObject *obj,
11540                  PyObject *subobj,
11541                  PyObject *replobj,
11542                  Py_ssize_t maxcount)
11543{
11544    PyObject *self;
11545    PyObject *str1;
11546    PyObject *str2;
11547    PyObject *result;
11548
11549    self = PyUnicode_FromObject(obj);
11550    if (self == NULL || PyUnicode_READY(self) == -1)
11551        return NULL;
11552    str1 = PyUnicode_FromObject(subobj);
11553    if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11554        Py_DECREF(self);
11555        return NULL;
11556    }
11557    str2 = PyUnicode_FromObject(replobj);
11558    if (str2 == NULL || PyUnicode_READY(str2)) {
11559        Py_DECREF(self);
11560        Py_DECREF(str1);
11561        return NULL;
11562    }
11563    result = replace(self, str1, str2, maxcount);
11564    Py_DECREF(self);
11565    Py_DECREF(str1);
11566    Py_DECREF(str2);
11567    return result;
11568}
11569
11570PyDoc_STRVAR(replace__doc__,
11571             "S.replace(old, new[, count]) -> str\n\
11572\n\
11573Return a copy of S with all occurrences of substring\n\
11574old replaced by new.  If the optional argument count is\n\
11575given, only the first count occurrences are replaced.");
11576
11577static PyObject*
11578unicode_replace(PyObject *self, PyObject *args)
11579{
11580    PyObject *str1;
11581    PyObject *str2;
11582    Py_ssize_t maxcount = -1;
11583    PyObject *result;
11584
11585    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11586        return NULL;
11587    if (!PyUnicode_READY(self) == -1)
11588        return NULL;
11589    str1 = PyUnicode_FromObject(str1);
11590    if (str1 == NULL || PyUnicode_READY(str1) == -1)
11591        return NULL;
11592    str2 = PyUnicode_FromObject(str2);
11593    if (str2 == NULL || PyUnicode_READY(str2) == -1) {
11594        Py_DECREF(str1);
11595        return NULL;
11596    }
11597
11598    result = replace(self, str1, str2, maxcount);
11599
11600    Py_DECREF(str1);
11601    Py_DECREF(str2);
11602    return result;
11603}
11604
11605static PyObject *
11606unicode_repr(PyObject *unicode)
11607{
11608    PyObject *repr;
11609    Py_ssize_t isize;
11610    Py_ssize_t osize, squote, dquote, i, o;
11611    Py_UCS4 max, quote;
11612    int ikind, okind;
11613    void *idata, *odata;
11614
11615    if (PyUnicode_READY(unicode) == -1)
11616        return NULL;
11617
11618    isize = PyUnicode_GET_LENGTH(unicode);
11619    idata = PyUnicode_DATA(unicode);
11620
11621    /* Compute length of output, quote characters, and
11622       maximum character */
11623    osize = 2; /* quotes */
11624    max = 127;
11625    squote = dquote = 0;
11626    ikind = PyUnicode_KIND(unicode);
11627    for (i = 0; i < isize; i++) {
11628        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11629        switch (ch) {
11630        case '\'': squote++; osize++; break;
11631        case '"':  dquote++; osize++; break;
11632        case '\\': case '\t': case '\r': case '\n':
11633            osize += 2; break;
11634        default:
11635            /* Fast-path ASCII */
11636            if (ch < ' ' || ch == 0x7f)
11637                osize += 4; /* \xHH */
11638            else if (ch < 0x7f)
11639                osize++;
11640            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11641                osize++;
11642                max = ch > max ? ch : max;
11643            }
11644            else if (ch < 0x100)
11645                osize += 4; /* \xHH */
11646            else if (ch < 0x10000)
11647                osize += 6; /* \uHHHH */
11648            else
11649                osize += 10; /* \uHHHHHHHH */
11650        }
11651    }
11652
11653    quote = '\'';
11654    if (squote) {
11655        if (dquote)
11656            /* Both squote and dquote present. Use squote,
11657               and escape them */
11658            osize += squote;
11659        else
11660            quote = '"';
11661    }
11662
11663    repr = PyUnicode_New(osize, max);
11664    if (repr == NULL)
11665        return NULL;
11666    okind = PyUnicode_KIND(repr);
11667    odata = PyUnicode_DATA(repr);
11668
11669    PyUnicode_WRITE(okind, odata, 0, quote);
11670    PyUnicode_WRITE(okind, odata, osize-1, quote);
11671
11672    for (i = 0, o = 1; i < isize; i++) {
11673        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11674
11675        /* Escape quotes and backslashes */
11676        if ((ch == quote) || (ch == '\\')) {
11677            PyUnicode_WRITE(okind, odata, o++, '\\');
11678            PyUnicode_WRITE(okind, odata, o++, ch);
11679            continue;
11680        }
11681
11682        /* Map special whitespace to '\t', \n', '\r' */
11683        if (ch == '\t') {
11684            PyUnicode_WRITE(okind, odata, o++, '\\');
11685            PyUnicode_WRITE(okind, odata, o++, 't');
11686        }
11687        else if (ch == '\n') {
11688            PyUnicode_WRITE(okind, odata, o++, '\\');
11689            PyUnicode_WRITE(okind, odata, o++, 'n');
11690        }
11691        else if (ch == '\r') {
11692            PyUnicode_WRITE(okind, odata, o++, '\\');
11693            PyUnicode_WRITE(okind, odata, o++, 'r');
11694        }
11695
11696        /* Map non-printable US ASCII to '\xhh' */
11697        else if (ch < ' ' || ch == 0x7F) {
11698            PyUnicode_WRITE(okind, odata, o++, '\\');
11699            PyUnicode_WRITE(okind, odata, o++, 'x');
11700            PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11701            PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
11702        }
11703
11704        /* Copy ASCII characters as-is */
11705        else if (ch < 0x7F) {
11706            PyUnicode_WRITE(okind, odata, o++, ch);
11707        }
11708
11709        /* Non-ASCII characters */
11710        else {
11711            /* Map Unicode whitespace and control characters
11712               (categories Z* and C* except ASCII space)
11713            */
11714            if (!Py_UNICODE_ISPRINTABLE(ch)) {
11715                /* Map 8-bit characters to '\xhh' */
11716                if (ch <= 0xff) {
11717                    PyUnicode_WRITE(okind, odata, o++, '\\');
11718                    PyUnicode_WRITE(okind, odata, o++, 'x');
11719                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11720                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
11721                }
11722                /* Map 21-bit characters to '\U00xxxxxx' */
11723                else if (ch >= 0x10000) {
11724                    PyUnicode_WRITE(okind, odata, o++, '\\');
11725                    PyUnicode_WRITE(okind, odata, o++, 'U');
11726                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11727                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11728                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11729                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11730                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11731                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11732                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11733                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
11734                }
11735                /* Map 16-bit characters to '\uxxxx' */
11736                else {
11737                    PyUnicode_WRITE(okind, odata, o++, '\\');
11738                    PyUnicode_WRITE(okind, odata, o++, 'u');
11739                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11740                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11741                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11742                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
11743                }
11744            }
11745            /* Copy characters as-is */
11746            else {
11747                PyUnicode_WRITE(okind, odata, o++, ch);
11748            }
11749        }
11750    }
11751    /* Closing quote already added at the beginning */
11752    assert(_PyUnicode_CheckConsistency(repr, 1));
11753    return repr;
11754}
11755
11756PyDoc_STRVAR(rfind__doc__,
11757             "S.rfind(sub[, start[, end]]) -> int\n\
11758\n\
11759Return the highest index in S where substring sub is found,\n\
11760such that sub is contained within S[start:end].  Optional\n\
11761arguments start and end are interpreted as in slice notation.\n\
11762\n\
11763Return -1 on failure.");
11764
11765static PyObject *
11766unicode_rfind(PyObject *self, PyObject *args)
11767{
11768    PyUnicodeObject *substring;
11769    Py_ssize_t start;
11770    Py_ssize_t end;
11771    Py_ssize_t result;
11772
11773    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11774                                            &start, &end))
11775        return NULL;
11776
11777    if (PyUnicode_READY(self) == -1)
11778        return NULL;
11779    if (PyUnicode_READY(substring) == -1)
11780        return NULL;
11781
11782    result = any_find_slice(-1,
11783        self, (PyObject*)substring, start, end
11784        );
11785
11786    Py_DECREF(substring);
11787
11788    if (result == -2)
11789        return NULL;
11790
11791    return PyLong_FromSsize_t(result);
11792}
11793
11794PyDoc_STRVAR(rindex__doc__,
11795             "S.rindex(sub[, start[, end]]) -> int\n\
11796\n\
11797Like S.rfind() but raise ValueError when the substring is not found.");
11798
11799static PyObject *
11800unicode_rindex(PyObject *self, PyObject *args)
11801{
11802    PyUnicodeObject *substring;
11803    Py_ssize_t start;
11804    Py_ssize_t end;
11805    Py_ssize_t result;
11806
11807    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11808                                            &start, &end))
11809        return NULL;
11810
11811    if (PyUnicode_READY(self) == -1)
11812        return NULL;
11813    if (PyUnicode_READY(substring) == -1)
11814        return NULL;
11815
11816    result = any_find_slice(-1,
11817        self, (PyObject*)substring, start, end
11818        );
11819
11820    Py_DECREF(substring);
11821
11822    if (result == -2)
11823        return NULL;
11824
11825    if (result < 0) {
11826        PyErr_SetString(PyExc_ValueError, "substring not found");
11827        return NULL;
11828    }
11829
11830    return PyLong_FromSsize_t(result);
11831}
11832
11833PyDoc_STRVAR(rjust__doc__,
11834             "S.rjust(width[, fillchar]) -> str\n\
11835\n\
11836Return S right-justified in a string of length width. Padding is\n\
11837done using the specified fill character (default is a space).");
11838
11839static PyObject *
11840unicode_rjust(PyObject *self, PyObject *args)
11841{
11842    Py_ssize_t width;
11843    Py_UCS4 fillchar = ' ';
11844
11845    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
11846        return NULL;
11847
11848    if (PyUnicode_READY(self) == -1)
11849        return NULL;
11850
11851    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11852        Py_INCREF(self);
11853        return (PyObject*) self;
11854    }
11855
11856    return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
11857}
11858
11859PyObject *
11860PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11861{
11862    PyObject *result;
11863
11864    s = PyUnicode_FromObject(s);
11865    if (s == NULL)
11866        return NULL;
11867    if (sep != NULL) {
11868        sep = PyUnicode_FromObject(sep);
11869        if (sep == NULL) {
11870            Py_DECREF(s);
11871            return NULL;
11872        }
11873    }
11874
11875    result = split(s, sep, maxsplit);
11876
11877    Py_DECREF(s);
11878    Py_XDECREF(sep);
11879    return result;
11880}
11881
11882PyDoc_STRVAR(split__doc__,
11883             "S.split([sep[, maxsplit]]) -> list of strings\n\
11884\n\
11885Return a list of the words in S, using sep as the\n\
11886delimiter string.  If maxsplit is given, at most maxsplit\n\
11887splits are done. If sep is not specified or is None, any\n\
11888whitespace string is a separator and empty strings are\n\
11889removed from the result.");
11890
11891static PyObject*
11892unicode_split(PyObject *self, PyObject *args)
11893{
11894    PyObject *substring = Py_None;
11895    Py_ssize_t maxcount = -1;
11896
11897    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
11898        return NULL;
11899
11900    if (substring == Py_None)
11901        return split(self, NULL, maxcount);
11902    else if (PyUnicode_Check(substring))
11903        return split(self, substring, maxcount);
11904    else
11905        return PyUnicode_Split((PyObject *)self, substring, maxcount);
11906}
11907
11908PyObject *
11909PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11910{
11911    PyObject* str_obj;
11912    PyObject* sep_obj;
11913    PyObject* out;
11914    int kind1, kind2, kind;
11915    void *buf1 = NULL, *buf2 = NULL;
11916    Py_ssize_t len1, len2;
11917
11918    str_obj = PyUnicode_FromObject(str_in);
11919    if (!str_obj || PyUnicode_READY(str_obj) == -1)
11920        return NULL;
11921    sep_obj = PyUnicode_FromObject(sep_in);
11922    if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
11923        Py_DECREF(str_obj);
11924        return NULL;
11925    }
11926
11927    kind1 = PyUnicode_KIND(str_obj);
11928    kind2 = PyUnicode_KIND(sep_obj);
11929    kind = Py_MAX(kind1, kind2);
11930    buf1 = PyUnicode_DATA(str_obj);
11931    if (kind1 != kind)
11932        buf1 = _PyUnicode_AsKind(str_obj, kind);
11933    if (!buf1)
11934        goto onError;
11935    buf2 = PyUnicode_DATA(sep_obj);
11936    if (kind2 != kind)
11937        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11938    if (!buf2)
11939        goto onError;
11940    len1 = PyUnicode_GET_LENGTH(str_obj);
11941    len2 = PyUnicode_GET_LENGTH(sep_obj);
11942
11943    switch(PyUnicode_KIND(str_obj)) {
11944    case PyUnicode_1BYTE_KIND:
11945        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11946            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11947        else
11948            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11949        break;
11950    case PyUnicode_2BYTE_KIND:
11951        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11952        break;
11953    case PyUnicode_4BYTE_KIND:
11954        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11955        break;
11956    default:
11957        assert(0);
11958        out = 0;
11959    }
11960
11961    Py_DECREF(sep_obj);
11962    Py_DECREF(str_obj);
11963    if (kind1 != kind)
11964        PyMem_Free(buf1);
11965    if (kind2 != kind)
11966        PyMem_Free(buf2);
11967
11968    return out;
11969  onError:
11970    Py_DECREF(sep_obj);
11971    Py_DECREF(str_obj);
11972    if (kind1 != kind && buf1)
11973        PyMem_Free(buf1);
11974    if (kind2 != kind && buf2)
11975        PyMem_Free(buf2);
11976    return NULL;
11977}
11978
11979
11980PyObject *
11981PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11982{
11983    PyObject* str_obj;
11984    PyObject* sep_obj;
11985    PyObject* out;
11986    int kind1, kind2, kind;
11987    void *buf1 = NULL, *buf2 = NULL;
11988    Py_ssize_t len1, len2;
11989
11990    str_obj = PyUnicode_FromObject(str_in);
11991    if (!str_obj)
11992        return NULL;
11993    sep_obj = PyUnicode_FromObject(sep_in);
11994    if (!sep_obj) {
11995        Py_DECREF(str_obj);
11996        return NULL;
11997    }
11998
11999    kind1 = PyUnicode_KIND(str_in);
12000    kind2 = PyUnicode_KIND(sep_obj);
12001    kind = Py_MAX(kind1, kind2);
12002    buf1 = PyUnicode_DATA(str_in);
12003    if (kind1 != kind)
12004        buf1 = _PyUnicode_AsKind(str_in, kind);
12005    if (!buf1)
12006        goto onError;
12007    buf2 = PyUnicode_DATA(sep_obj);
12008    if (kind2 != kind)
12009        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12010    if (!buf2)
12011        goto onError;
12012    len1 = PyUnicode_GET_LENGTH(str_obj);
12013    len2 = PyUnicode_GET_LENGTH(sep_obj);
12014
12015    switch(PyUnicode_KIND(str_in)) {
12016    case PyUnicode_1BYTE_KIND:
12017        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12018            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12019        else
12020            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12021        break;
12022    case PyUnicode_2BYTE_KIND:
12023        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12024        break;
12025    case PyUnicode_4BYTE_KIND:
12026        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12027        break;
12028    default:
12029        assert(0);
12030        out = 0;
12031    }
12032
12033    Py_DECREF(sep_obj);
12034    Py_DECREF(str_obj);
12035    if (kind1 != kind)
12036        PyMem_Free(buf1);
12037    if (kind2 != kind)
12038        PyMem_Free(buf2);
12039
12040    return out;
12041  onError:
12042    Py_DECREF(sep_obj);
12043    Py_DECREF(str_obj);
12044    if (kind1 != kind && buf1)
12045        PyMem_Free(buf1);
12046    if (kind2 != kind && buf2)
12047        PyMem_Free(buf2);
12048    return NULL;
12049}
12050
12051PyDoc_STRVAR(partition__doc__,
12052             "S.partition(sep) -> (head, sep, tail)\n\
12053\n\
12054Search for the separator sep in S, and return the part before it,\n\
12055the separator itself, and the part after it.  If the separator is not\n\
12056found, return S and two empty strings.");
12057
12058static PyObject*
12059unicode_partition(PyObject *self, PyObject *separator)
12060{
12061    return PyUnicode_Partition(self, separator);
12062}
12063
12064PyDoc_STRVAR(rpartition__doc__,
12065             "S.rpartition(sep) -> (head, sep, tail)\n\
12066\n\
12067Search for the separator sep in S, starting at the end of S, and return\n\
12068the part before it, the separator itself, and the part after it.  If the\n\
12069separator is not found, return two empty strings and S.");
12070
12071static PyObject*
12072unicode_rpartition(PyObject *self, PyObject *separator)
12073{
12074    return PyUnicode_RPartition(self, separator);
12075}
12076
12077PyObject *
12078PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12079{
12080    PyObject *result;
12081
12082    s = PyUnicode_FromObject(s);
12083    if (s == NULL)
12084        return NULL;
12085    if (sep != NULL) {
12086        sep = PyUnicode_FromObject(sep);
12087        if (sep == NULL) {
12088            Py_DECREF(s);
12089            return NULL;
12090        }
12091    }
12092
12093    result = rsplit(s, sep, maxsplit);
12094
12095    Py_DECREF(s);
12096    Py_XDECREF(sep);
12097    return result;
12098}
12099
12100PyDoc_STRVAR(rsplit__doc__,
12101             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
12102\n\
12103Return a list of the words in S, using sep as the\n\
12104delimiter string, starting at the end of the string and\n\
12105working to the front.  If maxsplit is given, at most maxsplit\n\
12106splits are done. If sep is not specified, any whitespace string\n\
12107is a separator.");
12108
12109static PyObject*
12110unicode_rsplit(PyObject *self, PyObject *args)
12111{
12112    PyObject *substring = Py_None;
12113    Py_ssize_t maxcount = -1;
12114
12115    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
12116        return NULL;
12117
12118    if (substring == Py_None)
12119        return rsplit(self, NULL, maxcount);
12120    else if (PyUnicode_Check(substring))
12121        return rsplit(self, substring, maxcount);
12122    else
12123        return PyUnicode_RSplit(self, substring, maxcount);
12124}
12125
12126PyDoc_STRVAR(splitlines__doc__,
12127             "S.splitlines([keepends]) -> list of strings\n\
12128\n\
12129Return a list of the lines in S, breaking at line boundaries.\n\
12130Line breaks are not included in the resulting list unless keepends\n\
12131is given and true.");
12132
12133static PyObject*
12134unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
12135{
12136    static char *kwlist[] = {"keepends", 0};
12137    int keepends = 0;
12138
12139    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12140                                     kwlist, &keepends))
12141        return NULL;
12142
12143    return PyUnicode_Splitlines((PyObject *)self, keepends);
12144}
12145
12146static
12147PyObject *unicode_str(PyObject *self)
12148{
12149    if (PyUnicode_CheckExact(self)) {
12150        Py_INCREF(self);
12151        return self;
12152    } else
12153        /* Subtype -- return genuine unicode string with the same value. */
12154        return PyUnicode_Copy(self);
12155}
12156
12157PyDoc_STRVAR(swapcase__doc__,
12158             "S.swapcase() -> str\n\
12159\n\
12160Return a copy of S with uppercase characters converted to lowercase\n\
12161and vice versa.");
12162
12163static PyObject*
12164unicode_swapcase(PyObject *self)
12165{
12166    return fixup(self, fixswapcase);
12167}
12168
12169PyDoc_STRVAR(maketrans__doc__,
12170             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12171\n\
12172Return a translation table usable for str.translate().\n\
12173If there is only one argument, it must be a dictionary mapping Unicode\n\
12174ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12175Character keys will be then converted to ordinals.\n\
12176If there are two arguments, they must be strings of equal length, and\n\
12177in the resulting dictionary, each character in x will be mapped to the\n\
12178character at the same position in y. If there is a third argument, it\n\
12179must be a string, whose characters will be mapped to None in the result.");
12180
12181static PyObject*
12182unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12183{
12184    PyObject *x, *y = NULL, *z = NULL;
12185    PyObject *new = NULL, *key, *value;
12186    Py_ssize_t i = 0;
12187    int res;
12188
12189    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12190        return NULL;
12191    new = PyDict_New();
12192    if (!new)
12193        return NULL;
12194    if (y != NULL) {
12195        int x_kind, y_kind, z_kind;
12196        void *x_data, *y_data, *z_data;
12197
12198        /* x must be a string too, of equal length */
12199        if (!PyUnicode_Check(x)) {
12200            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12201                            "be a string if there is a second argument");
12202            goto err;
12203        }
12204        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12205            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12206                            "arguments must have equal length");
12207            goto err;
12208        }
12209        /* create entries for translating chars in x to those in y */
12210        x_kind = PyUnicode_KIND(x);
12211        y_kind = PyUnicode_KIND(y);
12212        x_data = PyUnicode_DATA(x);
12213        y_data = PyUnicode_DATA(y);
12214        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12215            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12216            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12217            if (!key || !value)
12218                goto err;
12219            res = PyDict_SetItem(new, key, value);
12220            Py_DECREF(key);
12221            Py_DECREF(value);
12222            if (res < 0)
12223                goto err;
12224        }
12225        /* create entries for deleting chars in z */
12226        if (z != NULL) {
12227            z_kind = PyUnicode_KIND(z);
12228            z_data = PyUnicode_DATA(z);
12229            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
12230                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12231                if (!key)
12232                    goto err;
12233                res = PyDict_SetItem(new, key, Py_None);
12234                Py_DECREF(key);
12235                if (res < 0)
12236                    goto err;
12237            }
12238        }
12239    } else {
12240        int kind;
12241        void *data;
12242
12243        /* x must be a dict */
12244        if (!PyDict_CheckExact(x)) {
12245            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12246                            "to maketrans it must be a dict");
12247            goto err;
12248        }
12249        /* copy entries into the new dict, converting string keys to int keys */
12250        while (PyDict_Next(x, &i, &key, &value)) {
12251            if (PyUnicode_Check(key)) {
12252                /* convert string keys to integer keys */
12253                PyObject *newkey;
12254                if (PyUnicode_GET_SIZE(key) != 1) {
12255                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12256                                    "table must be of length 1");
12257                    goto err;
12258                }
12259                kind = PyUnicode_KIND(key);
12260                data = PyUnicode_DATA(key);
12261                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12262                if (!newkey)
12263                    goto err;
12264                res = PyDict_SetItem(new, newkey, value);
12265                Py_DECREF(newkey);
12266                if (res < 0)
12267                    goto err;
12268            } else if (PyLong_Check(key)) {
12269                /* just keep integer keys */
12270                if (PyDict_SetItem(new, key, value) < 0)
12271                    goto err;
12272            } else {
12273                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12274                                "be strings or integers");
12275                goto err;
12276            }
12277        }
12278    }
12279    return new;
12280  err:
12281    Py_DECREF(new);
12282    return NULL;
12283}
12284
12285PyDoc_STRVAR(translate__doc__,
12286             "S.translate(table) -> str\n\
12287\n\
12288Return a copy of the string S, where all characters have been mapped\n\
12289through the given translation table, which must be a mapping of\n\
12290Unicode ordinals to Unicode ordinals, strings, or None.\n\
12291Unmapped characters are left untouched. Characters mapped to None\n\
12292are deleted.");
12293
12294static PyObject*
12295unicode_translate(PyObject *self, PyObject *table)
12296{
12297    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12298}
12299
12300PyDoc_STRVAR(upper__doc__,
12301             "S.upper() -> str\n\
12302\n\
12303Return a copy of S converted to uppercase.");
12304
12305static PyObject*
12306unicode_upper(PyObject *self)
12307{
12308    return fixup(self, fixupper);
12309}
12310
12311PyDoc_STRVAR(zfill__doc__,
12312             "S.zfill(width) -> str\n\
12313\n\
12314Pad a numeric string S with zeros on the left, to fill a field\n\
12315of the specified width. The string S is never truncated.");
12316
12317static PyObject *
12318unicode_zfill(PyObject *self, PyObject *args)
12319{
12320    Py_ssize_t fill;
12321    PyObject *u;
12322    Py_ssize_t width;
12323    int kind;
12324    void *data;
12325    Py_UCS4 chr;
12326
12327    if (PyUnicode_READY(self) == -1)
12328        return NULL;
12329
12330    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12331        return NULL;
12332
12333    if (PyUnicode_GET_LENGTH(self) >= width) {
12334        if (PyUnicode_CheckExact(self)) {
12335            Py_INCREF(self);
12336            return (PyObject*) self;
12337        }
12338        else
12339            return PyUnicode_Copy((PyObject*)self);
12340    }
12341
12342    fill = width - _PyUnicode_LENGTH(self);
12343
12344    u = pad(self, fill, 0, '0');
12345
12346    if (u == NULL)
12347        return NULL;
12348
12349    kind = PyUnicode_KIND(u);
12350    data = PyUnicode_DATA(u);
12351    chr = PyUnicode_READ(kind, data, fill);
12352
12353    if (chr == '+' || chr == '-') {
12354        /* move sign to beginning of string */
12355        PyUnicode_WRITE(kind, data, 0, chr);
12356        PyUnicode_WRITE(kind, data, fill, '0');
12357    }
12358
12359    assert(_PyUnicode_CheckConsistency(u, 1));
12360    return (PyObject*) u;
12361}
12362
#if 0
/* Compiled out.  Wraps PyUnicode_TransformDecimalAndSpaceToASCII() so it
   can be exposed as a method. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12370
12371PyDoc_STRVAR(startswith__doc__,
12372             "S.startswith(prefix[, start[, end]]) -> bool\n\
12373\n\
12374Return True if S starts with the specified prefix, False otherwise.\n\
12375With optional start, test S beginning at that position.\n\
12376With optional end, stop comparing S at that position.\n\
12377prefix can also be a tuple of strings to try.");
12378
12379static PyObject *
12380unicode_startswith(PyUnicodeObject *self,
12381                   PyObject *args)
12382{
12383    PyObject *subobj;
12384    PyUnicodeObject *substring;
12385    Py_ssize_t start = 0;
12386    Py_ssize_t end = PY_SSIZE_T_MAX;
12387    int result;
12388
12389    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12390        return NULL;
12391    if (PyTuple_Check(subobj)) {
12392        Py_ssize_t i;
12393        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12394            substring = (PyUnicodeObject *)PyUnicode_FromObject(
12395                PyTuple_GET_ITEM(subobj, i));
12396            if (substring == NULL)
12397                return NULL;
12398            result = tailmatch(self, substring, start, end, -1);
12399            Py_DECREF(substring);
12400            if (result) {
12401                Py_RETURN_TRUE;
12402            }
12403        }
12404        /* nothing matched */
12405        Py_RETURN_FALSE;
12406    }
12407    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
12408    if (substring == NULL) {
12409        if (PyErr_ExceptionMatches(PyExc_TypeError))
12410            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12411                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12412        return NULL;
12413    }
12414    result = tailmatch(self, substring, start, end, -1);
12415    Py_DECREF(substring);
12416    return PyBool_FromLong(result);
12417}
12418
12419
12420PyDoc_STRVAR(endswith__doc__,
12421             "S.endswith(suffix[, start[, end]]) -> bool\n\
12422\n\
12423Return True if S ends with the specified suffix, False otherwise.\n\
12424With optional start, test S beginning at that position.\n\
12425With optional end, stop comparing S at that position.\n\
12426suffix can also be a tuple of strings to try.");
12427
12428static PyObject *
12429unicode_endswith(PyUnicodeObject *self,
12430                 PyObject *args)
12431{
12432    PyObject *subobj;
12433    PyUnicodeObject *substring;
12434    Py_ssize_t start = 0;
12435    Py_ssize_t end = PY_SSIZE_T_MAX;
12436    int result;
12437
12438    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12439        return NULL;
12440    if (PyTuple_Check(subobj)) {
12441        Py_ssize_t i;
12442        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12443            substring = (PyUnicodeObject *)PyUnicode_FromObject(
12444                PyTuple_GET_ITEM(subobj, i));
12445            if (substring == NULL)
12446                return NULL;
12447            result = tailmatch(self, substring, start, end, +1);
12448            Py_DECREF(substring);
12449            if (result) {
12450                Py_RETURN_TRUE;
12451            }
12452        }
12453        Py_RETURN_FALSE;
12454    }
12455    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
12456    if (substring == NULL) {
12457        if (PyErr_ExceptionMatches(PyExc_TypeError))
12458            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12459                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12460        return NULL;
12461    }
12462    result = tailmatch(self, substring, start, end, +1);
12463    Py_DECREF(substring);
12464    return PyBool_FromLong(result);
12465}
12466
12467#include "stringlib/unicode_format.h"
12468
12469PyDoc_STRVAR(format__doc__,
12470             "S.format(*args, **kwargs) -> str\n\
12471\n\
12472Return a formatted version of S, using substitutions from args and kwargs.\n\
12473The substitutions are identified by braces ('{' and '}').");
12474
12475PyDoc_STRVAR(format_map__doc__,
12476             "S.format_map(mapping) -> str\n\
12477\n\
12478Return a formatted version of S, using substitutions from mapping.\n\
12479The substitutions are identified by braces ('{' and '}').");
12480
12481static PyObject *
12482unicode__format__(PyObject* self, PyObject* args)
12483{
12484    PyObject *format_spec, *out;
12485
12486    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12487        return NULL;
12488
12489    out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
12490                                     PyUnicode_GET_LENGTH(format_spec));
12491    return out;
12492}
12493
12494PyDoc_STRVAR(p_format__doc__,
12495             "S.__format__(format_spec) -> str\n\
12496\n\
12497Return a formatted version of S as described by format_spec.");
12498
12499static PyObject *
12500unicode__sizeof__(PyUnicodeObject *v)
12501{
12502    Py_ssize_t size;
12503
12504    /* If it's a compact object, account for base structure +
12505       character data. */
12506    if (PyUnicode_IS_COMPACT_ASCII(v))
12507        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12508    else if (PyUnicode_IS_COMPACT(v))
12509        size = sizeof(PyCompactUnicodeObject) +
12510            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
12511    else {
12512        /* If it is a two-block object, account for base object, and
12513           for character block if present. */
12514        size = sizeof(PyUnicodeObject);
12515        if (_PyUnicode_DATA_ANY(v))
12516            size += (PyUnicode_GET_LENGTH(v) + 1) *
12517                PyUnicode_KIND(v);
12518    }
12519    /* If the wstr pointer is present, account for it unless it is shared
12520       with the data pointer. Check if the data is not shared. */
12521    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12522        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12523    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12524        size += PyUnicode_UTF8_LENGTH(v) + 1;
12525
12526    return PyLong_FromSsize_t(size);
12527}
12528
12529PyDoc_STRVAR(sizeof__doc__,
12530             "S.__sizeof__() -> size of S in memory, in bytes");
12531
12532static PyObject *
12533unicode_getnewargs(PyObject *v)
12534{
12535    PyObject *copy = PyUnicode_Copy(v);
12536    if (!copy)
12537        return NULL;
12538    return Py_BuildValue("(N)", copy);
12539}
12540
12541static PyMethodDef unicode_methods[] = {
12542
12543    /* Order is according to common usage: often used methods should
12544       appear first, since lookup is done sequentially. */
12545
12546    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
12547    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12548    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
12549    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
12550    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12551    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12552    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12553    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12554    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12555    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12556    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
12557    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
12558    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12559    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12560    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
12561    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
12562    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12563    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12564    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
12565    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
12566    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
12567    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
12568    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
12569    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12570    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12571    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12572    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12573    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12574    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12575    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12576    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12577    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12578    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12579    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12580    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12581    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12582    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
12583    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
12584    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
12585    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
12586    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
12587    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
12588    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
12589    {"maketrans", (PyCFunction) unicode_maketrans,
12590     METH_VARARGS | METH_STATIC, maketrans__doc__},
12591    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
12592#if 0
12593    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
12594#endif
12595
12596#if 0
12597    /* These methods are just used for debugging the implementation. */
12598    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
12599#endif
12600
12601    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
12602    {NULL, NULL}
12603};
12604
12605static PyObject *
12606unicode_mod(PyObject *v, PyObject *w)
12607{
12608    if (!PyUnicode_Check(v))
12609        Py_RETURN_NOTIMPLEMENTED;
12610    return PyUnicode_Format(v, w);
12611}
12612
12613static PyNumberMethods unicode_as_number = {
12614    0,              /*nb_add*/
12615    0,              /*nb_subtract*/
12616    0,              /*nb_multiply*/
12617    unicode_mod,            /*nb_remainder*/
12618};
12619
12620static PySequenceMethods unicode_as_sequence = {
12621    (lenfunc) unicode_length,       /* sq_length */
12622    PyUnicode_Concat,           /* sq_concat */
12623    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
12624    (ssizeargfunc) unicode_getitem,     /* sq_item */
12625    0,                  /* sq_slice */
12626    0,                  /* sq_ass_item */
12627    0,                  /* sq_ass_slice */
12628    PyUnicode_Contains,         /* sq_contains */
12629};
12630
12631static PyObject*
12632unicode_subscript(PyUnicodeObject* self, PyObject* item)
12633{
12634    if (PyUnicode_READY(self) == -1)
12635        return NULL;
12636
12637    if (PyIndex_Check(item)) {
12638        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
12639        if (i == -1 && PyErr_Occurred())
12640            return NULL;
12641        if (i < 0)
12642            i += PyUnicode_GET_LENGTH(self);
12643        return unicode_getitem((PyObject*)self, i);
12644    } else if (PySlice_Check(item)) {
12645        Py_ssize_t start, stop, step, slicelength, cur, i;
12646        PyObject *result;
12647        void *src_data, *dest_data;
12648        int src_kind, dest_kind;
12649        Py_UCS4 ch, max_char, kind_limit;
12650
12651        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
12652                                 &start, &stop, &step, &slicelength) < 0) {
12653            return NULL;
12654        }
12655
12656        if (slicelength <= 0) {
12657            return PyUnicode_New(0, 0);
12658        } else if (start == 0 && step == 1 &&
12659                   slicelength == PyUnicode_GET_LENGTH(self) &&
12660                   PyUnicode_CheckExact(self)) {
12661            Py_INCREF(self);
12662            return (PyObject *)self;
12663        } else if (step == 1) {
12664            return PyUnicode_Substring((PyObject*)self,
12665                                       start, start + slicelength);
12666        }
12667        /* General case */
12668        max_char = 0;
12669        src_kind = PyUnicode_KIND(self);
12670        kind_limit = kind_maxchar_limit(src_kind);
12671        src_data = PyUnicode_DATA(self);
12672        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12673            ch = PyUnicode_READ(src_kind, src_data, cur);
12674            if (ch > max_char) {
12675                max_char = ch;
12676                if (max_char >= kind_limit)
12677                    break;
12678            }
12679        }
12680        result = PyUnicode_New(slicelength, max_char);
12681        if (result == NULL)
12682            return NULL;
12683        dest_kind = PyUnicode_KIND(result);
12684        dest_data = PyUnicode_DATA(result);
12685
12686        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12687            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12688            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
12689        }
12690        assert(_PyUnicode_CheckConsistency(result, 1));
12691        return result;
12692    } else {
12693        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12694        return NULL;
12695    }
12696}
12697
12698static PyMappingMethods unicode_as_mapping = {
12699    (lenfunc)unicode_length,        /* mp_length */
12700    (binaryfunc)unicode_subscript,  /* mp_subscript */
12701    (objobjargproc)0,           /* mp_ass_subscript */
12702};
12703
12704
12705/* Helpers for PyUnicode_Format() */
12706
12707static PyObject *
12708getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
12709{
12710    Py_ssize_t argidx = *p_argidx;
12711    if (argidx < arglen) {
12712        (*p_argidx)++;
12713        if (arglen < 0)
12714            return args;
12715        else
12716            return PyTuple_GetItem(args, argidx);
12717    }
12718    PyErr_SetString(PyExc_TypeError,
12719                    "not enough arguments for format string");
12720    return NULL;
12721}
12722
12723/* Returns a new reference to a PyUnicode object, or NULL on failure. */
12724
12725static PyObject *
12726formatfloat(PyObject *v, int flags, int prec, int type)
12727{
12728    char *p;
12729    PyObject *result;
12730    double x;
12731
12732    x = PyFloat_AsDouble(v);
12733    if (x == -1.0 && PyErr_Occurred())
12734        return NULL;
12735
12736    if (prec < 0)
12737        prec = 6;
12738
12739    p = PyOS_double_to_string(x, type, prec,
12740                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
12741    if (p == NULL)
12742        return NULL;
12743    result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
12744    PyMem_Free(p);
12745    return result;
12746}
12747
12748static PyObject*
12749formatlong(PyObject *val, int flags, int prec, int type)
12750{
12751    char *buf;
12752    int len;
12753    PyObject *str; /* temporary string object. */
12754    PyObject *result;
12755
12756    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12757    if (!str)
12758        return NULL;
12759    result = PyUnicode_DecodeASCII(buf, len, NULL);
12760    Py_DECREF(str);
12761    return result;
12762}
12763
12764static Py_UCS4
12765formatchar(PyObject *v)
12766{
12767    /* presume that the buffer is at least 3 characters long */
12768    if (PyUnicode_Check(v)) {
12769        if (PyUnicode_GET_LENGTH(v) == 1) {
12770            return PyUnicode_READ_CHAR(v, 0);
12771        }
12772        goto onError;
12773    }
12774    else {
12775        /* Integer input truncated to a character */
12776        long x;
12777        x = PyLong_AsLong(v);
12778        if (x == -1 && PyErr_Occurred())
12779            goto onError;
12780
12781        if (x < 0 || x > 0x10ffff) {
12782            PyErr_SetString(PyExc_OverflowError,
12783                            "%c arg not in range(0x110000)");
12784            return (Py_UCS4) -1;
12785        }
12786
12787        return (Py_UCS4) x;
12788    }
12789
12790  onError:
12791    PyErr_SetString(PyExc_TypeError,
12792                    "%c requires int or char");
12793    return (Py_UCS4) -1;
12794}
12795
12796static int
12797repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
12798{
12799    int r;
12800    assert(count > 0);
12801    assert(PyUnicode_Check(obj));
12802    if (count > 5) {
12803        PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
12804        if (repeated == NULL)
12805            return -1;
12806        r = _PyAccu_Accumulate(acc, repeated);
12807        Py_DECREF(repeated);
12808        return r;
12809    }
12810    else {
12811        do {
12812            if (_PyAccu_Accumulate(acc, obj))
12813                return -1;
12814        } while (--count);
12815        return 0;
12816    }
12817}
12818
12819PyObject *
12820PyUnicode_Format(PyObject *format, PyObject *args)
12821{
12822    void *fmt;
12823    int fmtkind;
12824    PyObject *result;
12825    int kind;
12826    int r;
12827    Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
12828    int args_owned = 0;
12829    PyObject *dict = NULL;
12830    PyObject *temp = NULL;
12831    PyObject *second = NULL;
12832    PyUnicodeObject *uformat;
12833    _PyAccu acc;
12834    static PyObject *plus, *minus, *blank, *zero, *percent;
12835
12836    if (!plus && !(plus = get_latin1_char('+')))
12837        return NULL;
12838    if (!minus && !(minus = get_latin1_char('-')))
12839        return NULL;
12840    if (!blank && !(blank = get_latin1_char(' ')))
12841        return NULL;
12842    if (!zero && !(zero = get_latin1_char('0')))
12843        return NULL;
12844    if (!percent && !(percent = get_latin1_char('%')))
12845        return NULL;
12846
12847    if (format == NULL || args == NULL) {
12848        PyErr_BadInternalCall();
12849        return NULL;
12850    }
12851    uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12852    if (uformat == NULL || PyUnicode_READY(uformat) == -1)
12853        return NULL;
12854    if (_PyAccu_Init(&acc))
12855        goto onError;
12856    fmt = PyUnicode_DATA(uformat);
12857    fmtkind = PyUnicode_KIND(uformat);
12858    fmtcnt = PyUnicode_GET_LENGTH(uformat);
12859    fmtpos = 0;
12860
12861    if (PyTuple_Check(args)) {
12862        arglen = PyTuple_Size(args);
12863        argidx = 0;
12864    }
12865    else {
12866        arglen = -1;
12867        argidx = -2;
12868    }
12869    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
12870        !PyUnicode_Check(args))
12871        dict = args;
12872
12873    while (--fmtcnt >= 0) {
12874        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12875            PyObject *nonfmt;
12876            Py_ssize_t nonfmtpos;
12877            nonfmtpos = fmtpos++;
12878            while (fmtcnt >= 0 &&
12879                   PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12880                fmtpos++;
12881                fmtcnt--;
12882            }
12883            nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
12884            if (nonfmt == NULL)
12885                goto onError;
12886            r = _PyAccu_Accumulate(&acc, nonfmt);
12887            Py_DECREF(nonfmt);
12888            if (r)
12889                goto onError;
12890        }
12891        else {
12892            /* Got a format specifier */
12893            int flags = 0;
12894            Py_ssize_t width = -1;
12895            int prec = -1;
12896            Py_UCS4 c = '\0';
12897            Py_UCS4 fill, sign;
12898            int isnumok;
12899            PyObject *v = NULL;
12900            void *pbuf = NULL;
12901            Py_ssize_t pindex, len;
12902            PyObject *signobj = NULL, *fillobj = NULL;
12903
12904            fmtpos++;
12905            if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12906                Py_ssize_t keystart;
12907                Py_ssize_t keylen;
12908                PyObject *key;
12909                int pcount = 1;
12910
12911                if (dict == NULL) {
12912                    PyErr_SetString(PyExc_TypeError,
12913                                    "format requires a mapping");
12914                    goto onError;
12915                }
12916                ++fmtpos;
12917                --fmtcnt;
12918                keystart = fmtpos;
12919                /* Skip over balanced parentheses */
12920                while (pcount > 0 && --fmtcnt >= 0) {
12921                    if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
12922                        --pcount;
12923                    else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
12924                        ++pcount;
12925                    fmtpos++;
12926                }
12927                keylen = fmtpos - keystart - 1;
12928                if (fmtcnt < 0 || pcount > 0) {
12929                    PyErr_SetString(PyExc_ValueError,
12930                                    "incomplete format key");
12931                    goto onError;
12932                }
12933                key = PyUnicode_Substring((PyObject*)uformat,
12934                                          keystart, keystart + keylen);
12935                if (key == NULL)
12936                    goto onError;
12937                if (args_owned) {
12938                    Py_DECREF(args);
12939                    args_owned = 0;
12940                }
12941                args = PyObject_GetItem(dict, key);
12942                Py_DECREF(key);
12943                if (args == NULL) {
12944                    goto onError;
12945                }
12946                args_owned = 1;
12947                arglen = -1;
12948                argidx = -2;
12949            }
12950            while (--fmtcnt >= 0) {
12951                switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
12952                case '-': flags |= F_LJUST; continue;
12953                case '+': flags |= F_SIGN; continue;
12954                case ' ': flags |= F_BLANK; continue;
12955                case '#': flags |= F_ALT; continue;
12956                case '0': flags |= F_ZERO; continue;
12957                }
12958                break;
12959            }
12960            if (c == '*') {
12961                v = getnextarg(args, arglen, &argidx);
12962                if (v == NULL)
12963                    goto onError;
12964                if (!PyLong_Check(v)) {
12965                    PyErr_SetString(PyExc_TypeError,
12966                                    "* wants int");
12967                    goto onError;
12968                }
12969                width = PyLong_AsLong(v);
12970                if (width == -1 && PyErr_Occurred())
12971                    goto onError;
12972                if (width < 0) {
12973                    flags |= F_LJUST;
12974                    width = -width;
12975                }
12976                if (--fmtcnt >= 0)
12977                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12978            }
12979            else if (c >= '0' && c <= '9') {
12980                width = c - '0';
12981                while (--fmtcnt >= 0) {
12982                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12983                    if (c < '0' || c > '9')
12984                        break;
12985                    if ((width*10) / 10 != width) {
12986                        PyErr_SetString(PyExc_ValueError,
12987                                        "width too big");
12988                        goto onError;
12989                    }
12990                    width = width*10 + (c - '0');
12991                }
12992            }
12993            if (c == '.') {
12994                prec = 0;
12995                if (--fmtcnt >= 0)
12996                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12997                if (c == '*') {
12998                    v = getnextarg(args, arglen, &argidx);
12999                    if (v == NULL)
13000                        goto onError;
13001                    if (!PyLong_Check(v)) {
13002                        PyErr_SetString(PyExc_TypeError,
13003                                        "* wants int");
13004                        goto onError;
13005                    }
13006                    prec = PyLong_AsLong(v);
13007                    if (prec == -1 && PyErr_Occurred())
13008                        goto onError;
13009                    if (prec < 0)
13010                        prec = 0;
13011                    if (--fmtcnt >= 0)
13012                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13013                }
13014                else if (c >= '0' && c <= '9') {
13015                    prec = c - '0';
13016                    while (--fmtcnt >= 0) {
13017                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13018                        if (c < '0' || c > '9')
13019                            break;
13020                        if ((prec*10) / 10 != prec) {
13021                            PyErr_SetString(PyExc_ValueError,
13022                                            "prec too big");
13023                            goto onError;
13024                        }
13025                        prec = prec*10 + (c - '0');
13026                    }
13027                }
13028            } /* prec */
13029            if (fmtcnt >= 0) {
13030                if (c == 'h' || c == 'l' || c == 'L') {
13031                    if (--fmtcnt >= 0)
13032                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13033                }
13034            }
13035            if (fmtcnt < 0) {
13036                PyErr_SetString(PyExc_ValueError,
13037                                "incomplete format");
13038                goto onError;
13039            }
13040            if (c != '%') {
13041                v = getnextarg(args, arglen, &argidx);
13042                if (v == NULL)
13043                    goto onError;
13044            }
13045            sign = 0;
13046            fill = ' ';
13047            fillobj = blank;
13048            switch (c) {
13049
13050            case '%':
13051                _PyAccu_Accumulate(&acc, percent);
13052                continue;
13053
13054            case 's':
13055            case 'r':
13056            case 'a':
13057                if (PyUnicode_CheckExact(v) && c == 's') {
13058                    temp = v;
13059                    Py_INCREF(temp);
13060                }
13061                else {
13062                    if (c == 's')
13063                        temp = PyObject_Str(v);
13064                    else if (c == 'r')
13065                        temp = PyObject_Repr(v);
13066                    else
13067                        temp = PyObject_ASCII(v);
13068                    if (temp == NULL)
13069                        goto onError;
13070                    if (PyUnicode_Check(temp))
13071                        /* nothing to do */;
13072                    else {
13073                        Py_DECREF(temp);
13074                        PyErr_SetString(PyExc_TypeError,
13075                                        "%s argument has non-string str()");
13076                        goto onError;
13077                    }
13078                }
13079                if (PyUnicode_READY(temp) == -1) {
13080                    Py_CLEAR(temp);
13081                    goto onError;
13082                }
13083                pbuf = PyUnicode_DATA(temp);
13084                kind = PyUnicode_KIND(temp);
13085                len = PyUnicode_GET_LENGTH(temp);
13086                if (prec >= 0 && len > prec)
13087                    len = prec;
13088                break;
13089
13090            case 'i':
13091            case 'd':
13092            case 'u':
13093            case 'o':
13094            case 'x':
13095            case 'X':
13096                isnumok = 0;
13097                if (PyNumber_Check(v)) {
13098                    PyObject *iobj=NULL;
13099
13100                    if (PyLong_Check(v)) {
13101                        iobj = v;
13102                        Py_INCREF(iobj);
13103                    }
13104                    else {
13105                        iobj = PyNumber_Long(v);
13106                    }
13107                    if (iobj!=NULL) {
13108                        if (PyLong_Check(iobj)) {
13109                            isnumok = 1;
13110                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
13111                            Py_DECREF(iobj);
13112                            if (!temp)
13113                                goto onError;
13114                            if (PyUnicode_READY(temp) == -1) {
13115                                Py_CLEAR(temp);
13116                                goto onError;
13117                            }
13118                            pbuf = PyUnicode_DATA(temp);
13119                            kind = PyUnicode_KIND(temp);
13120                            len = PyUnicode_GET_LENGTH(temp);
13121                            sign = 1;
13122                        }
13123                        else {
13124                            Py_DECREF(iobj);
13125                        }
13126                    }
13127                }
13128                if (!isnumok) {
13129                    PyErr_Format(PyExc_TypeError,
13130                                 "%%%c format: a number is required, "
13131                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13132                    goto onError;
13133                }
13134                if (flags & F_ZERO) {
13135                    fill = '0';
13136                    fillobj = zero;
13137                }
13138                break;
13139
13140            case 'e':
13141            case 'E':
13142            case 'f':
13143            case 'F':
13144            case 'g':
13145            case 'G':
13146                temp = formatfloat(v, flags, prec, c);
13147                if (!temp)
13148                    goto onError;
13149                if (PyUnicode_READY(temp) == -1) {
13150                    Py_CLEAR(temp);
13151                    goto onError;
13152                }
13153                pbuf = PyUnicode_DATA(temp);
13154                kind = PyUnicode_KIND(temp);
13155                len = PyUnicode_GET_LENGTH(temp);
13156                sign = 1;
13157                if (flags & F_ZERO) {
13158                    fill = '0';
13159                    fillobj = zero;
13160                }
13161                break;
13162
13163            case 'c':
13164            {
13165                Py_UCS4 ch = formatchar(v);
13166                if (ch == (Py_UCS4) -1)
13167                    goto onError;
13168                temp = _PyUnicode_FromUCS4(&ch, 1);
13169                if (temp == NULL)
13170                    goto onError;
13171                pbuf = PyUnicode_DATA(temp);
13172                kind = PyUnicode_KIND(temp);
13173                len = PyUnicode_GET_LENGTH(temp);
13174                break;
13175            }
13176
13177            default:
13178                PyErr_Format(PyExc_ValueError,
13179                             "unsupported format character '%c' (0x%x) "
13180                             "at index %zd",
13181                             (31<=c && c<=126) ? (char)c : '?',
13182                             (int)c,
13183                             fmtpos - 1);
13184                goto onError;
13185            }
13186            /* pbuf is initialized here. */
13187            pindex = 0;
13188            if (sign) {
13189                if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13190                    signobj = minus;
13191                    len--;
13192                    pindex++;
13193                }
13194                else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13195                    signobj = plus;
13196                    len--;
13197                    pindex++;
13198                }
13199                else if (flags & F_SIGN)
13200                    signobj = plus;
13201                else if (flags & F_BLANK)
13202                    signobj = blank;
13203                else
13204                    sign = 0;
13205            }
13206            if (width < len)
13207                width = len;
13208            if (sign) {
13209                if (fill != ' ') {
13210                    assert(signobj != NULL);
13211                    if (_PyAccu_Accumulate(&acc, signobj))
13212                        goto onError;
13213                }
13214                if (width > len)
13215                    width--;
13216            }
13217            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13218                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13219                assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
13220                if (fill != ' ') {
13221                    second = get_latin1_char(
13222                        PyUnicode_READ(kind, pbuf, pindex + 1));
13223                    pindex += 2;
13224                    if (second == NULL ||
13225                        _PyAccu_Accumulate(&acc, zero) ||
13226                        _PyAccu_Accumulate(&acc, second))
13227                        goto onError;
13228                    Py_CLEAR(second);
13229                }
13230                width -= 2;
13231                if (width < 0)
13232                    width = 0;
13233                len -= 2;
13234            }
13235            if (width > len && !(flags & F_LJUST)) {
13236                assert(fillobj != NULL);
13237                if (repeat_accumulate(&acc, fillobj, width - len))
13238                    goto onError;
13239                width = len;
13240            }
13241            if (fill == ' ') {
13242                if (sign) {
13243                    assert(signobj != NULL);
13244                    if (_PyAccu_Accumulate(&acc, signobj))
13245                        goto onError;
13246                }
13247                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13248                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13249                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13250                    second = get_latin1_char(
13251                        PyUnicode_READ(kind, pbuf, pindex + 1));
13252                    pindex += 2;
13253                    if (second == NULL ||
13254                        _PyAccu_Accumulate(&acc, zero) ||
13255                        _PyAccu_Accumulate(&acc, second))
13256                        goto onError;
13257                    Py_CLEAR(second);
13258                }
13259            }
13260            /* Copy all characters, preserving len */
13261            if (temp != NULL) {
13262                assert(pbuf == PyUnicode_DATA(temp));
13263                v = PyUnicode_Substring(temp, pindex, pindex + len);
13264            }
13265            else {
13266                const char *p = (const char *) pbuf;
13267                assert(pbuf != NULL);
13268                p += kind * pindex;
13269                v = PyUnicode_FromKindAndData(kind, p, len);
13270            }
13271            if (v == NULL)
13272                goto onError;
13273            r = _PyAccu_Accumulate(&acc, v);
13274            Py_DECREF(v);
13275            if (r)
13276                goto onError;
13277            if (width > len && repeat_accumulate(&acc, blank, width - len))
13278                goto onError;
13279            if (dict && (argidx < arglen) && c != '%') {
13280                PyErr_SetString(PyExc_TypeError,
13281                                "not all arguments converted during string formatting");
13282                goto onError;
13283            }
13284            Py_CLEAR(temp);
13285        } /* '%' */
13286    } /* until end */
13287    if (argidx < arglen && !dict) {
13288        PyErr_SetString(PyExc_TypeError,
13289                        "not all arguments converted during string formatting");
13290        goto onError;
13291    }
13292
13293    result = _PyAccu_Finish(&acc);
13294    if (args_owned) {
13295        Py_DECREF(args);
13296    }
13297    Py_DECREF(uformat);
13298    Py_XDECREF(temp);
13299    Py_XDECREF(second);
13300    return (PyObject *)result;
13301
13302  onError:
13303    Py_DECREF(uformat);
13304    Py_XDECREF(temp);
13305    Py_XDECREF(second);
13306    _PyAccu_Destroy(&acc);
13307    if (args_owned) {
13308        Py_DECREF(args);
13309    }
13310    return NULL;
13311}
13312
13313static PyObject *
13314unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13315
13316static PyObject *
13317unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13318{
13319    PyObject *x = NULL;
13320    static char *kwlist[] = {"object", "encoding", "errors", 0};
13321    char *encoding = NULL;
13322    char *errors = NULL;
13323
13324    if (type != &PyUnicode_Type)
13325        return unicode_subtype_new(type, args, kwds);
13326    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
13327                                     kwlist, &x, &encoding, &errors))
13328        return NULL;
13329    if (x == NULL)
13330        return (PyObject *)PyUnicode_New(0, 0);
13331    if (encoding == NULL && errors == NULL)
13332        return PyObject_Str(x);
13333    else
13334        return PyUnicode_FromEncodedObject(x, encoding, errors);
13335}
13336
13337static PyObject *
13338unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13339{
13340    PyUnicodeObject *unicode, *self;
13341    Py_ssize_t length, char_size;
13342    int share_wstr, share_utf8;
13343    unsigned int kind;
13344    void *data;
13345
13346    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13347
13348    unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13349    if (unicode == NULL)
13350        return NULL;
13351    assert(_PyUnicode_CHECK(unicode));
13352    if (PyUnicode_READY(unicode))
13353        return NULL;
13354
13355    self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13356    if (self == NULL) {
13357        Py_DECREF(unicode);
13358        return NULL;
13359    }
13360    kind = PyUnicode_KIND(unicode);
13361    length = PyUnicode_GET_LENGTH(unicode);
13362
13363    _PyUnicode_LENGTH(self) = length;
13364#ifdef Py_DEBUG
13365    _PyUnicode_HASH(self) = -1;
13366#else
13367    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13368#endif
13369    _PyUnicode_STATE(self).interned = 0;
13370    _PyUnicode_STATE(self).kind = kind;
13371    _PyUnicode_STATE(self).compact = 0;
13372    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13373    _PyUnicode_STATE(self).ready = 1;
13374    _PyUnicode_WSTR(self) = NULL;
13375    _PyUnicode_UTF8_LENGTH(self) = 0;
13376    _PyUnicode_UTF8(self) = NULL;
13377    _PyUnicode_WSTR_LENGTH(self) = 0;
13378    _PyUnicode_DATA_ANY(self) = NULL;
13379
13380    share_utf8 = 0;
13381    share_wstr = 0;
13382    if (kind == PyUnicode_1BYTE_KIND) {
13383        char_size = 1;
13384        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13385            share_utf8 = 1;
13386    }
13387    else if (kind == PyUnicode_2BYTE_KIND) {
13388        char_size = 2;
13389        if (sizeof(wchar_t) == 2)
13390            share_wstr = 1;
13391    }
13392    else {
13393        assert(kind == PyUnicode_4BYTE_KIND);
13394        char_size = 4;
13395        if (sizeof(wchar_t) == 4)
13396            share_wstr = 1;
13397    }
13398
13399    /* Ensure we won't overflow the length. */
13400    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13401        PyErr_NoMemory();
13402        goto onError;
13403    }
13404    data = PyObject_MALLOC((length + 1) * char_size);
13405    if (data == NULL) {
13406        PyErr_NoMemory();
13407        goto onError;
13408    }
13409
13410    _PyUnicode_DATA_ANY(self) = data;
13411    if (share_utf8) {
13412        _PyUnicode_UTF8_LENGTH(self) = length;
13413        _PyUnicode_UTF8(self) = data;
13414    }
13415    if (share_wstr) {
13416        _PyUnicode_WSTR_LENGTH(self) = length;
13417        _PyUnicode_WSTR(self) = (wchar_t *)data;
13418    }
13419
13420    Py_MEMCPY(data, PyUnicode_DATA(unicode),
13421              kind * (length + 1));
13422    Py_DECREF(unicode);
13423    assert(_PyUnicode_CheckConsistency(self, 1));
13424#ifdef Py_DEBUG
13425    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13426#endif
13427    return (PyObject *)self;
13428
13429onError:
13430    Py_DECREF(unicode);
13431    Py_DECREF(self);
13432    return NULL;
13433}
13434
13435PyDoc_STRVAR(unicode_doc,
13436             "str(string[, encoding[, errors]]) -> str\n\
13437\n\
13438Create a new string object from the given encoded string.\n\
13439encoding defaults to the current default string encoding.\n\
13440errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
13441
13442static PyObject *unicode_iter(PyObject *seq);
13443
13444PyTypeObject PyUnicode_Type = {
13445    PyVarObject_HEAD_INIT(&PyType_Type, 0)
13446    "str",              /* tp_name */
13447    sizeof(PyUnicodeObject),        /* tp_size */
13448    0,                  /* tp_itemsize */
13449    /* Slots */
13450    (destructor)unicode_dealloc,    /* tp_dealloc */
13451    0,                  /* tp_print */
13452    0,                  /* tp_getattr */
13453    0,                  /* tp_setattr */
13454    0,                  /* tp_reserved */
13455    unicode_repr,           /* tp_repr */
13456    &unicode_as_number,         /* tp_as_number */
13457    &unicode_as_sequence,       /* tp_as_sequence */
13458    &unicode_as_mapping,        /* tp_as_mapping */
13459    (hashfunc) unicode_hash,        /* tp_hash*/
13460    0,                  /* tp_call*/
13461    (reprfunc) unicode_str,     /* tp_str */
13462    PyObject_GenericGetAttr,        /* tp_getattro */
13463    0,                  /* tp_setattro */
13464    0,                  /* tp_as_buffer */
13465    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
13466    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
13467    unicode_doc,            /* tp_doc */
13468    0,                  /* tp_traverse */
13469    0,                  /* tp_clear */
13470    PyUnicode_RichCompare,      /* tp_richcompare */
13471    0,                  /* tp_weaklistoffset */
13472    unicode_iter,           /* tp_iter */
13473    0,                  /* tp_iternext */
13474    unicode_methods,            /* tp_methods */
13475    0,                  /* tp_members */
13476    0,                  /* tp_getset */
13477    &PyBaseObject_Type,         /* tp_base */
13478    0,                  /* tp_dict */
13479    0,                  /* tp_descr_get */
13480    0,                  /* tp_descr_set */
13481    0,                  /* tp_dictoffset */
13482    0,                  /* tp_init */
13483    0,                  /* tp_alloc */
13484    unicode_new,            /* tp_new */
13485    PyObject_Del,           /* tp_free */
13486};
13487
13488/* Initialize the Unicode implementation */
13489
13490void _PyUnicode_Init(void)
13491{
13492    int i;
13493
13494    /* XXX - move this array to unicodectype.c ? */
13495    Py_UCS2 linebreak[] = {
13496        0x000A, /* LINE FEED */
13497        0x000D, /* CARRIAGE RETURN */
13498        0x001C, /* FILE SEPARATOR */
13499        0x001D, /* GROUP SEPARATOR */
13500        0x001E, /* RECORD SEPARATOR */
13501        0x0085, /* NEXT LINE */
13502        0x2028, /* LINE SEPARATOR */
13503        0x2029, /* PARAGRAPH SEPARATOR */
13504    };
13505
13506    /* Init the implementation */
13507    unicode_empty = PyUnicode_New(0, 0);
13508    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
13509    if (!unicode_empty)
13510        Py_FatalError("Can't create empty string");
13511
13512    for (i = 0; i < 256; i++)
13513        unicode_latin1[i] = NULL;
13514    if (PyType_Ready(&PyUnicode_Type) < 0)
13515        Py_FatalError("Can't initialize 'unicode'");
13516
13517    /* initialize the linebreak bloom filter */
13518    bloom_linebreak = make_bloom_mask(
13519        PyUnicode_2BYTE_KIND, linebreak,
13520        Py_ARRAY_LENGTH(linebreak));
13521
13522    PyType_Ready(&EncodingMapType);
13523}
13524
13525/* Finalize the Unicode implementation */
13526
/* Takes no action and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
13532
13533void
13534_PyUnicode_Fini(void)
13535{
13536    int i;
13537
13538    Py_XDECREF(unicode_empty);
13539    unicode_empty = NULL;
13540
13541    for (i = 0; i < 256; i++) {
13542        if (unicode_latin1[i]) {
13543            Py_DECREF(unicode_latin1[i]);
13544            unicode_latin1[i] = NULL;
13545        }
13546    }
13547    _PyUnicode_ClearStaticStrings();
13548    (void)PyUnicode_ClearFreeList();
13549}
13550
13551void
13552PyUnicode_InternInPlace(PyObject **p)
13553{
13554    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13555    PyObject *t;
13556#ifdef Py_DEBUG
13557    assert(s != NULL);
13558    assert(_PyUnicode_CHECK(s));
13559#else
13560    if (s == NULL || !PyUnicode_Check(s))
13561        return;
13562#endif
13563    /* If it's a subclass, we don't really know what putting
13564       it in the interned dict might do. */
13565    if (!PyUnicode_CheckExact(s))
13566        return;
13567    if (PyUnicode_CHECK_INTERNED(s))
13568        return;
13569    if (_PyUnicode_READY_REPLACE(p)) {
13570        assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
13571        return;
13572    }
13573    s = (PyUnicodeObject *)(*p);
13574    if (interned == NULL) {
13575        interned = PyDict_New();
13576        if (interned == NULL) {
13577            PyErr_Clear(); /* Don't leave an exception */
13578            return;
13579        }
13580    }
13581    /* It might be that the GetItem call fails even
13582       though the key is present in the dictionary,
13583       namely when this happens during a stack overflow. */
13584    Py_ALLOW_RECURSION
13585        t = PyDict_GetItem(interned, (PyObject *)s);
13586    Py_END_ALLOW_RECURSION
13587
13588        if (t) {
13589            Py_INCREF(t);
13590            Py_DECREF(*p);
13591            *p = t;
13592            return;
13593        }
13594
13595    PyThreadState_GET()->recursion_critical = 1;
13596    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13597        PyErr_Clear();
13598        PyThreadState_GET()->recursion_critical = 0;
13599        return;
13600    }
13601    PyThreadState_GET()->recursion_critical = 0;
13602    /* The two references in interned are not counted by refcnt.
13603       The deallocator will take care of this */
13604    Py_REFCNT(s) -= 2;
13605    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
13606}
13607
13608void
13609PyUnicode_InternImmortal(PyObject **p)
13610{
13611    PyUnicodeObject *u = (PyUnicodeObject *)*p;
13612
13613    PyUnicode_InternInPlace(p);
13614    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
13615        _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
13616        Py_INCREF(*p);
13617    }
13618}
13619
13620PyObject *
13621PyUnicode_InternFromString(const char *cp)
13622{
13623    PyObject *s = PyUnicode_FromString(cp);
13624    if (s == NULL)
13625        return NULL;
13626    PyUnicode_InternInPlace(&s);
13627    return s;
13628}
13629
13630void
13631_Py_ReleaseInternedUnicodeStrings(void)
13632{
13633    PyObject *keys;
13634    PyUnicodeObject *s;
13635    Py_ssize_t i, n;
13636    Py_ssize_t immortal_size = 0, mortal_size = 0;
13637
13638    if (interned == NULL || !PyDict_Check(interned))
13639        return;
13640    keys = PyDict_Keys(interned);
13641    if (keys == NULL || !PyList_Check(keys)) {
13642        PyErr_Clear();
13643        return;
13644    }
13645
13646    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13647       detector, interned unicode strings are not forcibly deallocated;
13648       rather, we give them their stolen references back, and then clear
13649       and DECREF the interned dict. */
13650
13651    n = PyList_GET_SIZE(keys);
13652    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
13653            n);
13654    for (i = 0; i < n; i++) {
13655        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
13656        if (PyUnicode_READY(s) == -1) {
13657            assert(0 && "could not ready string");
13658            fprintf(stderr, "could not ready string\n");
13659        }
13660        switch (PyUnicode_CHECK_INTERNED(s)) {
13661        case SSTATE_NOT_INTERNED:
13662            /* XXX Shouldn't happen */
13663            break;
13664        case SSTATE_INTERNED_IMMORTAL:
13665            Py_REFCNT(s) += 1;
13666            immortal_size += PyUnicode_GET_LENGTH(s);
13667            break;
13668        case SSTATE_INTERNED_MORTAL:
13669            Py_REFCNT(s) += 2;
13670            mortal_size += PyUnicode_GET_LENGTH(s);
13671            break;
13672        default:
13673            Py_FatalError("Inconsistent interned string state.");
13674        }
13675        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
13676    }
13677    fprintf(stderr, "total size of all interned strings: "
13678            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13679            "mortal/immortal\n", mortal_size, immortal_size);
13680    Py_DECREF(keys);
13681    PyDict_Clear(interned);
13682    Py_DECREF(interned);
13683    interned = NULL;
13684}
13685
13686
13687/********************* Unicode Iterator **************************/
13688
13689typedef struct {
13690    PyObject_HEAD
13691    Py_ssize_t it_index;
13692    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
13693} unicodeiterobject;
13694
13695static void
13696unicodeiter_dealloc(unicodeiterobject *it)
13697{
13698    _PyObject_GC_UNTRACK(it);
13699    Py_XDECREF(it->it_seq);
13700    PyObject_GC_Del(it);
13701}
13702
13703static int
13704unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13705{
13706    Py_VISIT(it->it_seq);
13707    return 0;
13708}
13709
13710static PyObject *
13711unicodeiter_next(unicodeiterobject *it)
13712{
13713    PyUnicodeObject *seq;
13714    PyObject *item;
13715
13716    assert(it != NULL);
13717    seq = it->it_seq;
13718    if (seq == NULL)
13719        return NULL;
13720    assert(_PyUnicode_CHECK(seq));
13721
13722    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13723        int kind = PyUnicode_KIND(seq);
13724        void *data = PyUnicode_DATA(seq);
13725        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13726        item = PyUnicode_FromOrdinal(chr);
13727        if (item != NULL)
13728            ++it->it_index;
13729        return item;
13730    }
13731
13732    Py_DECREF(seq);
13733    it->it_seq = NULL;
13734    return NULL;
13735}
13736
13737static PyObject *
13738unicodeiter_len(unicodeiterobject *it)
13739{
13740    Py_ssize_t len = 0;
13741    if (it->it_seq)
13742        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13743    return PyLong_FromSsize_t(len);
13744}
13745
13746PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13747
13748static PyMethodDef unicodeiter_methods[] = {
13749    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
13750     length_hint_doc},
13751    {NULL,      NULL}       /* sentinel */
13752};
13753
13754PyTypeObject PyUnicodeIter_Type = {
13755    PyVarObject_HEAD_INIT(&PyType_Type, 0)
13756    "str_iterator",         /* tp_name */
13757    sizeof(unicodeiterobject),      /* tp_basicsize */
13758    0,                  /* tp_itemsize */
13759    /* methods */
13760    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
13761    0,                  /* tp_print */
13762    0,                  /* tp_getattr */
13763    0,                  /* tp_setattr */
13764    0,                  /* tp_reserved */
13765    0,                  /* tp_repr */
13766    0,                  /* tp_as_number */
13767    0,                  /* tp_as_sequence */
13768    0,                  /* tp_as_mapping */
13769    0,                  /* tp_hash */
13770    0,                  /* tp_call */
13771    0,                  /* tp_str */
13772    PyObject_GenericGetAttr,        /* tp_getattro */
13773    0,                  /* tp_setattro */
13774    0,                  /* tp_as_buffer */
13775    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13776    0,                  /* tp_doc */
13777    (traverseproc)unicodeiter_traverse, /* tp_traverse */
13778    0,                  /* tp_clear */
13779    0,                  /* tp_richcompare */
13780    0,                  /* tp_weaklistoffset */
13781    PyObject_SelfIter,          /* tp_iter */
13782    (iternextfunc)unicodeiter_next,     /* tp_iternext */
13783    unicodeiter_methods,            /* tp_methods */
13784    0,
13785};
13786
13787static PyObject *
13788unicode_iter(PyObject *seq)
13789{
13790    unicodeiterobject *it;
13791
13792    if (!PyUnicode_Check(seq)) {
13793        PyErr_BadInternalCall();
13794        return NULL;
13795    }
13796    if (PyUnicode_READY(seq) == -1)
13797        return NULL;
13798    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13799    if (it == NULL)
13800        return NULL;
13801    it->it_index = 0;
13802    Py_INCREF(seq);
13803    it->it_seq = (PyUnicodeObject *)seq;
13804    _PyObject_GC_TRACK(it);
13805    return (PyObject *)it;
13806}
13807
13808#define UNIOP(x) Py_UNICODE_##x
13809#define UNIOP_t Py_UNICODE
13810#include "uniops.h"
13811#undef UNIOP
13812#undef UNIOP_t
13813#define UNIOP(x) Py_UCS4_##x
13814#define UNIOP_t Py_UCS4
13815#include "uniops.h"
13816#undef UNIOP
13817#undef UNIOP_t
13818
13819Py_UNICODE*
13820PyUnicode_AsUnicodeCopy(PyObject *object)
13821{
13822    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13823    Py_UNICODE *copy;
13824    Py_ssize_t size;
13825
13826    if (!PyUnicode_Check(unicode)) {
13827        PyErr_BadArgument();
13828        return NULL;
13829    }
13830    /* Ensure we won't overflow the size. */
13831    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13832        PyErr_NoMemory();
13833        return NULL;
13834    }
13835    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13836    size *= sizeof(Py_UNICODE);
13837    copy = PyMem_Malloc(size);
13838    if (copy == NULL) {
13839        PyErr_NoMemory();
13840        return NULL;
13841    }
13842    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13843    return copy;
13844}
13845
13846/* A _string module, to export formatter_parser and formatter_field_name_split
13847   to the string.Formatter class implemented in Python. */
13848
13849static PyMethodDef _string_methods[] = {
13850    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13851     METH_O, PyDoc_STR("split the argument as a field name")},
13852    {"formatter_parser", (PyCFunction) formatter_parser,
13853     METH_O, PyDoc_STR("parse the argument as a format string")},
13854    {NULL, NULL}
13855};
13856
13857static struct PyModuleDef _string_module = {
13858    PyModuleDef_HEAD_INIT,
13859    "_string",
13860    PyDoc_STR("string helper module"),
13861    0,
13862    _string_methods,
13863    NULL,
13864    NULL,
13865    NULL,
13866    NULL
13867};
13868
13869PyMODINIT_FUNC
13870PyInit__string(void)
13871{
13872    return PyModule_Create(&_string_module);
13873}
13874
13875
13876#ifdef __cplusplus
13877}
13878#endif
13879